daru-io 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +2 -0
  4. data/.rspec_formatter.rb +24 -0
  5. data/.rubocop.yml +109 -0
  6. data/.travis.yml +30 -0
  7. data/.yardopts +2 -0
  8. data/CODE_OF_CONDUCT.md +46 -0
  9. data/CONTRIBUTING.md +65 -0
  10. data/Gemfile +20 -0
  11. data/Guardfile +7 -0
  12. data/LICENSE.md +21 -0
  13. data/README.md +654 -0
  14. data/Rakefile +12 -0
  15. data/daru-io.gemspec +39 -0
  16. data/lib/daru/io.rb +3 -0
  17. data/lib/daru/io/base.rb +45 -0
  18. data/lib/daru/io/exporters.rb +1 -0
  19. data/lib/daru/io/exporters/avro.rb +96 -0
  20. data/lib/daru/io/exporters/base.rb +54 -0
  21. data/lib/daru/io/exporters/csv.rb +103 -0
  22. data/lib/daru/io/exporters/excel.rb +148 -0
  23. data/lib/daru/io/exporters/json.rb +570 -0
  24. data/lib/daru/io/exporters/r_data.rb +66 -0
  25. data/lib/daru/io/exporters/rds.rb +79 -0
  26. data/lib/daru/io/exporters/sql.rb +55 -0
  27. data/lib/daru/io/importers.rb +1 -0
  28. data/lib/daru/io/importers/active_record.rb +75 -0
  29. data/lib/daru/io/importers/avro.rb +54 -0
  30. data/lib/daru/io/importers/base.rb +62 -0
  31. data/lib/daru/io/importers/csv.rb +190 -0
  32. data/lib/daru/io/importers/excel.rb +99 -0
  33. data/lib/daru/io/importers/excelx.rb +138 -0
  34. data/lib/daru/io/importers/html.rb +144 -0
  35. data/lib/daru/io/importers/json.rb +152 -0
  36. data/lib/daru/io/importers/mongo.rb +139 -0
  37. data/lib/daru/io/importers/plaintext.rb +97 -0
  38. data/lib/daru/io/importers/r_data.rb +74 -0
  39. data/lib/daru/io/importers/rds.rb +67 -0
  40. data/lib/daru/io/importers/redis.rb +135 -0
  41. data/lib/daru/io/importers/sql.rb +127 -0
  42. data/lib/daru/io/link.rb +80 -0
  43. data/lib/daru/io/version.rb +5 -0
  44. metadata +269 -0
@@ -0,0 +1,139 @@
1
+ require 'daru/io/importers/json'
2
+
3
+ module Daru
4
+ module IO
5
+ module Importers
6
+ # Mongo Importer Class, that extends `from_mongo` method to `Daru::DataFrame`
7
+ class Mongo < JSON
8
+ Daru::DataFrame.register_io_module :from_mongo, self
9
+
10
+ # Checks for required gem dependencies of Mongo Importer
11
+ def initialize
12
+ super
13
+ optional_gem 'mongo'
14
+ end
15
+
16
+ # Loads data from a given connection
17
+ #
18
+ # @!method self.from(connection)
19
+ #
20
+ # @param connection [String or Hash or Mongo::Client] Contains details
21
+ # about a Mongo database / hosts to connect.
22
+ #
23
+ # @return [Daru::IO::Importers::Mongo]
24
+ #
25
+ # @example Loading from a connection string
26
+ # instance_1 = Daru::IO::Importers::Mongo.from('mongodb://127.0.0.1:27017/test')
27
+ #
28
+ # @example Loading from a connection hash
29
+ # instance_2 = Daru::IO::Importers::Mongo.from({ hosts: ['127.0.0.1:27017'], database: 'test' })
30
+ #
31
+ # @example Loading from a Mongo::Client connection
32
+ # instance_3 = Daru::IO::Importers::Mongo.from(Mongo::Client.new ['127.0.0.1:27017'], database: 'test')
33
+ def from(connection)
34
+ @client = get_client(connection)
35
+ self
36
+ end
37
+
38
+ # Imports a `Daru::DataFrame` from a Mongo Importer instance.
39
+ #
40
+ # @param collection [String or Symbol] A specific collection in the
41
+ # Mongo database, to import as `Daru::DataFrame`.
42
+ # @param columns [Array] JSON-path selectors to select specific fields
43
+ # from the JSON input.
44
+ # @param order [String or Array] Either a JSON-path selector string, or
45
+ # an array containing the order of the `Daru::DataFrame`. DO NOT
46
+ # provide both `order` and `named_columns` at the same time.
47
+ # @param index [String or Array] Either a JSON-path selector string, or
48
+ # an array containing the index of the `Daru::DataFrame`.
49
+ # @param filter [Hash] Filters and chooses Mongo documents that match
50
+ # the given `filter` from the collection.
51
+ # @param limit [Integer] Limits the number of Mongo documents to be
52
+ # parsed from the collection.
53
+ # @param skip [Integer] Skips `skip` number of documents from the Mongo
54
+ # collection.
55
+ # @param named_columns [Hash] JSON-path selectors to select specific
56
+ # fields from the JSON input. DO NOT provide both `order` and
57
+ # `named_columns` at the same time.
58
+ #
59
+ # @note
60
+ # - For more information on using JSON-path selectors, have a look at
61
+ # the explanations {http://www.rubydoc.info/gems/jsonpath/0.5.8 here}
62
+ # and {http://goessner.net/articles/JsonPath/ here}.
63
+ # - The Mongo gem faces `Argument Error : expected Proc Argument`
64
+ # issue due to the bug in MRI Ruby 2.4.0 mentioned
65
+ # {https://bugs.ruby-lang.org/issues/13107 here}. This seems to have
66
+ # been fixed in Ruby 2.4.1 onwards. Hence, please avoid using this
67
+ # Mongo Importer in Ruby version 2.4.0.
68
+ #
69
+ # @return [Daru::DataFrame]
70
+ #
71
+ # @example Importing without jsonpath selectors
72
+ # # The below 'cars' collection can be recreated in a Mongo shell with -
73
+ # # db.cars.drop()
74
+ # # db.cars.insert({name: "Audi", price: 52642})
75
+ # # db.cars.insert({name: "Mercedes", price: 57127})
76
+ # # db.cars.insert({name: "Volvo", price: 29000})
77
+ #
78
+ # df = instance.call('cars')
79
+ #
80
+ # #=> #<Daru::DataFrame(3x3)>
81
+ # # _id name price
82
+ # # 0 5948d0bfcd Audi 52642.0
83
+ # # 1 5948d0c6cd Mercedes 57127.0
84
+ # # 2 5948d0cecd Volvo 29000.0
85
+ #
86
+ # @example Importing with jsonpath selectors
87
+ # # The below 'cars' collection can be recreated in a Mongo shell with -
88
+ # # db.cars.drop()
89
+ # # db.cars.insert({name: "Audi", price: 52642, star: { fuel: 9.8, cost: 8.6, seats: 9.9, sound: 9.3 }})
90
+ # # db.cars.insert({name: "Mercedes", price: 57127, star: { fuel: 9.3, cost: 8.9, seats: 8.4, sound: 9.1 }})
91
+ # # db.cars.insert({name: "Volvo", price: 29000, star: { fuel: 7.8, cost: 9.9, seats: 8.2, sound: 8.9 }})
92
+ #
93
+ # df = instance.call(
94
+ # 'cars',
95
+ # '$.._id',
96
+ # '$..name',
97
+ # '$..price',
98
+ # '$..star..fuel',
99
+ # '$..star..cost'
100
+ # )
101
+ #
102
+ # #=> #<Daru::DataFrame(3x5)>
103
+ # # _id name price fuel cost
104
+ # # 0 5948d40b50 Audi 52642.0 9.8 8.6
105
+ # # 1 5948d42850 Mercedes 57127.0 9.3 8.9
106
+ # # 2 5948d44350 Volvo 29000.0 7.8 9.9
107
+ def call(collection, *columns, order: nil, index: nil,
108
+ filter: nil, limit: nil, skip: nil, **named_columns)
109
+ @json = ::JSON.parse(
110
+ @client[collection.to_sym]
111
+ .find(filter, skip: skip, limit: limit)
112
+ .to_json
113
+ )
114
+
115
+ super(*columns, order: order, index: index, **named_columns)
116
+ end
117
+
118
+ private
119
+
120
+ def get_client(connection)
121
+ case connection
122
+ when ::Mongo::Client
123
+ connection
124
+ when Hash
125
+ hosts = connection.delete :hosts
126
+ ::Mongo::Client.new(hosts, connection)
127
+ when String
128
+ ::Mongo::Client.new(connection)
129
+ else
130
+ raise ArgumentError,
131
+ "Expected #{connection} to be either a Mongo instance, "\
132
+ 'Mongo connection Hash, or Mongo connection URL String. '\
133
+ "Received #{connection.class} instead."
134
+ end
135
+ end
136
+ end
137
+ end
138
+ end
139
+ end
@@ -0,0 +1,97 @@
1
+ require 'daru/io/importers/base'
2
+
3
+ module Daru
4
+ module IO
5
+ module Importers
6
+ # Plaintext Importer Class, that extends `read_plaintext` method to
7
+ # `Daru::DataFrame`
8
+ class Plaintext < Base
9
+ Daru::DataFrame.register_io_module :read_plaintext, self
10
+
11
+ # Initializes a Plaintext Importer instance; no optional gem dependencies are required
12
+ def initialize; end
13
+
14
+ # Reads data from a plaintext (.dat) file
15
+ #
16
+ # @!method self.read(path)
17
+ #
18
+ # @param path [String] Path to plaintext file, where the dataframe is to be
19
+ # imported from.
20
+ #
21
+ # @return [Daru::IO::Importers::Plaintext]
22
+ #
23
+ # @example Reading from plaintext file
24
+ # instance = Daru::IO::Importers::Plaintext.read("bank2.dat")
25
+ def read(path)
26
+ @file_data = File.read(path).split("\n").map do |line|
27
+ row = process_row(line.strip.split(/\s+/),[''])
28
+ next if row == ["\x1A"]
29
+ row
30
+ end
31
+ self
32
+ end
33
+
34
+ # Imports `Daru::DataFrame` from a Plaintext Importer instance
35
+ #
36
+ # @param fields [Array] An array of vectors.
37
+ #
38
+ # @return [Daru::DataFrame]
39
+ #
40
+ # @example Initializing with fields
41
+ # df = instance.call([:v1, :v2, :v3, :v4, :v5, :v6])
42
+ #
43
+ # #=> #<Daru::DataFrame(200x6)>
44
+ # # v1 v2 v3 v4 v5 v6
45
+ # # 0 214.8 131.0 131.1 9.0 9.7 141.0
46
+ # # 1 214.6 129.7 129.7 8.1 9.5 141.7
47
+ # # 2 214.8 129.7 129.7 8.7 9.6 142.2
48
+ # # 3 214.8 129.7 129.6 7.5 10.4 142.0
49
+ # # 4 215.0 129.6 129.7 10.4 7.7 141.8
50
+ # # 5 215.7 130.8 130.5 9.0 10.1 141.4
51
+ # # 6 215.5 129.5 129.7 7.9 9.6 141.6
52
+ # # 7 214.5 129.6 129.2 7.2 10.7 141.7
53
+ # # 8 214.9 129.4 129.7 8.2 11.0 141.9
54
+ # # 9 215.2 130.4 130.3 9.2 10.0 140.7
55
+ # # 10 215.3 130.4 130.3 7.9 11.7 141.8
56
+ # # 11 215.1 129.5 129.6 7.7 10.5 142.2
57
+ # # 12 215.2 130.8 129.6 7.9 10.8 141.4
58
+ # # 13 214.7 129.7 129.7 7.7 10.9 141.7
59
+ # # 14 215.1 129.9 129.7 7.7 10.8 141.8
60
+ # #... ... ... ... ... ... ...
61
+ def call(fields)
62
+ Daru::DataFrame.rows(@file_data, order: fields)
63
+ end
64
+
65
+ private
66
+
67
+ INT_PATTERN = /^[-+]?\d+$/
68
+ FLOAT_PATTERN = /^[-+]?\d+[,.]?\d*(e-?\d+)?$/
69
+
70
+ def process_row(row,empty)
71
+ row.to_a.map do |c|
72
+ if empty.include?(c)
73
+ # FIXME: As far as I can guess, it will never work.
74
+ # It is called only inside `read`, and there
75
+ # data is split by `\s+` -- there is no chance that
76
+ # "empty" (currently just '') will be between data?..
77
+ nil
78
+ else
79
+ try_string_to_number(c)
80
+ end
81
+ end
82
+ end
83
+
84
+ def try_string_to_number(s)
85
+ case s
86
+ when INT_PATTERN
87
+ s.to_i
88
+ when FLOAT_PATTERN
89
+ s.tr(',', '.').to_f
90
+ else
91
+ s
92
+ end
93
+ end
94
+ end
95
+ end
96
+ end
97
+ end
@@ -0,0 +1,74 @@
1
+ require 'daru/io/importers/rds'
2
+
3
+ module Daru
4
+ module IO
5
+ module Importers
6
+ # RData Importer Class, that extends `read_rdata` method to `Daru::DataFrame`
7
+ #
8
+ # @see Daru::IO::Importers::RDS For .rds format
9
+ class RData < RDS
10
+ Daru::DataFrame.register_io_module :read_rdata, self
11
+
12
+ # Checks for required gem dependencies of RData Importer
13
+ def initialize
14
+ super
15
+ end
16
+
17
+ # Reads data from an RData file
18
+ #
19
+ # @!method self.read(path)
20
+ #
21
+ # @param path [String] Path to RData file, where the dataframe is to be imported from.
22
+ #
23
+ # @return [Daru::IO::Importers::RData]
24
+ #
25
+ # @example Reading from rdata file
26
+ # instance = Daru::IO::Importers::RData.read('ACScounty.RData')
27
+ def read(path)
28
+ @instance = RSRuby.instance
29
+ @instance.eval_R("load('#{path}')")
30
+ self
31
+ end
32
+
33
+ # Imports a `Daru::DataFrame` from a RData Importer instance and rdata file
34
+ #
35
+ # @param variable [String] The variable to be imported from the
36
+ # variables stored in the RData file. Please note that the R
37
+ # variable to be imported from the RData file should be a
38
+ # `data.frame`
39
+ #
40
+ # @return [Daru::DataFrame]
41
+ #
42
+ # @example Importing a particular variable
43
+ # df = instance.call("ACS3")
44
+ #
45
+ # #=> #<Daru::DataFrame(1629x30)>
46
+ # # Abbreviati FIPS Non.US State cnty females.di ...
47
+ # # 0 AL 1001 14.7 alabama autauga 13.8 ...
48
+ # # 1 AL 1003 13.5 alabama baldwin 14.1 ...
49
+ # # 2 AL 1005 20.1 alabama barbour 16.1 ...
50
+ # # 3 AL 1009 18.0 alabama blount 13.7 ...
51
+ # # 4 AL 1015 18.6 alabama calhoun 12.9 ...
52
+ # # ... ... ... ... ... ... ... ...
53
+ def call(variable)
54
+ @variable = variable.to_s
55
+
56
+ validate_params
57
+
58
+ process_dataframe(@instance.send(@variable.to_sym))
59
+ end
60
+
61
+ private
62
+
63
+ def validate_params
64
+ valid_r_dataframe_variables = @instance.eval_R('Filter(function(x) is.data.frame(get(x)) , ls())')
65
+ return if valid_r_dataframe_variables.include?(@variable)
66
+
67
+ variable_type = @instance.eval_R("typeof(#{@variable})")
68
+ raise ArgumentError, "Expected the given R variable (#{@variable}) to be a data.frame, got a "\
69
+ "#{variable_type} instead."
70
+ end
71
+ end
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,67 @@
1
+ require 'daru/io/importers/base'
2
+
3
+ module Daru
4
+ module IO
5
+ module Importers
6
+ # RDS Importer Class, that extends `read_rds` method to `Daru::DataFrame`
7
+ #
8
+ # @see Daru::IO::Importers::RData For .RData format
9
+ class RDS < Base
10
+ Daru::DataFrame.register_io_module :read_rds, self
11
+
12
+ # Checks for required gem dependencies of RDS Importer
13
+ def initialize
14
+ optional_gem 'rsruby'
15
+ end
16
+
17
+ # Reads data from a rds file
18
+ #
19
+ # @!method self.read(path)
20
+ #
21
+ # @param path [String] Path to rds file, where the dataframe is to be
22
+ # imported from.
23
+ #
24
+ # @return [Daru::IO::Importers::RDS]
25
+ #
26
+ # @example Reading from rds file
27
+ # instance = Daru::IO::Importers::RDS.read('bc_sites.rds')
28
+ def read(path)
29
+ @instance = RSRuby.instance.eval_R("readRDS('#{path}')")
30
+ self
31
+ end
32
+
33
+ # Imports a `Daru::DataFrame` from a RDS Importer instance and rds file
34
+ #
35
+ # @return [Daru::DataFrame]
36
+ #
37
+ # @example Reading from a RDS file
38
+ # df = instance.call
39
+ #
40
+ # #=> #<Daru::DataFrame(1113x25)>
41
+ # # area descriptio epa_reach format_ver latitude location location_c ...
42
+ # # 0 016 GSPTN NaN 4.1 49.5 THOR IS 2MS22016 T ...
43
+ # # 1 012 CSPT NaN 4.1 50.6167 MITC BY 2MN26012 M ...
44
+ # # ... ... ... ... ... ... ... ... ...
45
+ def call
46
+ process_dataframe(@instance)
47
+ end
48
+
49
+ private
50
+
51
+ def process_dataframe(data)
52
+ data = data.map { |key, values| [key.to_sym, values.map { |val| convert_datatype(val) }] }.to_h
53
+ Daru::DataFrame.new(data)
54
+ end
55
+
56
+ def convert_datatype(value)
57
+ case value.to_s
58
+ when 'NaN' then nil
59
+ when value.to_f.to_s then value.to_f
60
+ when value.to_i.to_s then value.to_i
61
+ else value
62
+ end
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,135 @@
1
+ require 'daru/io/importers/base'
2
+
3
+ module Daru
4
+ module IO
5
+ module Importers
6
+ # Redis Importer Class, that extends `from_redis` method to `Daru::DataFrame`
7
+ class Redis < Base
8
+ Daru::DataFrame.register_io_module :from_redis, self
9
+
10
+ # Checks for required gem dependencies of Redis Importer
11
+ def initialize
12
+ require 'json'
13
+ optional_gem 'redis'
14
+ end
15
+
16
+ # Loads data from a given connection
17
+ #
18
+ # @!method self.from(connection)
19
+ #
20
+ # @param connection [Hash or Redis Instance] Either a Hash of *Redis* configurations,
21
+ # or an existing *Redis* instance. For the hash configurations, have a
22
+ # look at
23
+ # [Redis#initialize](http://www.rubydoc.info/github/redis/redis-rb/Redis:initialize).
24
+ #
25
+ # @return [Daru::IO::Importers::Redis]
26
+ #
27
+ # @example Loading from a hash
28
+ # instance = Daru::IO::Importers::Redis.from({url: "redis://:[password]@[hostname]:[port]/[db]"})
29
+ #
30
+ # @example Loading from a Redis connection
31
+ # instance = Daru::IO::Importers::Redis.from(Redis.new({url: "redis://:[password]@[hostname]:[port]/[db]"}))
32
+ def from(connection={})
33
+ @client = get_client(connection)
34
+ self
35
+ end
36
+
37
+ # Imports a `Daru::DataFrame` from a Redis Importer instance
38
+ #
39
+ # @param keys [Array] Redis key(s) from which the `Daru::DataFrame`
40
+ # should be constructed. If no keys are given, all keys in the *Redis*
41
+ # connection will be used.
42
+ # @param match [String] A pattern to get matching keys.
43
+ # @param count [Integer] Number of matching keys to be obtained. Defaults to
44
+ # nil, to collect ALL matching keys.
45
+ #
46
+ # @return [Daru::DataFrame]
47
+ #
48
+ # @example Importing with no options
49
+ # # Say, the Redis connection has this setup
50
+ # # Key "10001" => { "name" => "Tyrion", "age" => 32 }.to_json
51
+ # # Key "10002" => { "name" => "Jamie", "age" => 37 }.to_json
52
+ # # Key "10003" => { "name" => "Cersei", "age" => 37 }.to_json
53
+ # # Key "10004" => { "name" => "Joffrey", "age" => 19 }.to_json
54
+ #
55
+ # df = instance.call
56
+ #
57
+ # #=> #<Daru::DataFrame(4x2)>
58
+ # # name age
59
+ # # 10001 Tyrion 32
60
+ # # 10002 Jamie 37
61
+ # # 10003 Cersei 37
62
+ # # 10004 Joffrey 19
63
+ #
64
+ # @example Importing with keys
65
+ # # Say, the Redis connection has this setup
66
+ # # Key "10001" => { "name" => "Tyrion", "age" => 32 }.to_json
67
+ # # Key "10002" => { "name" => "Jamie", "age" => 37 }.to_json
68
+ # # Key "10003" => { "name" => "Cersei", "age" => 37 }.to_json
69
+ # # Key "10004" => { "name" => "Joffrey", "age" => 19 }.to_json
70
+ #
71
+ # df = instance.call("10001", "10002")
72
+ #
73
+ # #=> #<Daru::DataFrame(2x2)>
74
+ # # name age
75
+ # # 10001 Tyrion 32
76
+ # # 10002 Jamie 37
77
+ #
78
+ # @example Importing with query for matching keys and count
79
+ # # Say, the Redis connection has this setup
80
+ # # Key "key:1" => { "name" => "name1", "age" => "age1" }.to_json
81
+ # # Key "key:2" => { "name" => "name2", "age" => "age2" }.to_json
82
+ # # Key "key:3" => { "name" => "name3", "age" => "age3" }.to_json
83
+ # # ...
84
+ # # Key "key:2000" => { "name" => "name2000", "age" => "age2000" }.to_json
85
+ #
86
+ # df = instance.call(match: "key:1*", count: 200)
87
+ #
88
+ # #=> #<Daru::DataFrame(200x2)>
89
+ # # name age
90
+ # # key:1927 name1927 age1927
91
+ # # key:1759 name1759 age1759
92
+ # # key:1703 name1703 age1703
93
+ # # key:1640 name1640 age1640
94
+ # # ... ... ...
95
+ def call(*keys, match: nil, count: nil)
96
+ @match = match
97
+ @count = count
98
+ @keys = keys
99
+ @keys = choose_keys(*@keys).map(&:to_sym)
100
+
101
+ vals = @keys.map { |key| ::JSON.parse(@client.get(key), symbolize_names: true) }
102
+ Base.guess_parse(@keys, vals)
103
+ end
104
+
105
+ private
106
+
107
+ def choose_keys(*keys)
108
+ return keys.to_a unless keys.empty?
109
+
110
+ cursor = nil
111
+ # Loop to iterate through paginated results of Redis#scan.
112
+ until cursor == '0' || (!@count.nil? && keys.count > (@count-1))
113
+ cursor, chunk = @client.scan(cursor, match: @match, count: @count)
114
+ keys.concat(chunk).uniq!
115
+ end
116
+ return keys[0..-1] if @count.nil?
117
+ keys[0..@count-1]
118
+ end
119
+
120
+ def get_client(connection)
121
+ case connection
122
+ when ::Redis
123
+ connection
124
+ when Hash
125
+ ::Redis.new connection
126
+ else
127
+ raise ArgumentError, "Expected '#{connection}' to be either "\
128
+ 'a Hash or an initialized Redis instance, '\
129
+ "but received #{connection.class} instead."
130
+ end
131
+ end
132
+ end
133
+ end
134
+ end
135
+ end