daru-io 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +2 -0
  4. data/.rspec_formatter.rb +24 -0
  5. data/.rubocop.yml +109 -0
  6. data/.travis.yml +30 -0
  7. data/.yardopts +2 -0
  8. data/CODE_OF_CONDUCT.md +46 -0
  9. data/CONTRIBUTING.md +65 -0
  10. data/Gemfile +20 -0
  11. data/Guardfile +7 -0
  12. data/LICENSE.md +21 -0
  13. data/README.md +654 -0
  14. data/Rakefile +12 -0
  15. data/daru-io.gemspec +39 -0
  16. data/lib/daru/io.rb +3 -0
  17. data/lib/daru/io/base.rb +45 -0
  18. data/lib/daru/io/exporters.rb +1 -0
  19. data/lib/daru/io/exporters/avro.rb +96 -0
  20. data/lib/daru/io/exporters/base.rb +54 -0
  21. data/lib/daru/io/exporters/csv.rb +103 -0
  22. data/lib/daru/io/exporters/excel.rb +148 -0
  23. data/lib/daru/io/exporters/json.rb +570 -0
  24. data/lib/daru/io/exporters/r_data.rb +66 -0
  25. data/lib/daru/io/exporters/rds.rb +79 -0
  26. data/lib/daru/io/exporters/sql.rb +55 -0
  27. data/lib/daru/io/importers.rb +1 -0
  28. data/lib/daru/io/importers/active_record.rb +75 -0
  29. data/lib/daru/io/importers/avro.rb +54 -0
  30. data/lib/daru/io/importers/base.rb +62 -0
  31. data/lib/daru/io/importers/csv.rb +190 -0
  32. data/lib/daru/io/importers/excel.rb +99 -0
  33. data/lib/daru/io/importers/excelx.rb +138 -0
  34. data/lib/daru/io/importers/html.rb +144 -0
  35. data/lib/daru/io/importers/json.rb +152 -0
  36. data/lib/daru/io/importers/mongo.rb +139 -0
  37. data/lib/daru/io/importers/plaintext.rb +97 -0
  38. data/lib/daru/io/importers/r_data.rb +74 -0
  39. data/lib/daru/io/importers/rds.rb +67 -0
  40. data/lib/daru/io/importers/redis.rb +135 -0
  41. data/lib/daru/io/importers/sql.rb +127 -0
  42. data/lib/daru/io/link.rb +80 -0
  43. data/lib/daru/io/version.rb +5 -0
  44. metadata +269 -0
@@ -0,0 +1,139 @@
1
+ require 'daru/io/importers/json'
2
+
3
+ module Daru
4
+ module IO
5
+ module Importers
6
+ # Mongo Importer Class, that extends `from_mongo` method to `Daru::DataFrame`
7
+ class Mongo < JSON
8
+ Daru::DataFrame.register_io_module :from_mongo, self
9
+
10
+ # Checks for required gem dependencies of Mongo Importer
11
def initialize
  # Run the parent importer's setup first, then make sure the optional
  # `mongo` gem can be loaded.
  super
  optional_gem('mongo')
end
15
+
16
+ # Loads data from a given connection
17
+ #
18
+ # @!method self.from(connection)
19
+ #
20
+ # @param connection [String or Hash or Mongo::Client] Contains details
21
+ # about a Mongo database / hosts to connect.
22
+ #
23
+ # @return [Daru::IO::Importers::Mongo]
24
+ #
25
+ # @example Loading from a connection string
26
+ # instance_1 = Daru::IO::Importers::Mongo.from('mongodb://127.0.0.1:27017/test')
27
+ #
28
+ # @example Loading from a connection hash
29
+ # instance_2 = Daru::IO::Importers::Mongo.from({ hosts: ['127.0.0.1:27017'], database: 'test' })
30
+ #
31
+ # @example Loading from a Mongo::Client connection
32
+ # instance_3 = Daru::IO::Importers::Mongo.from(Mongo::Client.new ['127.0.0.1:27017'], database: 'test')
33
def from(connection)
  # Resolve the argument into a client, remember it for `call`, and hand
  # back the importer itself so calls can be chained.
  tap { @client = get_client(connection) }
end
37
+
38
+ # Imports a `Daru::DataFrame` from a Mongo Importer instance.
39
+ #
40
+ # @param collection [String or Symbol] A specific collection in the
41
+ # Mongo database, to import as `Daru::DataFrame`.
42
+ # @param columns [Array] JSON-path selectors to select specific fields
43
+ # from the JSON input.
44
+ # @param order [String or Array] Either a JSON-path selector string, or
45
+ # an array containing the order of the `Daru::DataFrame`. DO NOT
46
+ # provide both `order` and `named_columns` at the same time.
47
+ # @param index [String or Array] Either a JSON-path selector string, or
48
+ # an array containing the index of the `Daru::DataFrame`.
49
+ # @param filter [Hash] Filters and chooses Mongo documents that match
50
+ # the given `filter` from the collection.
51
+ # @param limit [Integer] Limits the number of Mongo documents to be
52
+ # parsed from the collection.
53
+ # @param skip [Integer] Skips `skip` number of documents from the Mongo
54
+ # collection.
55
+ # @param named_columns [Hash] JSON-path selectors to select specific
56
+ # fields from the JSON input. DO NOT provide both `order` and
57
+ # `named_columns` at the same time.
58
+ #
59
+ # @note
60
+ # - For more information on using JSON-path selectors, have a look at
61
+ # the explanations {http://www.rubydoc.info/gems/jsonpath/0.5.8 here}
62
+ # and {http://goessner.net/articles/JsonPath/ here}.
63
+ # - The Mongo gem faces `Argument Error : expected Proc Argument`
64
+ # issue due to the bug in MRI Ruby 2.4.0 mentioned
65
+ # {https://bugs.ruby-lang.org/issues/13107 here}. This seems to have
66
+ # been fixed in Ruby 2.4.1 onwards. Hence, please avoid using this
67
+ # Mongo Importer in Ruby version 2.4.0.
68
+ #
69
+ # @return [Daru::DataFrame]
70
+ #
71
+ # @example Importing without jsonpath selectors
72
+ # # The below 'cars' collection can be recreated in a Mongo shell with -
73
+ # # db.cars.drop()
74
+ # # db.cars.insert({name: "Audi", price: 52642})
75
+ # # db.cars.insert({name: "Mercedes", price: 57127})
76
+ # # db.cars.insert({name: "Volvo", price: 29000})
77
+ #
78
+ # df = instance.call('cars')
79
+ #
80
+ # #=> #<Daru::DataFrame(3x3)>
81
+ # # _id name price
82
+ # # 0 5948d0bfcd Audi 52642.0
83
+ # # 1 5948d0c6cd Mercedes 57127.0
84
+ # # 2 5948d0cecd Volvo 29000.0
85
+ #
86
+ # @example Importing with jsonpath selectors
87
+ # # The below 'cars' collection can be recreated in a Mongo shell with -
88
+ # # db.cars.drop()
89
+ # # db.cars.insert({name: "Audi", price: 52642, star: { fuel: 9.8, cost: 8.6, seats: 9.9, sound: 9.3 }})
90
+ # # db.cars.insert({name: "Mercedes", price: 57127, star: { fuel: 9.3, cost: 8.9, seats: 8.4, sound: 9.1 }})
91
+ # # db.cars.insert({name: "Volvo", price: 29000, star: { fuel: 7.8, cost: 9.9, seats: 8.2, sound: 8.9 }})
92
+ #
93
+ # df = instance.call(
94
+ # 'cars',
95
+ # '$.._id',
96
+ # '$..name',
97
+ # '$..price',
98
+ # '$..star..fuel',
99
+ # '$..star..cost'
100
+ # )
101
+ #
102
+ # #=> #<Daru::DataFrame(3x5)>
103
+ # # _id name price fuel cost
104
+ # # 0 5948d40b50 Audi 52642.0 9.8 8.6
105
+ # # 1 5948d42850 Mercedes 57127.0 9.3 8.9
106
+ # # 2 5948d44350 Volvo 29000.0 7.8 9.9
107
def call(collection, *columns, order: nil, index: nil,
         filter: nil, limit: nil, skip: nil, **named_columns)
  # Query the collection, then round-trip the result through JSON so that
  # @json holds the parsed documents before delegating to the parent `call`.
  documents = @client[collection.to_sym].find(filter, skip: skip, limit: limit)
  @json = ::JSON.parse(documents.to_json)

  super(*columns, order: order, index: index, **named_columns)
end
117
+
118
+ private
119
+
120
# Turns the user-supplied connection description into a Mongo client.
#
# @param connection [String, Hash, Mongo::Client] A connection URL, a Hash
#   with a `:hosts` entry plus client options, or an existing client.
# @return [Mongo::Client] the given client, or a newly built one
# @raise [ArgumentError] if `connection` is none of the supported types
def get_client(connection)
  case connection
  when ::Mongo::Client
    connection
  when Hash
    # Work on a copy: removing :hosts from the caller's Hash would silently
    # change their object (and raise on a frozen Hash).
    options = connection.dup
    hosts = options.delete(:hosts)
    ::Mongo::Client.new(hosts, options)
  when String
    ::Mongo::Client.new(connection)
  else
    raise ArgumentError,
          "Expected #{connection} to be either a Mongo instance, "\
          'Mongo connection Hash, or Mongo connection URL String. '\
          "Received #{connection.class} instead."
  end
end
136
+ end
137
+ end
138
+ end
139
+ end
@@ -0,0 +1,97 @@
1
+ require 'daru/io/importers/base'
2
+
3
+ module Daru
4
+ module IO
5
+ module Importers
6
+ # Plaintext Importer Class, that extends `read_plaintext` method to
7
+ # `Daru::DataFrame`
8
+ class Plaintext < Base
9
+ Daru::DataFrame.register_io_module :read_plaintext, self
10
+
11
+ # Initializes the Plaintext Importer; the body is empty, so no optional gems are checked here.
12
+ def initialize; end
13
+
14
+ # Reads data from a plaintext (.dat) file
15
+ #
16
+ # @!method self.read(path)
17
+ #
18
+ # @param path [String] Path to plaintext file, where the dataframe is to be
19
+ # imported from.
20
+ #
21
+ # @return [Daru::IO::Importers::Plaintext]
22
+ #
23
+ # @example Reading from plaintext file
24
+ # instance = Daru::IO::Importers::Plaintext.read("bank2.dat")
25
def read(path)
  # One row per line: split each line on whitespace and convert the tokens.
  rows = File.read(path).split("\n").map do |line|
    process_row(line.strip.split(/\s+/), [''])
  end
  # Discard lines that consist solely of the "\x1A" character. The previous
  # `next` inside `map` left a nil in their place instead of removing them.
  @file_data = rows.reject { |row| row == ["\x1A"] }
  self
end
33
+
34
+ # Imports `Daru::DataFrame` from a Plaintext Importer instance
35
+ #
36
+ # @param fields [Array] An array of vectors.
37
+ #
38
+ # @return [Daru::DataFrame]
39
+ #
40
+ # @example Initializing with fields
41
+ # df = instance.call([:v1, :v2, :v3, :v4, :v5, :v6])
42
+ #
43
+ # #=> #<Daru::DataFrame(200x6)>
44
+ # # v1 v2 v3 v4 v5 v6
45
+ # # 0 214.8 131.0 131.1 9.0 9.7 141.0
46
+ # # 1 214.6 129.7 129.7 8.1 9.5 141.7
47
+ # # 2 214.8 129.7 129.7 8.7 9.6 142.2
48
+ # # 3 214.8 129.7 129.6 7.5 10.4 142.0
49
+ # # 4 215.0 129.6 129.7 10.4 7.7 141.8
50
+ # # 5 215.7 130.8 130.5 9.0 10.1 141.4
51
+ # # 6 215.5 129.5 129.7 7.9 9.6 141.6
52
+ # # 7 214.5 129.6 129.2 7.2 10.7 141.7
53
+ # # 8 214.9 129.4 129.7 8.2 11.0 141.9
54
+ # # 9 215.2 130.4 130.3 9.2 10.0 140.7
55
+ # # 10 215.3 130.4 130.3 7.9 11.7 141.8
56
+ # # 11 215.1 129.5 129.6 7.7 10.5 142.2
57
+ # # 12 215.2 130.8 129.6 7.9 10.8 141.4
58
+ # # 13 214.7 129.7 129.7 7.7 10.9 141.7
59
+ # # 14 215.1 129.9 129.7 7.7 10.8 141.8
60
+ # #... ... ... ... ... ... ...
61
def call(fields)
  # Every parsed line collected by `read` becomes one row; `fields` is
  # passed through as the vector order.
  rows = @file_data
  Daru::DataFrame.rows(rows, order: fields)
end
64
+
65
+ private
66
+
67
# An optionally signed whole number, e.g. "42" or "-7". Anchored with \A/\z
# so a multi-line string whose first line looks numeric is not mistaken for
# a number.
INT_PATTERN = /\A[-+]?\d+\z/
# An optionally signed decimal with "." or "," as separator and an optional
# exponent, e.g. "1.5", "2,5", "1e-3", "1.0e+20", "1E5".
FLOAT_PATTERN = /\A[-+]?\d+[,.]?\d*(e[-+]?\d+)?\z/i

# Converts the raw string cells of one row into Ruby values.
#
# @param row [#to_a] the cells of a single line
# @param empty [Array<String>] cell values that should be read as missing
# @return [Array] cells with missing markers replaced by nil and numeric
#   strings converted to Integer / Float
def process_row(row, empty)
  row.to_a.map do |cell|
    # FIXME: this branch is probably never taken. The method is only called
    # from `read`, where the line is stripped and split on `\s+`, so an
    # "empty" cell (currently just '') cannot appear between the data.
    empty.include?(cell) ? nil : try_string_to_number(cell)
  end
end

# Converts a numeric-looking string to a number.
#
# @param s [String] a single cell
# @return [Integer, Float, Object] the parsed number, or `s` unchanged when
#   it does not look like a number
def try_string_to_number(s)
  case s
  when INT_PATTERN
    s.to_i
  when FLOAT_PATTERN
    # Accept a decimal comma by normalising it to a dot before parsing.
    s.tr(',', '.').to_f
  else
    s
  end
end
94
+ end
95
+ end
96
+ end
97
+ end
@@ -0,0 +1,74 @@
1
+ require 'daru/io/importers/rds'
2
+
3
+ module Daru
4
+ module IO
5
+ module Importers
6
+ # RData Importer Class, that extends `read_rdata` method to `Daru::DataFrame`
7
+ #
8
+ # @see Daru::IO::Importers::RDS For .rds format
9
+ class RData < RDS
10
+ Daru::DataFrame.register_io_module :read_rdata, self
11
+
12
+ # Checks for required gem dependencies of RData Importer
13
# Nothing RData-specific to set up: this only runs the parent initializer.
def initialize; super; end
16
+
17
+ # Reads data from an RData file
18
+ #
19
+ # @!method self.read(path)
20
+ #
21
+ # @param path [String] Path to RData file, where the dataframe is to be imported from.
22
+ #
23
+ # @return [Daru::IO::Importers::RData]
24
+ #
25
+ # @example Reading from rdata file
26
+ # instance = Daru::IO::Importers::RData.read('ACScounty.RData')
27
def read(path)
  # The path is embedded in a single-quoted R string literal, so escape
  # backslashes and single quotes. Otherwise a path such as "it's.RData"
  # (or a Windows path) produces invalid or unintended R code.
  r_path = path.to_s.gsub(/[\\']/) { |char| "\\#{char}" }
  @instance = RSRuby.instance
  @instance.eval_R("load('#{r_path}')")
  self
end
32
+
33
+ # Imports a `Daru::DataFrame` from a RData Importer instance and rdata file
34
+ #
35
+ # @param variable [String] The variable to be imported from the
36
+ # variables stored in the RData file. Please note that the R
37
+ # variable to be imported from the RData file should be a
38
+ # `data.frame`
39
+ #
40
+ # @return [Daru::DataFrame]
41
+ #
42
+ # @example Importing a particular variable
43
+ # df = instance.call("ACS3")
44
+ #
45
+ # #=> #<Daru::DataFrame(1629x30)>
46
+ # # Abbreviati FIPS Non.US State cnty females.di ...
47
+ # # 0 AL 1001 14.7 alabama autauga 13.8 ...
48
+ # # 1 AL 1003 13.5 alabama baldwin 14.1 ...
49
+ # # 2 AL 1005 20.1 alabama barbour 16.1 ...
50
+ # # 3 AL 1009 18.0 alabama blount 13.7 ...
51
+ # # 4 AL 1015 18.6 alabama calhoun 12.9 ...
52
+ # # ... ... ... ... ... ... ... ...
53
def call(variable)
  @variable = variable.to_s

  validate_params

  # Use public_send rather than send: `send` also reaches private methods
  # such as Kernel#exit or Kernel#system, so an R variable with one of those
  # names would run the Ruby method instead of being looked up in R.
  process_dataframe(@instance.public_send(@variable.to_sym))
end
60
+
61
+ private
62
+
63
# Ensures that @variable names a data.frame among the loaded R objects.
#
# @return [nil] when the variable is a data.frame
# @raise [ArgumentError] when the variable has any other R type
def validate_params
  # Wrap in Array(): if R hands back a lone name as a plain String (assumed
  # to happen when exactly one data.frame is loaded - TODO confirm against
  # RSRuby's conversion mode), String#include? would do a substring match
  # and wrongly accept e.g. "ACS" for "ACS3".
  valid_r_dataframe_variables =
    Array(@instance.eval_R('Filter(function(x) is.data.frame(get(x)) , ls())'))
  return if valid_r_dataframe_variables.include?(@variable)

  variable_type = @instance.eval_R("typeof(#{@variable})")
  raise ArgumentError, "Expected the given R variable (#{@variable}) to be a data.frame, got a "\
                       "#{variable_type} instead."
end
71
+ end
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,67 @@
1
+ require 'daru/io/importers/base'
2
+
3
+ module Daru
4
+ module IO
5
+ module Importers
6
+ # RDS Importer Class, that extends `read_rds` method to `Daru::DataFrame`
7
+ #
8
+ # @see Daru::IO::Importers::RData For .Rdata format
9
+ class RDS < Base
10
+ Daru::DataFrame.register_io_module :read_rds, self
11
+
12
+ # Checks for required gem dependencies of RDS Importer
13
def initialize
  # Make sure the optional `rsruby` gem can be loaded.
  optional_gem('rsruby')
end
16
+
17
+ # Reads data from an rds file
18
+ #
19
+ # @!method self.read(path)
20
+ #
21
+ # @param path [String] Path to rds file, where the dataframe is to be
22
+ # imported from.
23
+ #
24
+ # @return [Daru::IO::Importers::RDS]
25
+ #
26
+ # @example Reading from rds file
27
+ # instance = Daru::IO::Importers::RDS.read('bc_sites.rds')
28
def read(path)
  # The path is embedded in a single-quoted R string literal, so escape
  # backslashes and single quotes. Otherwise a path such as "o'brien.rds"
  # (or a Windows path) produces invalid or unintended R code.
  r_path = path.to_s.gsub(/[\\']/) { |char| "\\#{char}" }
  @instance = RSRuby.instance.eval_R("readRDS('#{r_path}')")
  self
end
32
+
33
+ # Imports a `Daru::DataFrame` from a RDS Importer instance and rds file
34
+ #
35
+ # @return [Daru::DataFrame]
36
+ #
37
+ # @example Reading from a RDS file
38
+ # df = instance.call
39
+ #
40
+ # #=> #<Daru::DataFrame(1113x25)>
41
+ # # area descriptio epa_reach format_ver latitude location location_c ...
42
+ # # 0 016 GSPTN NaN 4.1 49.5 THOR IS 2MS22016 T ...
43
+ # # 1 012 CSPT NaN 4.1 50.6167 MITC BY 2MN26012 M ...
44
+ # # ... ... ... ... ... ... ... ... ...
45
def call
  # @instance holds whatever `read` obtained from R for the rds file.
  loaded = @instance
  process_dataframe(loaded)
end
48
+
49
+ private
50
+
51
# Builds a Daru::DataFrame from the column => values mapping obtained from R.
#
# @param data [#map] pairs of column name and column values
# @return [Daru::DataFrame]
def process_dataframe(data)
  columns = data.map do |key, values|
    # Array() keeps a column that arrives as a bare scalar usable (assumed
    # to happen for single-row data frames - TODO confirm against RSRuby's
    # conversion rules); calling `map` on the scalar would raise.
    [key.to_sym, Array(values).map { |val| convert_datatype(val) }]
  end
  Daru::DataFrame.new(columns.to_h)
end
55
+
56
# Normalises a single cell value.
#
# @param value [Object] a cell value
# @return [nil, Float, Integer, Object] nil for NaN; a Float or Integer when
#   the value's text is exactly the text of that number; otherwise the value
#   itself
def convert_datatype(value)
  text = value.to_s
  return nil if text == 'NaN'
  # Values that cannot be coerced to numbers (true, false, symbols, ...)
  # are returned untouched; calling to_f on them would raise NoMethodError.
  return value unless value.respond_to?(:to_f) && value.respond_to?(:to_i)

  # Only convert when the round trip reproduces the text exactly, so strings
  # like "1e5" or "007" stay as they are.
  if text == value.to_f.to_s
    value.to_f
  elsif text == value.to_i.to_s
    value.to_i
  else
    value
  end
end
64
+ end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,135 @@
1
+ require 'daru/io/importers/base'
2
+
3
+ module Daru
4
+ module IO
5
+ module Importers
6
+ # Redis Importer Class, that extends `from_redis` method to `Daru::DataFrame`
7
+ class Redis < Base
8
+ Daru::DataFrame.register_io_module :from_redis, self
9
+
10
+ # Checks for required gem dependencies of Redis Importer
11
def initialize
  # JSON is loaded here, when an importer is created, followed by the check
  # for the optional `redis` gem.
  require 'json'
  optional_gem('redis')
end
15
+
16
+ # Loads data from a given connection
17
+ #
18
+ # @!method self.from(connection)
19
+ #
20
+ # @param connection [Hash or Redis Instance] Either a Hash of *Redis* configurations,
21
+ # or an existing *Redis* instance. For the hash configurations, have a
22
+ # look at
23
+ # [Redis#initialize](http://www.rubydoc.info/github/redis/redis-rb/Redis:initialize).
24
+ #
25
+ # @return [Daru::IO::Importers::Redis]
26
+ #
27
+ # @example Loading from a hash
28
+ # instance = Daru::IO::Importers::Redis.from({url: "redis://:[password]@[hostname]:[port]/[db]"})
29
+ #
30
+ # @example Loading from a Redis connection
31
+ # instance = Daru::IO::Importers::Redis.from(Redis.new({url: "redis://:[password]@[hostname]:[port]/[db]"}))
32
def from(connection={})
  # Resolve the argument into a client, remember it for `call`, and hand
  # back the importer itself so calls can be chained.
  tap { @client = get_client(connection) }
end
36
+
37
+ # Imports a `Daru::DataFrame` from a Redis Importer instance
38
+ #
39
+ # @param keys [Array] Redis key(s) from which the `Daru::DataFrame`
40
+ # should be constructed. If no keys are given, all keys in the *Redis*
41
+ # connection will be used.
42
+ # @param match [String] A pattern to get matching keys.
43
+ # @param count [Integer] Number of matching keys to be obtained. Defaults to
44
+ # nil, to collect ALL matching keys.
45
+ #
46
+ # @return [Daru::DataFrame]
47
+ #
48
+ # @example Importing with no options
49
+ # # Say, the Redis connection has this setup
50
+ # # Key "10001" => { "name" => "Tyrion", "age" => 32 }.to_json
51
+ # # Key "10002" => { "name" => "Jamie", "age" => 37 }.to_json
52
+ # # Key "10003" => { "name" => "Cersei", "age" => 37 }.to_json
53
+ # # Key "10004" => { "name" => "Joffrey", "age" => 19 }.to_json
54
+ #
55
+ # df = instance.call
56
+ #
57
+ # #=> <Daru::DataFrame(4x2)>
58
+ # # name age
59
+ # # 10001 Tyrion 32
60
+ # # 10002 Jamie 37
61
+ # # 10003 Cersei 37
62
+ # # 10004 Joffrey 19
63
+ #
64
+ # @example Importing with keys
65
+ # # Say, the Redis connection has this setup
66
+ # # Key "10001" => { "name" => "Tyrion", "age" => 32 }.to_json
67
+ # # Key "10002" => { "name" => "Jamie", "age" => 37 }.to_json
68
+ # # Key "10003" => { "name" => "Cersei", "age" => 37 }.to_json
69
+ # # Key "10004" => { "name" => "Joffrey", "age" => 19 }.to_json
70
+ #
71
+ # df = instance.call("10001", "10002")
72
+ #
73
+ # #=> <Daru::DataFrame(2x2)>
74
+ # # name age
75
+ # # 10001 Tyrion 32
76
+ # # 10002 Jamie 37
77
+ #
78
+ # @example Importing with query for matching keys and count
79
+ # # Say, the Redis connection has this setup
80
+ # # Key "key:1" => { "name" => "name1", "age" => "age1" }.to_json
81
+ # # Key "key:2" => { "name" => "name2", "age" => "age2" }.to_json
82
+ # # Key "key:3" => { "name" => "name3", "age" => "age3" }.to_json
83
+ # # ...
84
+ # # Key "key:2000" => { "name" => "name2000", "age" => "age2000" }.to_json
85
+ #
86
+ # df = instance.call(match: "key:1*", count: 200)
87
+ #
88
+ # #=> #<Daru::DataFrame(200x2)>
89
+ # # name age
90
+ # # key:1927 name1927 age1927
91
+ # # key:1759 name1759 age1759
92
+ # # key:1703 name1703 age1703
93
+ # # key:1640 name1640 age1640
94
+ # # ... ... ...
95
def call(*keys, match: nil, count: nil)
  @match, @count, @keys = match, count, keys
  # Explicit keys are used as given; otherwise they are discovered by scanning.
  @keys = choose_keys(*@keys).map(&:to_sym)

  # Each key's stored value is parsed as JSON with symbolized names.
  parsed = @keys.map do |key|
    ::JSON.parse(@client.get(key), symbolize_names: true)
  end
  Base.guess_parse(@keys, parsed)
end
104
+
105
+ private
106
+
107
# Decides which Redis keys to import.
#
# @param keys [Array] explicitly requested keys; returned as-is when present
# @return [Array] the given keys, or the keys found by scanning with @match,
#   truncated to @count entries when @count is set
def choose_keys(*keys)
  return keys unless keys.empty?

  # A SCAN iteration starts at cursor 0. Passing nil only works with clients
  # that happen to serialise it as an empty string.
  cursor = 0
  # Page through Redis#scan until the server reports cursor '0' again or
  # enough keys have been collected.
  until cursor == '0' || (!@count.nil? && keys.count > (@count - 1))
    cursor, chunk = @client.scan(cursor, match: @match, count: @count)
    keys.concat(chunk).uniq!
  end
  return keys if @count.nil?

  keys[0..@count - 1]
end
119
+
120
# Resolves the connection argument into a Redis client.
#
# @param connection [Hash, Redis] Redis configuration or an existing client
# @return [Redis] the given client, or a new one built from the Hash
# @raise [ArgumentError] for any other argument type
def get_client(connection)
  return connection if connection.is_a?(::Redis)
  return ::Redis.new(connection) if connection.is_a?(Hash)

  raise ArgumentError, "Expected '#{connection}' to be either "\
                       'a Hash or an initialized Redis instance, '\
                       "but received #{connection.class} instead."
end
132
+ end
133
+ end
134
+ end
135
+ end