daru-io 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +2 -0
- data/.rspec_formatter.rb +24 -0
- data/.rubocop.yml +109 -0
- data/.travis.yml +30 -0
- data/.yardopts +2 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/CONTRIBUTING.md +65 -0
- data/Gemfile +20 -0
- data/Guardfile +7 -0
- data/LICENSE.md +21 -0
- data/README.md +654 -0
- data/Rakefile +12 -0
- data/daru-io.gemspec +39 -0
- data/lib/daru/io.rb +3 -0
- data/lib/daru/io/base.rb +45 -0
- data/lib/daru/io/exporters.rb +1 -0
- data/lib/daru/io/exporters/avro.rb +96 -0
- data/lib/daru/io/exporters/base.rb +54 -0
- data/lib/daru/io/exporters/csv.rb +103 -0
- data/lib/daru/io/exporters/excel.rb +148 -0
- data/lib/daru/io/exporters/json.rb +570 -0
- data/lib/daru/io/exporters/r_data.rb +66 -0
- data/lib/daru/io/exporters/rds.rb +79 -0
- data/lib/daru/io/exporters/sql.rb +55 -0
- data/lib/daru/io/importers.rb +1 -0
- data/lib/daru/io/importers/active_record.rb +75 -0
- data/lib/daru/io/importers/avro.rb +54 -0
- data/lib/daru/io/importers/base.rb +62 -0
- data/lib/daru/io/importers/csv.rb +190 -0
- data/lib/daru/io/importers/excel.rb +99 -0
- data/lib/daru/io/importers/excelx.rb +138 -0
- data/lib/daru/io/importers/html.rb +144 -0
- data/lib/daru/io/importers/json.rb +152 -0
- data/lib/daru/io/importers/mongo.rb +139 -0
- data/lib/daru/io/importers/plaintext.rb +97 -0
- data/lib/daru/io/importers/r_data.rb +74 -0
- data/lib/daru/io/importers/rds.rb +67 -0
- data/lib/daru/io/importers/redis.rb +135 -0
- data/lib/daru/io/importers/sql.rb +127 -0
- data/lib/daru/io/link.rb +80 -0
- data/lib/daru/io/version.rb +5 -0
- metadata +269 -0
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'daru/io/importers/base'
|
2
|
+
|
3
|
+
module Daru
|
4
|
+
module IO
|
5
|
+
module Importers
|
6
|
+
# Excel Importer Class, that extends `read_excel` method to `Daru::DataFrame`
|
7
|
+
#
|
8
|
+
# @see Daru::IO::Importers::Excelx For .xlsx format
|
9
|
+
class Excel < Base
|
10
|
+
# Routes `Daru::DataFrame.read_excel` by file extension: a path ending in
# '.xlsx' is handed to the Excelx importer (required on demand), any other
# path is read by this Excel importer. The first argument is the path, the
# remaining arguments are forwarded to the importer's constructor.
Daru::DataFrame.register_io_module :read_excel do |*args, &io_block|
  path, *init_args = args
  importer =
    if path.end_with?('.xlsx')
      require 'daru/io/importers/excelx'
      Daru::IO::Importers::Excelx
    else
      Daru::IO::Importers::Excel
    end
  importer.new(*init_args, &io_block).read(*path)
end
|
18
|
+
|
19
|
+
# Checks for required gem dependencies of Excel Importer
|
20
|
+
def initialize
  # Delegates the dependency check for the `spreadsheet` gem to `optional_gem`.
  optional_gem('spreadsheet', '~> 1.1.1')
end
|
23
|
+
|
24
|
+
# Reads from an excel (.xls) file
|
25
|
+
#
|
26
|
+
# @!method self.read(path)
|
27
|
+
#
|
28
|
+
# @param path [String] Path of Excel file, where the DataFrame is to be imported from.
|
29
|
+
#
|
30
|
+
# @return [Daru::IO::Importers::Excel]
|
31
|
+
#
|
32
|
+
# @example Reading from an excel file
|
33
|
+
# instance = Daru::IO::Importers::Excel.read("test_xls.xls")
|
34
|
+
def read(path)
  # The opened workbook is kept on the instance for a later #call; the
  # importer itself is returned so that calls can be chained.
  tap { @file_data = Spreadsheet.open(path) }
end
|
38
|
+
|
39
|
+
# Imports a `Daru::DataFrame` from an Excel Importer instance
|
40
|
+
#
|
41
|
+
# @param worksheet_id [Integer] The index of the worksheet in the excel file,
|
42
|
+
# from where the `Daru::DataFrame` will be imported. By default, the first
|
43
|
+
# worksheet has `:worksheet_id` as 0. In general, the n-th worksheet has
|
44
|
+
# its worksheet_id as n-1.
|
45
|
+
#
|
46
|
+
# If worksheet_id option is not given, it is taken as 0 by default and the
|
47
|
+
# `Daru::DataFrame` will be imported from the first worksheet in the excel file.
|
48
|
+
# @param headers [Boolean] Defaults to true. When set to true, first row of the
|
49
|
+
# given worksheet_id is used as the order of the Daru::DataFrame and data of
|
50
|
+
# the Dataframe consists of the remaining rows.
|
51
|
+
#
|
52
|
+
# @return [Daru::DataFrame]
|
53
|
+
#
|
54
|
+
# default_instance = Daru::IO::Importers::Excel.new
|
55
|
+
#
|
56
|
+
# @example Importing from a default worksheet
|
57
|
+
# df = instance.call
|
58
|
+
#
|
59
|
+
# #=> #<Daru::DataFrame(6x5)>
|
60
|
+
# # id name age city a1
|
61
|
+
# # 0 1 Alex 20 New York a,b
|
62
|
+
# # 1 2 Claude 23 London b,c
|
63
|
+
# # 2 3 Peter 25 London a
|
64
|
+
# # 3 4 Franz nil Paris nil
|
65
|
+
# # 4 5 George 5.5 Tome a,b,c
|
66
|
+
# # 5 6 Fernand nil nil nil
|
67
|
+
#
|
68
|
+
# @example Importing from a specific worksheet
|
69
|
+
# df = instance.call(worksheet_id: 0)
|
70
|
+
#
|
71
|
+
# #=> #<Daru::DataFrame(6x5)>
|
72
|
+
# # id name age city a1
|
73
|
+
# # 0 1 Alex 20 New York a,b
|
74
|
+
# # 1 2 Claude 23 London b,c
|
75
|
+
# # 2 3 Peter 25 London a
|
76
|
+
# # 3 4 Franz nil Paris nil
|
77
|
+
# # 4 5 George 5.5 Tome a,b,c
|
78
|
+
# # 5 6 Fernand nil nil nil
|
79
|
+
def call(worksheet_id: 0, headers: true)
  worksheet = @file_data.worksheet(worksheet_id)
  first_row = worksheet.row(0)
  # `headers` is only a flag. The column names get their own variable so the
  # flag is still meaningful while the columns are copied below; reusing the
  # name made the flag always truthy and dropped the first row even when
  # `headers: false` was requested.
  names = if headers
            ArrayHelper.recode_repeated(first_row).map(&:to_sym)
          else
            (0...first_row.to_a.size).to_a
          end

  df = Daru::DataFrame.new({})
  names.each_with_index do |name, i|
    col = worksheet.column(i).to_a
    # Remove the header cell only when the first row really is a header row.
    col.delete_at(0) if headers
    df[name] = col
  end

  df
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'daru/io/importers/base'
|
2
|
+
|
3
|
+
module Daru
|
4
|
+
module IO
|
5
|
+
module Importers
|
6
|
+
# Excelx Importer Class, that handles .xlsx files in the Excel Importer
|
7
|
+
#
|
8
|
+
# @see Daru::IO::Importers::Excel For .xls format
|
9
|
+
class Excelx < Base
|
10
|
+
# Checks for required gem dependencies of Excelx Importer
|
11
|
+
def initialize
  # Delegates the dependency check for the `roo` gem to `optional_gem`.
  optional_gem('roo', '~> 2.7.0')
end
|
14
|
+
|
15
|
+
# Reads from an excelx (xlsx) file
|
16
|
+
#
|
17
|
+
# @!method self.read(path)
|
18
|
+
#
|
19
|
+
# @param path [String] Local / Remote path of xlsx file, where the DataFrame is
|
20
|
+
# to be imported from.
|
21
|
+
#
|
22
|
+
# @return [Daru::IO::Importers::Excelx]
|
23
|
+
#
|
24
|
+
# @example Reading from a local xlsx file
|
25
|
+
# local_instance = Daru::IO::Importers::Excelx.read("Stock-counts-sheet.xlsx")
|
26
|
+
#
|
27
|
+
# @example Reading from a remote xlsx file
|
28
|
+
# url = "https://www.exact.com/uk/images/downloads/getting-started-excel-sheets/Stock-counts-sheet.xlsx"
|
29
|
+
# remote_instance = Daru::IO::Importers::Excelx.read(url)
|
30
|
+
def read(path)
  # The Roo workbook is kept on the instance for a later #call; the importer
  # itself is returned so that calls can be chained.
  tap { @file_data = Roo::Excelx.new(path) }
end
|
34
|
+
|
35
|
+
# Imports a `Daru::DataFrame` from an Excelx Importer instance
|
36
|
+
#
|
37
|
+
# @param sheet [Integer or String] Imports from a specific sheet
|
38
|
+
# @param skiprows [Integer] Skips the first `:skiprows` number of rows from the
|
39
|
+
# sheet being parsed.
|
40
|
+
# @param skipcols [Integer] Skips the first `:skipcols` number of columns from the
|
41
|
+
# sheet being parsed.
|
42
|
+
# @param order [Boolean] Defaults to true. When set to true, first row of the
|
43
|
+
# given sheet is used as the order of the Daru::DataFrame and data of
|
44
|
+
# the Dataframe consists of the remaining rows.
|
45
|
+
# @param index [Boolean] Defaults to false. When set to true, first column of the
|
46
|
+
# given sheet is used as the index of the Daru::DataFrame and data of
|
47
|
+
# the Dataframe consists of the remaining columns.
|
48
|
+
#
|
49
|
+
# When set to false, a default order (0 to n-1) is chosen for the DataFrame,
|
50
|
+
# and the data of the DataFrame consists of all rows in the sheet.
|
51
|
+
#
|
52
|
+
# @return [Daru::DataFrame]
|
53
|
+
#
|
54
|
+
# @example Importing from specific sheet
|
55
|
+
# df = local_instance.call(sheet: 'Example Stock Counts')
|
56
|
+
#
|
57
|
+
# #=> <Daru::DataFrame(15x7)>
|
58
|
+
# # Status Stock coun Item code New Descriptio Stock coun Offset G/L
|
59
|
+
# # 0 H 1 nil nil New stock 2014-08-01 nil
|
60
|
+
# # 1 nil 1 IND300654 2 New stock 2014-08-01 51035
|
61
|
+
# # 2 nil 1 IND43201 5 New stock 2014-08-01 51035
|
62
|
+
# # 3 nil 1 OUT30045 3 New stock 2014-08-01 51035
|
63
|
+
# # ... ... ... ... ... ... ... ...
|
64
|
+
#
|
65
|
+
# @example Importing from a remote URL and default sheet
|
66
|
+
# df = remote_instance.call
|
67
|
+
#
|
68
|
+
# #=> <Daru::DataFrame(15x7)>
|
69
|
+
# # Status Stock coun Item code New Descriptio Stock coun Offset G/L
|
70
|
+
# # 0 H 1 nil nil New stock 2014-08-01 nil
|
71
|
+
# # 1 nil 1 IND300654 2 New stock 2014-08-01 51035
|
72
|
+
# # 2 nil 1 IND43201 5 New stock 2014-08-01 51035
|
73
|
+
# # 3 nil 1 OUT30045 3 New stock 2014-08-01 51035
|
74
|
+
# # ... ... ... ... ... ... ... ...
|
75
|
+
#
|
76
|
+
# @example Importing without headers
|
77
|
+
#   df = local_instance.call(sheet: 'Example Stock Counts', order: false)
|
78
|
+
#
|
79
|
+
# #=> <Daru::DataFrame(16x7)>
|
80
|
+
# # 0 1 2 3 4 5 6
|
81
|
+
# # 0 Status Stock coun Item code New Descriptio Stock coun Offset G/L
|
82
|
+
# # 1 H 1 nil nil New stock 2014-08-01 nil
|
83
|
+
# # 2 nil 1 IND300654 2 New stock 2014-08-01 51035
|
84
|
+
# # 3 nil 1 IND43201 5 New stock 2014-08-01 51035
|
85
|
+
# # 4 nil 1 OUT30045 3 New stock 2014-08-01 51035
|
86
|
+
# # ... ... ... ... ... ... ... ...
|
87
|
+
def call(sheet: 0, skiprows: 0, skipcols: 0, order: true, index: false)
  @order = order
  @index = index
  worksheet = @file_data.sheet(sheet)
  @data = strip_html_tags(skip_data(worksheet.to_a, skiprows, skipcols))
  @index = process_index
  @order = process_order
  # process_data decides which leading row / column to drop from the
  # truthiness of @order and @index, so it has to run while @order is still
  # nil for `order: false`. Filling in the fallback range first made it look
  # like a header row was present and silently dropped the first data row.
  @data = process_data
  # The fallback order is sized from the final data, so an extracted index
  # column is no longer counted.
  @order ||= (0..@data.first.length-1)

  Daru::DataFrame.rows(@data, order: @order, index: @index)
end
|
98
|
+
|
99
|
+
private
|
100
|
+
|
101
|
+
# Returns @data without the header row (when @order is set) and without the
# index column (when @index is set). With neither set, @data is returned as is.
def process_data
  rows_to_skip = @order ? 1 : 0
  cols_to_skip = @index ? 1 : 0
  return @data if rows_to_skip.zero? && cols_to_skip.zero?

  skip_data(@data, rows_to_skip, cols_to_skip)
end
|
107
|
+
|
108
|
+
# Replaces the @index flag with the first column of @data, minus its header
# cell when @order is set. Returns nil when no index was requested.
def process_index
  return nil unless @index

  @index = @data.transpose.first
  return @index unless @order

  @index = skip_data(@index, 1)
end
|
114
|
+
|
115
|
+
# Replaces the @order flag with the first row of @data, minus its leading
# cell when @index is set. Returns nil when no order was requested.
def process_order
  return nil unless @order

  @order = @data.first
  return @order unless @index

  @order = skip_data(@order, 1)
end
|
121
|
+
|
122
|
+
# Drops the first `rows` entries of `data`. When `cols` is given, `data` is
# treated as an array of rows and the first `cols` cells of every remaining
# row are dropped as well.
def skip_data(data, rows, cols=nil)
  remaining = data[rows..-1]
  return remaining if cols.nil?

  remaining.map { |row| row[cols..-1] }
end
|
126
|
+
|
127
|
+
# Returns a copy of the row matrix in which every String cell has its
# `<...>` tag sequences removed; non-String cells are passed through.
def strip_html_tags(data)
  data.map do |row|
    row.map { |cell| cell.is_a?(String) ? cell.gsub(/<[^>]+>/, '') : cell }
  end
end
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
require 'daru/io/importers/base'
|
2
|
+
|
3
|
+
module Daru
|
4
|
+
module IO
|
5
|
+
module Importers
|
6
|
+
# HTML Importer Class, that extends `read_html` method to `Daru::DataFrame`
|
7
|
+
#
|
8
|
+
# @note
|
9
|
+
# Please note that this module works only for static table elements on a
|
10
|
+
# HTML page, and won't work in cases where the data is being loaded into
|
11
|
+
# the HTML table by inline Javascript.
|
12
|
+
class HTML < Base
|
13
|
+
# Registers this class with `Daru::DataFrame` under the name `:read_html`.
Daru::DataFrame.register_io_module :read_html, self
|
14
|
+
|
15
|
+
# Checks for required gem dependencies of HTML Importer
|
16
|
+
def initialize
  # open-uri is loaded here, on construction, rather than at file load time.
  require('open-uri')
  # Delegates the dependency check for the `nokogiri` gem to `optional_gem`.
  optional_gem('nokogiri')
end
|
20
|
+
|
21
|
+
# Reads from a html file / website
|
22
|
+
#
|
23
|
+
# @!method self.read(path)
|
24
|
+
#
|
25
|
+
# @param path [String] Website URL / path to HTML file, where the
|
26
|
+
# DataFrame is to be imported from.
|
27
|
+
#
|
28
|
+
# @return [Daru::IO::Importers::HTML]
|
29
|
+
#
|
30
|
+
# @example Reading from a website url file
|
31
|
+
# instance = Daru::IO::Importers::HTML.read('http://www.moneycontrol.com/')
|
32
|
+
def read(path)
  # Kernel#open no longer opens URLs on Ruby 3.0+. open-uri's URI.open
  # (public since Ruby 2.5) accepts both URLs and local paths; on older
  # Rubies, where URI.open is not public, Kernel#open is still used.
  # The block form closes the underlying handle once the content is read.
  markup = URI.respond_to?(:open) ? URI.open(path, &:read) : open(path, &:read)
  @file_data = Nokogiri.parse(markup)
  self
end
|
36
|
+
|
37
|
+
# Imports Array of `Daru::DataFrame`s from a HTML Importer instance
|
38
|
+
#
|
39
|
+
# @param match [String] A `String` to match and choose a particular table(s)
|
40
|
+
# from multiple tables of a HTML page.
|
41
|
+
# @param index [Array or Daru::Index or Daru::MultiIndex] If given, it
|
42
|
+
# overrides the parsed index. Have a look at `:index` option, at
|
43
|
+
# {http://www.rubydoc.info/gems/daru/0.1.5/Daru%2FDataFrame:initialize
|
44
|
+
# Daru::DataFrame#initialize}
|
45
|
+
# @param order [Array or Daru::Index or Daru::MultiIndex] If given, it
|
46
|
+
# overrides the parsed order. Have a look at `:order` option
|
47
|
+
# [here](http://www.rubydoc.info/gems/daru/0.1.5/Daru%2FDataFrame:initialize)
|
48
|
+
# @param name [String] As `name` of the imported `Daru::DataFrame` isn't
|
49
|
+
# parsed automatically by the module, users can set the name attribute to
|
50
|
+
# their `Daru::DataFrame` manually, through this option.
|
51
|
+
#
|
52
|
+
# See `:name` option
|
53
|
+
# [here](http://www.rubydoc.info/gems/daru/0.1.5/Daru%2FDataFrame:initialize)
|
54
|
+
#
|
55
|
+
# @return [Array<Daru::DataFrame>]
|
56
|
+
#
|
57
|
+
# @example Importing with matching tables
|
58
|
+
# list_of_dfs = instance.call(match: 'Sun Pharma')
|
59
|
+
# list_of_dfs.count
|
60
|
+
# #=> 4
|
61
|
+
#
|
62
|
+
# df = list_of_dfs.first
|
63
|
+
#
|
64
|
+
# # As the website keeps changing everyday, the output might not be exactly
|
65
|
+
# # the same as the one obtained below. Nevertheless, a Daru::DataFrame
|
66
|
+
# # should be obtained (as long as 'Sun Pharma' is there on the website).
|
67
|
+
#
|
68
|
+
# #=> <Daru::DataFrame(5x4)>
|
69
|
+
# # Company Price Change Value (Rs
|
70
|
+
# # 0 Sun Pharma 502.60 -65.05 2,117.87
|
71
|
+
# # 1 Reliance 1356.90 19.60 745.10
|
72
|
+
# # 2 Tech Mahin 379.45 -49.70 650.22
|
73
|
+
# # 3 ITC 315.85 6.75 621.12
|
74
|
+
# # 4 HDFC 1598.85 50.95 553.91
|
75
|
+
def call(match: nil, order: nil, index: nil, name: nil)
  @match   = match
  @options = {name: name, index: index, order: order}

  # Tables that cannot be parsed come back as nil and are discarded.
  parsed = @file_data.search('table').map { |table| parse_table(table) }.compact
  # Keep only tables that fit the user-supplied order / index sizes and
  # contain the match string.
  eligible = parsed.select { |table| satisfy_dimension(table) && search(table) }
  # User-supplied options take precedence over the scraped values.
  merged = eligible.map { |table| decide_values(table, @options) }
  merged.map { |table| table_to_dataframe(table) }
end
|
87
|
+
|
88
|
+
private
|
89
|
+
|
90
|
+
# Allows user to override the scraped order / index / data
|
91
|
+
# Combines the scraped table hash with the user options into a new hash.
# For a key present in both, the user value is used unless it is nil/false;
# keys that exist only in `user_val` are added as given.
def decide_values(scraped_val, user_val)
  user_val.each_with_object(scraped_val.dup) do |(key, user), merged|
    merged[key] = scraped_val.key?(key) ? (user || scraped_val[key]) : user
  end
end
|
94
|
+
|
95
|
+
# Splits headers (all th tags) into order and index. Wherein,
|
96
|
+
# Order : All <th> tags on first proper row of HTML table
|
97
|
+
# index : All <th> tags on first proper column of HTML table
|
98
|
+
# Splits the scraped <th> rows into [order, index].
# The first row holding `headers_size` cells supplies the order (its last
# `size` cells); the cells of all later rows, flattened, form the index,
# or nil when there are none.
def parse_hash(headers, size, headers_size)
  order_row = headers.index { |cells| cells.count == headers_size }
  full_row  = headers[order_row]
  order     = full_row[(full_row.count - size)..-1]
  indice    = headers.drop(order_row + 1).flatten
  [order, indice.empty? ? nil : indice]
end
|
107
|
+
|
108
|
+
# Turns one <table> node into `{data:, index:, order:}`, or nil when the
# table cannot be interpreted (no rows, no usable header row, or an index
# whose length disagrees with the number of data rows).
def parse_table(table)
  headers, headers_size = scrape_tag(table, 'th')
  data, size = scrape_tag(table, 'td')
  # A table without any <tr> has no rows to measure, so both sizes are nil;
  # treat it as unparseable instead of failing on `nil >= nil`.
  return if headers_size.nil? || size.nil?

  # Only rows that have the full number of <td> cells count as data rows.
  data = data.select { |row| row.count == size }
  order, indice = parse_hash(headers, size, headers_size) if headers_size >= size
  return if order.nil? || order.empty?
  return unless indice.nil? || indice.count == data.count

  {data: data.compact, index: indice, order: order}
end
|
116
|
+
|
117
|
+
# For every <tr> of `table`, collects the stripped text of its `tag` cells.
# Returns [rows, widest_row_size]; the size is nil when there are no rows.
def scrape_tag(table, tag)
  rows = table.search('tr').map do |row|
    row.search(tag).map { |cell| cell.text.strip }
  end
  [rows, rows.map(&:count).max]
end
|
122
|
+
|
123
|
+
# Checks the parsed table against the user-supplied options: a given order
# must match the width of the first data row, and a given index must match
# the number of data rows. Options that were not given are not checked.
def satisfy_dimension(table)
  order = @options[:order]
  index = @options[:index]

  order_fits = !order || table[:data].first.size == order.size
  return false unless order_fits

  !index || table[:data].size == index.size
end
|
128
|
+
|
129
|
+
# True when no match string was requested, or when the string form of the
# parsed table contains the requested match string.
def search(table)
  @match.nil? || table.to_s.include?(@match)
end
|
132
|
+
|
133
|
+
# Builds a DataFrame from the rows under `table[:data]`, passing the
# table's :index, :order and :name entries as options.
def table_to_dataframe(table)
  options = {index: table[:index], order: table[:order], name: table[:name]}
  Daru::DataFrame.rows(table[:data], **options)
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
require 'daru/io/importers/base'
|
2
|
+
|
3
|
+
module Daru
|
4
|
+
module IO
|
5
|
+
module Importers
|
6
|
+
# JSON Importer Class, that extends `from_json` and `read_json` methods
|
7
|
+
# to `Daru::DataFrame`
|
8
|
+
class JSON < Base
|
9
|
+
# Registers this class with `Daru::DataFrame` under both `:from_json` and
# `:read_json`, in that order.
%i[from_json read_json].each do |function|
  Daru::DataFrame.register_io_module(function, self)
end
|
11
|
+
|
12
|
+
# Checks for required gem dependencies of JSON Importer
|
13
|
+
def initialize
  # Standard-library pieces are loaded on construction.
  %w[open-uri json].each { |lib| require lib }
  # Delegates the dependency check for the `jsonpath` gem to `optional_gem`.
  optional_gem('jsonpath')
end
|
18
|
+
|
19
|
+
# Reads data from a json file / remote json response
|
20
|
+
#
|
21
|
+
# @!method self.read(path)
|
22
|
+
#
|
23
|
+
# @param path [String] Local / Remote path to json file, where the dataframe is to be imported
|
24
|
+
# from.
|
25
|
+
#
|
26
|
+
# @return [Daru::IO::Importers::JSON]
|
27
|
+
#
|
28
|
+
# @example Reading from simply nested remote json response
|
29
|
+
# url = 'https://data.nasa.gov/resource/2vr3-k9wn.json'
|
30
|
+
# simple_read_instance = Daru::IO::Importers::JSON.read(url)
|
31
|
+
#
|
32
|
+
# @example Reading from complexly nested remote json response
|
33
|
+
# url = 'http://api.tvmaze.com/singlesearch/shows?q=game-of-thrones&embed=episodes'
|
34
|
+
# complex_read_instance = Daru::IO::Importers::JSON.read(url)
|
35
|
+
def read(path)
  # Kernel#open no longer opens URLs on Ruby 3.0+. open-uri's URI.open
  # (public since Ruby 2.5) accepts both URLs and local paths; on older
  # Rubies, where URI.open is not public, Kernel#open is still used.
  # The block form closes the underlying handle once the content is read.
  raw = URI.respond_to?(:open) ? URI.open(path, &:read) : open(path, &:read)
  @file_data = ::JSON.parse(raw)
  # For files the parsed document is also the JSON-path source.
  @json = @file_data
  self
end
|
40
|
+
|
41
|
+
# Loads from a Ruby structure of Hashes and / or Arrays
|
42
|
+
#
|
43
|
+
# @!method self.from(instance)
|
44
|
+
#
|
45
|
+
# @param instance [Hash or Array] A simple / complexly nested JSON structure
|
46
|
+
#
|
47
|
+
# @return [Daru::IO::Importers::JSON]
|
48
|
+
#
|
49
|
+
# @example Loading from Ruby Hash of Arrays
|
50
|
+
# from_instance = Daru::IO::Importers::JSON.from({x: [1,4], y: [2,5], z: [3, 6]})
|
51
|
+
def from(instance)
  @file_data = instance
  # A String is parsed as JSON text; any other object is used as the
  # JSON structure directly.
  @json = case instance
          when String then ::JSON.parse(instance)
          else instance
          end
  self
end
|
56
|
+
|
57
|
+
# Imports a `Daru::DataFrame` from a JSON Importer instance
|
58
|
+
#
|
59
|
+
# @param columns [Array] JSON-path selectors to select specific fields
|
60
|
+
# from the JSON input.
|
61
|
+
# @param order [String or Array] Either a JSON-path selector string, or
|
62
|
+
# an array containing the order of the `Daru::DataFrame`.
|
63
|
+
# @param index [String or Array] Either a JSON-path selector string, or
|
64
|
+
#   an array containing the index of the `Daru::DataFrame`.
|
65
|
+
# @param named_columns [Hash] JSON-path selectors to select specific fields
|
66
|
+
# from the JSON input.
|
67
|
+
#
|
68
|
+
# @return [Daru::DataFrame]
|
69
|
+
#
|
70
|
+
# @note For more information on using JSON-path selectors, have a look at
|
71
|
+
# the explanations {http://www.rubydoc.info/gems/jsonpath/0.5.8 here}
|
72
|
+
# and {http://goessner.net/articles/JsonPath/ here}.
|
73
|
+
#
|
74
|
+
# @example Importing without jsonpath selectors
|
75
|
+
# df = simple_read_instance.call
|
76
|
+
#
|
77
|
+
# #=> #<Daru::DataFrame(202x10)>
|
78
|
+
# # designation discovery_ h_mag i_deg moid_au orbit_clas period_yr ...
|
79
|
+
# # 0 419880 (20 2011-01-07 19.7 9.65 0.035 Apollo 4.06 ...
|
80
|
+
# # 1 419624 (20 2010-09-17 20.5 14.52 0.028 Apollo 1 ...
|
81
|
+
# # 2 414772 (20 2010-07-28 19 23.11 0.333 Apollo 1.31 ...
|
82
|
+
# # ... ... ... ... ... ... ... ... ...
|
83
|
+
#
|
84
|
+
# @example Importing with jsonpath selectors
|
85
|
+
# df = complex_read_instance.call(
|
86
|
+
# "$.._embedded..episodes..name",
|
87
|
+
# "$.._embedded..episodes..season",
|
88
|
+
# "$.._embedded..episodes..number",
|
89
|
+
# index: (10..70).to_a,
|
90
|
+
# RunTime: "$.._embedded..episodes..runtime"
|
91
|
+
# )
|
92
|
+
#
|
93
|
+
# #=> #<Daru::DataFrame(61x4)>
|
94
|
+
# # name season number RunTime
|
95
|
+
# # 10 Winter is 1 1 60
|
96
|
+
# # 11 The Kingsr 1 2 60
|
97
|
+
# # 12 Lord Snow 1 3 60
|
98
|
+
# # ... ... ... ... ...
|
99
|
+
#
|
100
|
+
# @example Importing from `from` method
|
101
|
+
# df = from_instance.call
|
102
|
+
#
|
103
|
+
# #=> #<Daru::DataFrame(2x3)>
|
104
|
+
# # x y z
|
105
|
+
# # 0 1 2 3
|
106
|
+
# # 1 4 5 6
|
107
|
+
def call(*columns, order: nil, index: nil, **named_columns)
  init_opts(*columns, order: order, index: index, **named_columns)
  @data  = fetch_data
  # String values for index / order are resolved as JSON-path selectors.
  @index = at_jsonpath(@index)
  @order = at_jsonpath(@order)
  # Without an explicit order, unnamed columns are numbered by position and
  # named columns keep their keys.
  @order ||= @columns.each_index.to_a + @named_columns.keys

  Daru::DataFrame.new(@data, order: @order, index: @index)
end
|
116
|
+
|
117
|
+
private
|
118
|
+
|
119
|
+
# Evaluates a String argument as a JSON-path selector against @json;
# anything that is not a String is returned unchanged.
def at_jsonpath(jsonpath)
  return jsonpath unless jsonpath.is_a?(String)

  JsonPath.on(@json, jsonpath)
end
|
122
|
+
|
123
|
+
# Picks the data for the DataFrame:
# - no selectors at all: the whole JSON structure;
# - exactly one unnamed selector and no named ones: the entire dataset is
#   assumed to reside at that JSON-path;
# - otherwise: one resolved entry per selector, unnamed ones first.
def fetch_data
  selectors = @columns + @named_columns.values
  return @json if selectors.empty?
  return at_jsonpath(@columns.first) if @named_columns.empty? && @columns.size == 1

  selectors.map { |selector| at_jsonpath(selector) }
end
|
132
|
+
|
133
|
+
# Stores the call options on the instance, then runs validate_params on them.
def init_opts(*columns, order: nil, index: nil, **named_columns)
  @columns, @named_columns = columns, named_columns
  @order, @index = order, index

  validate_params
end
|
141
|
+
|
142
|
+
# Rejects calls that supply both an order and named columns.
#
# @raise [ArgumentError] when @order is not nil and @named_columns is not empty
def validate_params
  conflicting = !@order.nil? && !@named_columns.empty?
  return unless conflicting

  raise ArgumentError,
        'Do not pass on order and named columns together, at the same '\
        'function call. Please use only order or only named_columns.'
end
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|