daru 0.1.5 → 0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +21 -7
- data/.travis.yml +10 -5
- data/CONTRIBUTING.md +15 -10
- data/History.md +124 -2
- data/README.md +37 -9
- data/ReleasePolicy.md +20 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/statistics.rb +6 -6
- data/benchmarks/where_clause.rb +1 -1
- data/benchmarks/where_vs_filter.rb +1 -1
- data/daru.gemspec +17 -41
- data/lib/daru.rb +10 -13
- data/lib/daru/accessors/gsl_wrapper.rb +1 -1
- data/lib/daru/accessors/nmatrix_wrapper.rb +2 -0
- data/lib/daru/category.rb +29 -15
- data/lib/daru/configuration.rb +34 -0
- data/lib/daru/core/group_by.rb +158 -77
- data/lib/daru/core/merge.rb +12 -3
- data/lib/daru/core/query.rb +20 -4
- data/lib/daru/dataframe.rb +692 -118
- data/lib/daru/date_time/index.rb +14 -11
- data/lib/daru/date_time/offsets.rb +9 -1
- data/lib/daru/extensions/which_dsl.rb +55 -0
- data/lib/daru/formatters/table.rb +3 -5
- data/lib/daru/index/categorical_index.rb +4 -4
- data/lib/daru/index/index.rb +131 -42
- data/lib/daru/index/multi_index.rb +118 -10
- data/lib/daru/io/csv/converters.rb +21 -0
- data/lib/daru/io/io.rb +105 -33
- data/lib/daru/io/sql_data_source.rb +10 -0
- data/lib/daru/iruby/templates/dataframe.html.erb +4 -51
- data/lib/daru/iruby/templates/dataframe_mi.html.erb +3 -56
- data/lib/daru/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru/iruby/templates/vector.html.erb +3 -25
- data/lib/daru/iruby/templates/vector_mi.html.erb +3 -34
- data/lib/daru/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru/maths/arithmetic/vector.rb +38 -2
- data/lib/daru/maths/statistics/dataframe.rb +28 -30
- data/lib/daru/maths/statistics/vector.rb +295 -41
- data/lib/daru/plotting/gruff/dataframe.rb +13 -15
- data/lib/daru/plotting/nyaplot/category.rb +1 -1
- data/lib/daru/plotting/nyaplot/dataframe.rb +15 -4
- data/lib/daru/plotting/nyaplot/vector.rb +1 -2
- data/lib/daru/vector.rb +308 -96
- data/lib/daru/version.rb +1 -1
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/gsl_wrapper_spec.rb +38 -35
- data/spec/accessors/nmatrix_wrapper_spec.rb +25 -22
- data/spec/category_spec.rb +24 -20
- data/spec/core/group_by_spec.rb +238 -4
- data/spec/core/merge_spec.rb +1 -1
- data/spec/core/query_spec.rb +65 -50
- data/spec/daru_spec.rb +22 -0
- data/spec/dataframe_spec.rb +473 -16
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +34 -16
- data/spec/date_time/offsets_spec.rb +14 -0
- data/spec/extensions/rserve_spec.rb +1 -1
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +55 -55
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +29 -0
- data/spec/index/categorical_index_spec.rb +33 -33
- data/spec/index/index_spec.rb +160 -41
- data/spec/index/multi_index_spec.rb +143 -33
- data/spec/io/io_spec.rb +246 -2
- data/spec/io/sql_data_source_spec.rb +31 -41
- data/spec/iruby/dataframe_spec.rb +17 -19
- data/spec/iruby/vector_spec.rb +26 -28
- data/spec/maths/arithmetic/dataframe_spec.rb +1 -1
- data/spec/maths/arithmetic/vector_spec.rb +18 -0
- data/spec/maths/statistics/vector_spec.rb +153 -15
- data/spec/plotting/gruff/category_spec.rb +3 -3
- data/spec/plotting/gruff/dataframe_spec.rb +14 -4
- data/spec/plotting/gruff/vector_spec.rb +9 -9
- data/spec/plotting/nyaplot/category_spec.rb +5 -9
- data/spec/plotting/nyaplot/dataframe_spec.rb +95 -47
- data/spec/plotting/nyaplot/vector_spec.rb +5 -11
- data/spec/shared/vector_display_spec.rb +12 -14
- data/spec/spec_helper.rb +30 -7
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +306 -72
- metadata +96 -55
- data/spec/fixtures/stock_data.csv +0 -500
@@ -1,5 +1,5 @@
|
|
1
1
|
module Daru
|
2
|
-
class MultiIndex < Index
|
2
|
+
class MultiIndex < Index # rubocop:disable Metrics/ClassLength
|
3
3
|
# Iterates over the tuples of this index.
# The work is delegated to the collection returned by #to_a, so the return
# value is whatever that collection's #each returns for the given block.
def each(&block)
  tuples = to_a
  tuples.each(&block)
end
|
@@ -9,11 +9,74 @@ module Daru
|
|
9
9
|
end
|
10
10
|
|
11
11
|
attr_reader :labels
|
12
|
+
attr_reader :name
|
12
13
|
|
13
14
|
# Returns the keys of every level, one Array per level.
# @levels stores one lookup table per level; only the keys are exposed here.
def levels
  @levels.map { |level| level.keys }
end
|
16
17
|
|
18
|
+
# names and levels should be of same size. If size of Array `name` is less
|
19
|
+
# or greater than size of array `levels` then it raises `SizeError`.
|
20
|
+
# If the user doesn't want to name a particular level, the user must put an
|
21
|
+
# empty string in that index of Array `name`.
|
22
|
+
# For example, if there is a multi_index of 3 levels and the user doesn't want to name
|
23
|
+
# level 0, then do multi_index.name = ['', 'level1_name1', 'level2_name']
|
24
|
+
#
|
25
|
+
# @example
|
26
|
+
#
|
27
|
+
# # set the name during initialization
|
28
|
+
#
|
29
|
+
# mi = Daru::MultiIndex.new(
|
30
|
+
# levels: [[:a,:b,:c], [:one, :two]],
|
31
|
+
# labels: [[0,0,1,1,2,2], [0,1,0,1,0,1]], name: ['s1', 's2'])
|
32
|
+
#
|
33
|
+
# # =>
|
34
|
+
# # <Daru::MultiIndex(6x2)>
|
35
|
+
# # s1 s2
|
36
|
+
# # a one
|
37
|
+
# # two
|
38
|
+
# # b one
|
39
|
+
# # two
|
40
|
+
# # c one
|
41
|
+
# # two
|
42
|
+
#
|
43
|
+
# # set new name
|
44
|
+
#
|
45
|
+
# mi.name = ['k1', 'k2']
|
46
|
+
# => ["k1", "k2"]
|
47
|
+
#
|
48
|
+
# mi
|
49
|
+
# =>
|
50
|
+
# # #<Daru::MultiIndex(6x2)>
|
51
|
+
# # k1 k2
|
52
|
+
# # a one
|
53
|
+
# # two
|
54
|
+
# # b one
|
55
|
+
# # two
|
56
|
+
# # c one
|
57
|
+
# # two
|
58
|
+
#
|
59
|
+
# # access the name
|
60
|
+
#
|
61
|
+
# mi.name
|
62
|
+
# => ["k1", "k2"]
|
63
|
+
#
|
64
|
+
# # If you don't want to name level 0
|
65
|
+
#
|
66
|
+
# mi.name = ['', 'k2']
|
67
|
+
# => ["", "k2"]
|
68
|
+
#
|
69
|
+
# mi
|
70
|
+
# =>
|
71
|
+
# #<Daru::MultiIndex(6x2)>
|
72
|
+
# # k2
|
73
|
+
# # a one
|
74
|
+
# # two
|
75
|
+
# # b one
|
76
|
+
# # two
|
77
|
+
# # c one
|
78
|
+
# # two
|
79
|
+
#
|
17
80
|
def initialize opts={}
|
18
81
|
labels = opts[:labels]
|
19
82
|
levels = opts[:levels]
|
@@ -24,6 +87,12 @@ module Daru
|
|
24
87
|
|
25
88
|
@labels = labels
|
26
89
|
@levels = levels.map { |e| e.map.with_index.to_h }
|
90
|
+
self.name = opts[:name] unless opts[:name].nil?
|
91
|
+
end
|
92
|
+
|
93
|
+
# Assigns names to the levels of this index.
# The candidate names are checked by validate_name against @labels first;
# if that check raises, @name keeps its previous value.
def name=(names)
  validate_name(names, @labels)
  @name = names
end
|
28
97
|
|
29
98
|
def incorrect_fields?(_labels, levels)
|
@@ -84,7 +153,7 @@ module Daru
|
|
84
153
|
# Returns positions given indexes or positions
|
85
154
|
# @note If the argument is both a valid index and a valid position,
|
86
155
|
# it will be treated as a valid index
|
87
|
-
# @param [Array<object>]
|
156
|
+
# @param indexes [Array<object>] indexes or positions
|
88
157
|
# @example
|
89
158
|
# idx = Daru::MultiIndex.from_tuples [[:a, :one], [:a, :two], [:b, :one], [:b, :two]]
|
90
159
|
# idx.pos :a
|
@@ -109,7 +178,7 @@ module Daru
|
|
109
178
|
|
110
179
|
# Takes positional values and returns subset of the self
|
111
180
|
# capturing the indexes at mentioned positions
|
112
|
-
# @param [Array<Integer>] positional values
|
181
|
+
# @param positions [Array<Integer>] positional values
|
113
182
|
# @return [object] index object
|
114
183
|
# @example
|
115
184
|
# idx = Daru::MultiIndex.from_tuples [[:a, :one], [:a, :two], [:b, :one], [:b, :two]]
|
@@ -128,12 +197,12 @@ module Daru
|
|
128
197
|
end
|
129
198
|
|
130
199
|
# Builds a new MultiIndex from this index's tuples plus one extra tuple.
# All arguments together form the single tuple that is appended.
def add *indexes
  extended = [*to_a, indexes]
  Daru::MultiIndex.from_tuples(extended)
end
|
133
202
|
|
134
203
|
# Builds a new MultiIndex whose tuples are picked from this index in the
# order given by +new_order+ (each entry is used to subscript the tuples).
def reorder(new_order)
  tuples = to_a
  reordered = new_order.map { |position| tuples[position] }
  MultiIndex.from_tuples(reordered)
end
|
138
207
|
|
139
208
|
def try_retrieve_from_integer int
|
@@ -171,8 +240,42 @@ module Daru
|
|
171
240
|
end
|
172
241
|
end
|
173
242
|
|
243
|
+
# Drops the level at +layer_index+ from this index IN PLACE (levels, labels
# and, when present, the level name) and returns the result of coerce_index.
def remove_layer layer_index
  [@levels, @labels].each { |per_level| per_level.delete_at(layer_index) }
  @name.delete_at(layer_index) unless @name.nil?

  coerce_index
end
|
250
|
+
|
251
|
+
# Returns a single-level index when only one level is left, otherwise self.
# With one level the flattened tuples become a Daru::Index when all elements
# are distinct and a Daru::CategoricalIndex when some element repeats.
def coerce_index
  return self unless @levels.size == 1

  elements = to_a.flatten
  all_distinct = elements.uniq.length == elements.length
  index_class = all_distinct ? Daru::Index : Daru::CategoricalIndex
  index_class.new(elements)
end
|
264
|
+
|
265
|
+
# Array `name` must have same length as levels and labels.
|
266
|
+
# Checks that a candidate list of level names has one entry per level.
#
# @param names [Array] candidate names, one per level
# @param levels [Array] collection with one entry per level; only its size
#   is used (name= passes @labels)
# @raise [SizeError] when the sizes differ; when too few names are given the
#   message also explains how to leave a level unnamed
# @return [nil] when the sizes match
def validate_name names, levels
  return if names.size == levels.size

  # Report the size of the collection that is actually compared, not an
  # unrelated reader, so the message can never disagree with the check.
  error_msg = "'names' and 'levels' should be of same size. Size of the "\
    "'name' array is #{names.size} and size of the MultiIndex 'levels' and "\
    "'labels' is #{levels.size}."
  raise SizeError, error_msg if names.size > levels.size

  suggestion_msg = "If you don't want to set name for particular level " \
    "(say level 'i') then put empty string on index 'i' of the 'name' Array."
  raise SizeError, [error_msg, suggestion_msg].join("\n")
end
|
276
|
+
|
174
277
|
private :find_all_indexes, :multi_index_from_multiple_selections,
|
175
|
-
:retrieve_from_range, :retrieve_from_tuples
|
278
|
+
:retrieve_from_range, :retrieve_from_tuples, :validate_name
|
176
279
|
|
177
280
|
def key index
|
178
281
|
raise ArgumentError, "Key #{index} is too large" if index >= @labels[0].size
|
@@ -183,7 +286,7 @@ module Daru
|
|
183
286
|
end
|
184
287
|
|
185
288
|
# Returns a new MultiIndex built from copies of this index's levels, labels
# and (when set) level names. The copies are shallow: only the outer arrays
# are duplicated.
def dup
  name_copy = @name.nil? ? nil : @name.dup
  MultiIndex.new(levels: levels.dup, labels: labels.dup, name: name_copy)
end
|
188
291
|
|
189
292
|
def drop_left_level by=1
|
@@ -204,8 +307,9 @@ module Daru
|
|
204
307
|
|
205
308
|
# Tells whether the index contains the given tuple. The tuple may be partial:
# only its leading levels are compared.
#
# @param tuple [Enumerable] keys, one per leading level (nested arrays are
#   flattened)
# @return [Boolean] false for non-Enumerable arguments, for tuples with more
#   keys than the index has levels, and for keys not present in the index
def include? tuple
  return false unless tuple.is_a? Enumerable

  # Not every Enumerable implements #flatten (e.g. Range, Set).
  keys = tuple.respond_to?(:flatten) ? tuple.flatten : tuple.to_a.flatten
  # A tuple deeper than the index can never match; without this guard
  # @levels[i] is nil for the surplus keys and the lookup below raises.
  return false if keys.size > @levels.size

  # Translate each key into its per-level code, then look for that row of
  # codes among the leading label columns.
  codes = keys.each_with_index.map { |key, level| @levels[level][key] }
  @labels[0...keys.size].transpose.include?(codes)
end
|
210
314
|
|
211
315
|
def size
|
@@ -232,7 +336,7 @@ module Daru
|
|
232
336
|
|
233
337
|
# Renders the index as text: a "#<Daru::MultiIndex(SIZExWIDTH)>" banner
# followed by a table of the sparse tuples, headed by the level names.
# +threshold+ is forwarded unchanged to Formatters::Table.format.
def inspect threshold=20
  banner = "#<Daru::MultiIndex(#{size}x#{width})>\n"
  table = Formatters::Table.format(
    [], headers: @name, row_headers: sparse_tuples, threshold: threshold
  )
  banner + table
end
|
237
341
|
|
238
342
|
def to_html
|
@@ -262,5 +366,9 @@ module Daru
|
|
262
366
|
[nil] * (cur.size - left.size) + left.map(&:first)
|
263
367
|
}
|
264
368
|
end
|
369
|
+
|
370
|
+
# Converts the index into a DataFrame with one vector per level.
# Vectors are keyed by the level names; when the index has no names
# (@name is only set if a name was supplied) the level positions
# 0, 1, ... are used instead of raising NoMethodError on nil.
#
# @return [Daru::DataFrame]
def to_df
  names = @name || Array.new(@levels.size) { |position| position }
  Daru::DataFrame.new(names.zip(to_a.transpose).to_h)
end
|
265
373
|
end
|
266
374
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Daru
  module IO
    module CSV
      # Extra field converters offered in addition to the ones shipped with
      # Ruby's CSV library. Each converter takes the field and a second,
      # unused argument.
      CONVERTERS = {
        # Maps 'true' / 'false' (any case, surrounding whitespace ignored) to
        # the matching boolean; every other value is returned unchanged.
        boolean: lambda { |f, _|
          # nil and already-converted (non-String) fields pass through
          # instead of raising NoMethodError on #downcase.
          next f unless f.respond_to?(:downcase)

          case f.downcase.strip
          when 'true'
            true
          when 'false'
            false
          else
            f
          end
        },
        # Identity converter: keeps the field exactly as it was read.
        string: lambda { |f, _|
          f
        }
      }.freeze
    end
  end
end
|
data/lib/daru/io/io.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
module Daru
|
2
|
+
require_relative 'csv/converters.rb'
|
2
3
|
module IOHelpers
|
3
4
|
class << self
|
4
5
|
def process_row(row,empty)
|
@@ -39,14 +40,11 @@ module Daru
|
|
39
40
|
|
40
41
|
def from_excel path, opts={}
|
41
42
|
opts = {
|
42
|
-
worksheet_id: 0
|
43
|
+
worksheet_id: 0,
|
44
|
+
row_id: 0
|
43
45
|
}.merge opts
|
44
46
|
|
45
|
-
|
46
|
-
book = Spreadsheet.open path
|
47
|
-
worksheet = book.worksheet worksheet_id
|
48
|
-
headers = ArrayHelper.recode_repeated(worksheet.row(0)).map(&:to_sym)
|
49
|
-
|
47
|
+
worksheet, headers = read_from_excel(path, opts)
|
50
48
|
df = Daru::DataFrame.new({})
|
51
49
|
headers.each_with_index do |h,i|
|
52
50
|
col = worksheet.column(i).to_a
|
@@ -57,6 +55,18 @@ module Daru
|
|
57
55
|
df
|
58
56
|
end
|
59
57
|
|
58
|
+
# Opens the workbook at +path+ and returns the selected worksheet together
# with its header names.
# opts[:worksheet_id] selects the worksheet and opts[:row_id] the row that
# holds the headers; headers are passed through ArrayHelper.recode_repeated
# and converted to symbols.
#
# @return [Array] two elements: the worksheet and the Array of header symbols
def read_from_excel path, opts
  optional_gem 'spreadsheet', '~>1.1.1'

  worksheet = Spreadsheet.open(path).worksheet(opts[:worksheet_id])
  header_row = worksheet.row(opts[:row_id])
  headers = ArrayHelper.recode_repeated(header_row).map(&:to_sym)

  [worksheet, headers]
end
|
69
|
+
|
60
70
|
def dataframe_write_excel dataframe, path, _opts={}
|
61
71
|
book = Spreadsheet::Workbook.new
|
62
72
|
sheet = book.create_worksheet
|
@@ -76,7 +86,6 @@ module Daru
|
|
76
86
|
# Functions for loading/writing CSV files
|
77
87
|
def from_csv path, opts={}
|
78
88
|
daru_options, opts = from_csv_prepare_opts opts
|
79
|
-
|
80
89
|
# Preprocess headers for detecting and correcting repetition in
|
81
90
|
# case the :headers option is not specified.
|
82
91
|
hsh =
|
@@ -86,7 +95,6 @@ module Daru
|
|
86
95
|
from_csv_hash(path, opts)
|
87
96
|
.tap { |hash| daru_options[:order] = hash.keys }
|
88
97
|
end
|
89
|
-
|
90
98
|
Daru::DataFrame.new(hsh,daru_options)
|
91
99
|
end
|
92
100
|
|
@@ -111,11 +119,10 @@ module Daru
|
|
111
119
|
|
112
120
|
# Execute a query and create a data frame from the result
|
113
121
|
#
|
114
|
-
# @param
|
122
|
+
# @param db [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
|
115
123
|
# @param query [String] The query to be executed
|
116
124
|
#
|
117
125
|
# @return A dataframe containing the data resulting from the query
|
118
|
-
|
119
126
|
def from_sql(db, query)
|
120
127
|
require 'daru/io/sql_data_source'
|
121
128
|
SqlDataSource.make_dataframe(db, query)
|
@@ -135,23 +142,11 @@ module Daru
|
|
135
142
|
#
|
136
143
|
# @return A dataframe containing the data in the given relation
|
137
144
|
# Loads the rows of a relation into a DataFrame, one vector per field.
#
# @param relation [#klass, #pluck] relation to read from
# @param fields [Array<String, Symbol>] columns to load; defaults to
#   relation.klass.column_names when none are given
# @return A dataframe containing the data in the given relation
def from_activerecord(relation, *fields)
  fields = relation.klass.column_names if fields.empty?
  fields = fields.map(&:to_sym)

  rows = relation.pluck(*fields)
  # pluck returns a flat array of values when a single column is requested;
  # wrap each value so that the rows can be transposed like the general case.
  rows = rows.map { |value| [value] } if fields.size == 1
  # [].transpose is [], which would lose the columns of an empty relation;
  # build one empty column per field instead.
  columns = rows.empty? ? fields.map { [] } : rows.transpose

  Daru::DataFrame.new(columns, order: fields).tap(&:update)
end
|
156
151
|
|
157
152
|
# Loading data from plain text files
|
@@ -186,13 +181,34 @@ module Daru
|
|
186
181
|
end
|
187
182
|
end
|
188
183
|
|
184
|
+
# Scrapes every usable <table> of an HTML page into a DataFrame.
#
# @param path [String] location handed to Mechanize#get
# @param opts [Hash] :match keeps only tables whose scraped content includes
#   the given string; :data, :index, :name and :order override the scraped
#   values of every table
# @return [Array<Daru::DataFrame>] one DataFrame per table that was kept
def from_html path, opts={}
  optional_gem 'mechanize', '~>2.7.5'
  page = Mechanize.new.get(path)
  page.search('table')
      .map { |table| html_parse_table table }
      .compact
      .select { |table| html_search table, opts[:match] }
      # Each table gets its own copy of the user options: html_decide_values
      # fills missing keys in place, and sharing one hash would make every
      # later table inherit the first table's data, index and order.
      .map { |table| html_decide_values table, opts.dup }
      .map { |table| html_table_to_dataframe table }
end
|
193
|
+
|
189
194
|
private
|
190
195
|
|
191
|
-
|
196
|
+
# Activates and requires a gem that is only needed by some IO functions.
# When the gem cannot be loaded, an installation hint is passed to
# Daru.error instead of letting the LoadError propagate.
def optional_gem(name, version)
  gem(name, version)
  require(name)
rescue LoadError
  hint = "\nInstall the #{name} gem version #{version} for using #{name} functions."
  Daru.error(hint)
end
|
203
|
+
|
204
|
+
DARU_OPT_KEYS = %i[clone order index name].freeze
|
192
205
|
|
193
206
|
def from_csv_prepare_opts opts
|
194
207
|
opts[:col_sep] ||= ','
|
195
|
-
opts[:
|
208
|
+
opts[:skip_blanks] ||= true
|
209
|
+
opts[:converters] ||= [:numeric]
|
210
|
+
|
211
|
+
opts[:converters] = from_csv_prepare_converters(opts[:converters])
|
196
212
|
|
197
213
|
daru_options = opts.keys.each_with_object({}) do |k, hash|
|
198
214
|
hash[k] = opts.delete(k) if DARU_OPT_KEYS.include?(k)
|
@@ -200,11 +216,22 @@ module Daru
|
|
200
216
|
[daru_options, opts]
|
201
217
|
end
|
202
218
|
|
219
|
+
# Resolves converter names into converters.
# Each entry is looked up first in Ruby's ::CSV::Converters, then in
# Daru::IO::CSV::CONVERTERS; entries found in neither are kept as given.
# Lookups that yield an Array are spliced into the result.
def from_csv_prepare_converters(converters)
  Array(converters).flat_map do |converter|
    ::CSV::Converters[converter] || Daru::IO::CSV::CONVERTERS[converter] || converter
  end
end
|
230
|
+
|
203
231
|
# Parses a CSV source whose headers are handled by the CSV library and
# returns its columns as a Hash of header => values.
#
# @param path [String] source handed to Kernel#open
# @param opts [Hash] options for ::CSV.parse; :header_converters defaults to
#   :symbol and is written back into this hash
# @yield [table] the parsed table, before it is split into columns
# @return [Hash] column name => Array of column values
def from_csv_hash_with_headers(path, opts)
  opts[:header_converters] ||= :symbol
  # NOTE(review): Kernel#open runs a command when the path starts with "|";
  # do not pass untrusted paths.
  # The block form closes the handle once the content has been read.
  content = open(path, &:read)
  ::CSV
    .parse(content, **opts) # options must be keywords, as in from_csv_hash
    .tap { |table| yield table if block_given? }
    .by_col
    .map { |col_name, values| [col_name, values] }
    .to_h
end
|
@@ -212,15 +239,60 @@ module Daru
|
|
212
239
|
# Parses a CSV source without header handling by the CSV library: the first
# row is taken as the headers (passed through ArrayHelper.recode_repeated)
# and the remaining rows are turned into columns.
#
# @param path [String] source handed to Kernel#open
# @param opts [Hash] options for ::CSV.parse
# @yield [rows] the parsed rows, before the header row is removed
# @return [Hash] header => Array of column values
def from_csv_hash(path, opts)
  # NOTE(review): Kernel#open runs a command when the path starts with "|";
  # do not pass untrusted paths.
  # The block form closes the handle once the content has been read.
  content = open(path, &:read)
  rows =
    ::CSV
    .parse(content, **opts)
    .tap { |parsed| yield parsed if block_given? }
    .to_a
  headers = ArrayHelper.recode_repeated(rows.shift)
  columns = rows.transpose
  # With no data rows the transposed array is empty; give every header an
  # empty column rather than nil.
  headers.each_with_index.map { |header, i| [header, columns[i] || []] }.to_h
end
|
249
|
+
|
250
|
+
# Extracts data rows, column order and (optional) row index from one HTML
# table.
#
# @param table [#search] table node, as consumed by html_scrape_tag
# @return [Hash, nil] {data:, index:, order:} or nil when the table cannot be
#   used: it has no rows, no usable header row, or an index whose length
#   differs from the number of data rows
def html_parse_table(table)
  headers, headers_size = html_scrape_tag(table, 'th')
  data, size = html_scrape_tag(table, 'td')
  # A table without any row yields nil sizes; comparing them below would
  # raise NoMethodError, and there is nothing to parse anyway.
  return if headers_size.nil? || size.nil?

  # Keep only the rows that have the full number of <td> cells.
  data = data.select { |row| row.count == size }
  order, indice = html_parse_hash(headers, size, headers_size) if headers_size >= size
  return if order.nil? || order.empty?
  return unless indice.nil? || indice.count == data.count

  {data: data, index: indice, order: order}
end
|
258
|
+
|
259
|
+
# Collects the stripped text of every +tag+ element, row by row.
#
# @return [Array] two elements: the per-row Arrays of cell texts, and the
#   largest number of cells found in a single row (nil when there is no row)
def html_scrape_tag(table, tag)
  rows = table.search('tr').map do |row|
    row.search(tag).map { |cell| cell.text.strip }
  end
  [rows, rows.map(&:count).max]
end
|
264
|
+
|
265
|
+
# Splits headers (all th tags) into order and index. Wherein,
|
266
|
+
# Order : All <th> tags on first proper row of HTML table
|
267
|
+
# index : All <th> tags on first proper column of HTML table
|
268
|
+
# Derives the column order and the row index from the scraped header cells.
# The first header row holding +headers_size+ cells supplies the order (its
# last +size+ cells); the cells of all later header rows, flattened, form the
# index, which is nil when there are none.
#
# @return [Array] two elements: order and index
def html_parse_hash(headers, size, headers_size)
  widest_at = headers.find_index { |row| row.count == headers_size }
  header_row = headers[widest_at]
  order = header_row[(header_row.count - size)..-1]
  indice = headers[(widest_at + 1)..-1].flatten
  [order, indice.empty? ? nil : indice]
end
|
277
|
+
|
278
|
+
# Tells whether a scraped table passes the user's :match filter.
# Without a filter every table passes; otherwise the table's string form
# must include +match+.
def html_search(table, match=nil)
  return true if match.nil?

  table.to_s.include?(match)
end
|
281
|
+
|
282
|
+
# Allows user to override the scraped order / index / data
|
283
|
+
# Combines scraped table values with user overrides.
# For :data, :index, :name and :order the user's value wins when it is
# truthy, otherwise the scraped one is used. All other user keys are kept.
#
# The caller's hash is NOT modified: writing the defaults into it would make
# the scraped values of one table look like user overrides for the next.
#
# @param scraped_val [Hash] values scraped from one table
# @param user_val [Hash] user supplied options
# @return [Hash] a new Hash holding the decided values
def html_decide_values(scraped_val={}, user_val={})
  %I[data index name order].each_with_object(user_val.dup) do |key, decided|
    decided[key] ||= scraped_val[key]
  end
end
|
289
|
+
|
290
|
+
# Builds a DataFrame from a decided table Hash: table[:data] supplies the
# rows, while :index, :order and :name are forwarded as options to
# Daru::DataFrame.rows.
def html_table_to_dataframe(table)
  frame_opts = {index: table[:index], order: table[:order], name: table[:name]}
  Daru::DataFrame.rows(table[:data], **frame_opts)
end
|
224
296
|
end
|
225
297
|
end
|
226
298
|
end
|