daru_lite 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
@@ -0,0 +1,374 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class MultiIndex < Index # rubocop:disable Metrics/ClassLength
|
3
|
+
# Iterates over every index tuple, in positional order.
#
# @yield [tuple] each tuple produced by #to_a
# @return whatever Array#each returns for the materialised tuples
def each(&block)
  tuples = to_a
  tuples.each(&block)
end
|
6
|
+
|
7
|
+
# Maps the given block over every index tuple.
#
# @yield [tuple] each tuple produced by #to_a
# @return [Array] the collected block results
def map(&block)
  tuples = to_a
  tuples.map(&block)
end
|
10
|
+
|
11
|
+
attr_reader :labels, :name
|
12
|
+
|
13
|
+
# Level values per depth, rebuilt from the internal value => position hashes.
#
# @return [Array<Array>] one array of level values per depth
def levels
  @levels.map { |lookup| lookup.keys }
end
|
16
|
+
|
17
|
+
# names and levels should be of same size. If size of Array `name` is less
|
18
|
+
# or greater than size of array `levels` then it raises `SizeError`.
|
19
|
+
# If the user doesn't want to name a particular level, then they must put an
|
20
|
+
# empty string in that index of Array `name`.
|
21
|
+
# For example, if there is a multi_index of 3 levels and the user doesn't want to name
|
22
|
+
# level 0, then do multi_index.name = ['', 'level1_name1', 'level2_name']
|
23
|
+
#
|
24
|
+
# @example
|
25
|
+
#
|
26
|
+
# # set the name during initialization
|
27
|
+
#
|
28
|
+
# mi = DaruLite::MultiIndex.new(
|
29
|
+
# levels: [[:a,:b,:c], [:one, :two]],
|
30
|
+
# labels: [[0,0,1,1,2,2], [0,1,0,1,0,1]], name: ['s1', 's2'])
|
31
|
+
#
|
32
|
+
# # =>
|
33
|
+
# # <DaruLite::MultiIndex(6x2)>
|
34
|
+
# # s1 s2
|
35
|
+
# # a one
|
36
|
+
# # two
|
37
|
+
# # b one
|
38
|
+
# # two
|
39
|
+
# # c one
|
40
|
+
# # two
|
41
|
+
#
|
42
|
+
# # set new name
|
43
|
+
#
|
44
|
+
# mi.name = ['k1', 'k2']
|
45
|
+
# => ["k1", "k2"]
|
46
|
+
#
|
47
|
+
# mi
|
48
|
+
# =>
|
49
|
+
# # #<DaruLite::MultiIndex(6x2)>
|
50
|
+
# # k1 k2
|
51
|
+
# # a one
|
52
|
+
# # two
|
53
|
+
# # b one
|
54
|
+
# # two
|
55
|
+
# # c one
|
56
|
+
# # two
|
57
|
+
#
|
58
|
+
# # access the name
|
59
|
+
#
|
60
|
+
# mi.name
|
61
|
+
# => ["k1", "k2"]
|
62
|
+
#
|
63
|
+
# # If you don't want to name level 0
|
64
|
+
#
|
65
|
+
# mi.name = ['', 'k2']
|
66
|
+
# => ["", "k2"]
|
67
|
+
#
|
68
|
+
# mi
|
69
|
+
# =>
|
70
|
+
# #<DaruLite::MultiIndex(6x2)>
|
71
|
+
# # k2
|
72
|
+
# # a one
|
73
|
+
# # two
|
74
|
+
# # b one
|
75
|
+
# # two
|
76
|
+
# # c one
|
77
|
+
# # two
|
78
|
+
#
|
79
|
+
# Builds a MultiIndex from parallel `labels` and `levels` arrays.
#
# @param opts [Hash] :labels and :levels are mandatory, :name is optional
# @raise [ArgumentError] when labels/levels are missing, differ in size,
#   or a level contains duplicate values
def initialize(opts = {})
  labels, levels = opts.values_at(:labels, :levels)

  raise ArgumentError, 'Must specify both labels and levels' unless labels && levels
  raise ArgumentError, 'Labels and levels should be same size' if labels.size != levels.size
  raise ArgumentError, 'Incorrect labels and levels' if incorrect_fields?(labels, levels)

  @labels = labels
  # Each level is stored as a value => position hash for fast lookup.
  @levels = levels.map { |level| level.each_with_index.to_h }

  given_name = opts[:name]
  self.name = given_name unless given_name.nil?
end
|
91
|
+
|
92
|
+
# Sets the per-level names after checking them against the stored labels.
#
# @param names [Array] one name per level
# @return [Array] the names just assigned
def name=(names)
  validate_name(names, @labels)
  @name = names
end
|
96
|
+
|
97
|
+
# Reports whether any level contains a repeated value.
#
# @param _labels [Array] unused
# @param levels [Array<Array>] level values per depth
# @return [Boolean] true when at least one level has duplicates
def incorrect_fields?(_labels, levels)
  levels[0].size # FIXME: without this exact call some specs are failing

  !levels.all? { |level| level.uniq.size == level.size }
end
|
102
|
+
|
103
|
+
private :incorrect_fields?
|
104
|
+
|
105
|
+
# Builds a MultiIndex from one array of values per level.
#
# Levels are the unique values of each array ordered by their string form;
# labels are the positions of every value inside its level.
#
# @param arrays [Array<Array>] one array per level, all of equal length
# @return [MultiIndex]
def self.from_arrays(arrays)
  levels = arrays.map { |values| values.uniq.sort_by(&:to_s) }

  labels = arrays.zip(levels).map do |values, level|
    # A hash lookup replaces a linear `Array#index` scan per element. It also
    # matches the eql?-based hashes the index builds from `levels`.
    positions = level.each_with_index.to_h
    values.map { |value| positions[value] }
  end

  MultiIndex.new labels: labels, levels: levels
end
|
115
|
+
|
116
|
+
# Builds a MultiIndex from row-wise tuples by transposing them into
# one array per level.
#
# @param tuples [Array<Array>] one tuple per index entry
def self.from_tuples(tuples)
  per_level = tuples.transpose
  from_arrays(per_level)
end
|
119
|
+
|
120
|
+
# Builds a MultiIndex only when the argument looks like a list of tuples,
# i.e. it responds to #first and its first element is an Array.
#
# @return [MultiIndex, nil] nil when the argument is not tuple-like
def self.try_from_tuples(tuples)
  return unless tuples.respond_to?(:first)
  return unless tuples.first.is_a?(Array)

  from_tuples(tuples)
end
|
123
|
+
|
124
|
+
# Looks up a position or sub-index for the given key.
#
# A leading Range selects by position, a single Integer is tried as a
# level-0 value first, and anything else is treated as a (partial) tuple.
#
# @raise [IndexError] when tuple retrieval fails with NoMethodError
def [](*key)
  key.flatten!
  head = key[0]

  return retrieve_from_range(head) if head.is_a?(Range)
  return try_retrieve_from_integer(head) if head.is_a?(Integer) && key.size == 1

  begin
    retrieve_from_tuples key
  rescue NoMethodError
    raise IndexError, "Specified index #{key.inspect} do not exist"
  end
end
|
138
|
+
|
139
|
+
# Tells whether the given indexes resolve to a position.
#
# @return [Boolean] false when #pos raises IndexError, true otherwise
def valid?(*indexes)
  # FIXME: This is perhaps not a good method
  begin
    pos(*indexes)
  rescue IndexError
    return false
  end
  true
end
|
146
|
+
|
147
|
+
# Returns positions given indexes or positions
|
148
|
+
# @note If the argument is both a valid index and a valid position,
|
149
|
+
# it will be treated as a valid index
|
150
|
+
# @param indexes [Array<object>] indexes or positions
|
151
|
+
# @example
|
152
|
+
# idx = DaruLite::MultiIndex.from_tuples [[:a, :one], [:a, :two], [:b, :one], [:b, :two]]
|
153
|
+
# idx.pos :a
|
154
|
+
# # => [0, 1]
|
155
|
+
def pos(*indexes)
  head = indexes.first
  # Integer arguments are returned untouched.
  return(indexes.size == 1 ? head : indexes) if head.is_a?(Integer)

  found = self[indexes]
  # A full tuple resolves to one position; a partial one yields a sub-index
  # whose tuples are resolved one by one.
  found.is_a?(Integer) ? found : found.map { |tuple| self[tuple] }
end
|
166
|
+
|
167
|
+
# Returns the sub-index for the given positions or index values.
#
# Integer arguments are read as positions; anything else is looked up and
# the result is conformed to the supplied indexes.
def subset(*indexes)
  return self[indexes].conform(indexes) unless indexes.first.is_a?(Integer)

  tuples = indexes.map { |position| key(position) }
  MultiIndex.from_tuples(tuples)
end
|
174
|
+
|
175
|
+
# Takes positional values and returns subset of the self
|
176
|
+
# capturing the indexes at mentioned positions
|
177
|
+
# @param positions [Array<Integer>] positional values
|
178
|
+
# @return [object] index object
|
179
|
+
# @example
|
180
|
+
# idx = DaruLite::MultiIndex.from_tuples [[:a, :one], [:a, :two], [:b, :one], [:b, :two]]
|
181
|
+
# idx.at 0, 1
|
182
|
+
# # => #<DaruLite::MultiIndex(2x2)>
|
183
|
+
# # a one
|
184
|
+
# # two
|
185
|
+
def at(*positions)
  positions = preprocess_positions(*positions)
  validate_positions(*positions)

  # A single position yields its tuple; several yield a new MultiIndex.
  return key(positions) if positions.is_a?(Integer)

  tuples = positions.map { |position| key(position) }
  DaruLite::MultiIndex.from_tuples(tuples)
end
|
194
|
+
|
195
|
+
# Returns a new MultiIndex with the given tuple appended.
#
# @param indexes [Array] the values of the tuple to append, one per level
def add(*indexes)
  extended = [*to_a, indexes]
  DaruLite::MultiIndex.from_tuples(extended)
end
|
198
|
+
|
199
|
+
# Returns a new MultiIndex whose tuples follow the given positional order.
#
# @param new_order [Enumerable<Integer>] positions in the desired order
def reorder(new_order)
  current = to_a
  rearranged = new_order.map { |position| current[position] }
  MultiIndex.from_tuples(rearranged)
end
|
203
|
+
|
204
|
+
# Treats the integer as a level-0 value when such a value exists,
# otherwise returns it unchanged (i.e. as a position).
def try_retrieve_from_integer(int)
  if @levels[0].key?(int)
    retrieve_from_tuples([int])
  else
    int
  end
end
|
207
|
+
|
208
|
+
# Builds a MultiIndex from the tuples located at each position of the range.
def retrieve_from_range(range)
  tuples = range.map { |position| key(position) }
  MultiIndex.from_tuples(tuples)
end
|
211
|
+
|
212
|
+
# Resolves a (possibly partial) tuple to a position or a sub-index.
#
# @param key [Array] level values, outermost level first
# @return [Integer] the position, when the full tuple matches exactly one row
# @return [MultiIndex] the matching rows otherwise
# @raise [IndexError] when a value is missing from its level, or when no
#   row matches the tuple prefix seen so far
def retrieve_from_tuples(key)
  chosen = []

  key.each_with_index do |k, depth|
    level_index = @levels[depth][k]
    raise IndexError, "Specified index #{key.inspect} do not exist" if level_index.nil?

    chosen = find_all_indexes(@labels[depth], level_index, chosen)

    # find_all_indexes treats an empty selection as "first level, start from
    # scratch". Carrying an empty selection to the next depth would therefore
    # restart the search and return rows that do not match the earlier levels.
    raise IndexError, "Specified index #{key.inspect} do not exist" if chosen.empty?
  end

  return chosen[0] if chosen.size == 1 && key.size == @levels.size

  multi_index_from_multiple_selections(chosen)
end
|
227
|
+
|
228
|
+
# Builds a MultiIndex from the tuples found at the chosen positions.
def multi_index_from_multiple_selections(chosen)
  tuples = chosen.map { |position| key(position) }
  MultiIndex.from_tuples(tuples)
end
|
231
|
+
|
232
|
+
# Narrows the candidate positions to those whose label equals level_index.
#
# With no candidates yet, every matching position of `label` is returned;
# otherwise `chosen` is filtered in place and returned.
def find_all_indexes(label, level_index, chosen)
  return chosen.keep_if { |position| label[position] == level_index } unless chosen.empty?

  label.each_index.select { |position| label[position] == level_index }
end
|
240
|
+
|
241
|
+
# Drops one level (and its labels and name, if names are set) in place,
# then returns the result of #coerce_index.
#
# @param layer_index [Integer] position of the level to remove
def remove_layer(layer_index)
  [@levels, @labels].each { |layers| layers.delete_at(layer_index) }
  @name&.delete_at(layer_index)

  coerce_index
end
|
248
|
+
|
249
|
+
# Collapses a single-level MultiIndex into a plain index.
#
# @return [self] when more than one level remains
# @return [DaruLite::Index] when the single level's values are unique
# @return [DaruLite::CategoricalIndex] when they contain repeats
def coerce_index
  return self unless @levels.size == 1

  elements = to_a.flatten
  unique = elements.uniq.length == elements.length
  (unique ? DaruLite::Index : DaruLite::CategoricalIndex).new(elements)
end
|
262
|
+
|
263
|
+
# Array `name` must have same length as levels and labels.
|
264
|
+
# Ensures that `names` holds exactly one entry per element of `levels`.
#
# @raise [SizeError] when there are too many or too few names; the
#   "too few" message also explains how to leave a level unnamed
def validate_name(names, levels)
  details = "'names' and 'levels' should be of same size. Size of the " \
            "'name' array is #{names.size} and size of the MultiIndex 'levels' and " \
            "'labels' is #{labels.size}."

  if names.size > levels.size
    raise SizeError, details
  elsif names.size < levels.size
    hint = 'If you do not want to set name for particular level ' \
           "(say level 'i') then put empty string on index 'i' of the 'name' Array."
    raise SizeError, "#{details}\n#{hint}"
  end
end
|
274
|
+
|
275
|
+
private :find_all_indexes, :multi_index_from_multiple_selections,
|
276
|
+
:retrieve_from_range, :retrieve_from_tuples, :validate_name
|
277
|
+
|
278
|
+
# Returns the tuple stored at the given position.
#
# @param index [Integer] position within the index
# @return [Array] one level value per depth
# @raise [ArgumentError] when the position is past the last entry
def key(index)
  raise ArgumentError, "Key #{index} is too large" if index >= @labels[0].size

  @labels.zip(@levels).map do |label, level|
    # label[index] is the position of the value inside its level
    level.keys[label[index]]
  end
end
|
285
|
+
|
286
|
+
# Returns a new MultiIndex built from shallow copies of the levels,
# labels and (when set) names.
def dup
  copy = { levels: levels.dup, labels: labels.dup, name: @name&.dup }
  MultiIndex.new(copy)
end
|
289
|
+
|
290
|
+
# Returns a new MultiIndex without the first `by` levels.
#
# @param by [Integer] number of leading levels to drop
def drop_left_level(by = 1)
  remaining = to_a.transpose[by..]
  MultiIndex.from_arrays(remaining)
end
|
293
|
+
|
294
|
+
# Union: a new MultiIndex with the tuples of both indexes, without repeats.
def |(other)
  merged = to_a | other.to_a
  MultiIndex.from_tuples(merged)
end
|
297
|
+
|
298
|
+
# Intersection: a new MultiIndex with the tuples present in both indexes.
def &(other)
  shared = to_a & other.to_a
  MultiIndex.from_tuples(shared)
end
|
301
|
+
|
302
|
+
# True when there are no labels at all and every level is empty.
def empty?
  no_labels = @labels.flatten.empty?
  no_labels && @levels.all? { |level| level.empty? }
end
|
305
|
+
|
306
|
+
# Tells whether the (possibly partial) tuple occurs in the index.
#
# @param tuple [Enumerable] level values, outermost first
# @return [Boolean] always false for a non-Enumerable argument
def include?(tuple)
  return false unless tuple.is_a? Enumerable

  parts = tuple.flatten
  # Label rows restricted to as many levels as the tuple provides.
  rows = @labels[0...parts.size].transpose
  rows.include?(parts.each_with_index.map { |part, depth| @levels[depth][part] })
end
|
313
|
+
|
314
|
+
# Number of entries, taken from the first level's label array.
def size
  @labels.first.size
end
|
317
|
+
|
318
|
+
# Number of levels in the index.
def width
  @levels.length
end
|
321
|
+
|
322
|
+
# Two indexes are equal when they share class, labels and levels.
# Names are not compared.
def ==(other)
  return false unless self.class == other.class

  labels == other.labels && levels == other.levels
end
|
327
|
+
|
328
|
+
# Materialises every tuple of the index, in positional order.
#
# @return [Array<Array>]
def to_a
  Array.new(size) { |position| key(position) }
end
|
331
|
+
|
332
|
+
# Positions of all entries: 0 up to size - 1.
#
# @return [Array<Integer>]
def values
  (0...size).to_a
end
|
335
|
+
|
336
|
+
# Human-readable dump: a header line with the dimensions followed by the
# sparse tuples rendered by Formatters::Table.
#
# @param threshold [Integer] passed through to the table formatter
def inspect(threshold = 20)
  header = "#<DaruLite::MultiIndex(#{size}x#{width})>\n"
  body = Formatters::Table.format([], headers: @name, row_headers: sparse_tuples, threshold: threshold)
  header + body
end
|
340
|
+
|
341
|
+
# Renders the index with the multi_index ERB template, evaluated in the
# context of this object.
def to_html
  path = File.expand_path('../iruby/templates/multi_index.html.erb', __dir__)
  source = File.read(path).strip
  ERB.new(source).result(binding)
end
|
345
|
+
|
346
|
+
# Provide a MultiIndex for sub vector produced
|
347
|
+
#
|
348
|
+
# @param input_indexes [Array] the input by user to index the vector
|
349
|
+
# @return [Object] the MultiIndex object for sub vector produced
|
350
|
+
def conform(input_indexes)
  if input_indexes[0].is_a?(Range)
    # Range selections keep every level.
    self
  else
    # One leading level is dropped per index value supplied.
    drop_left_level(input_indexes.size)
  end
end
|
355
|
+
|
356
|
+
# Return tuples with nils in place of repeating values, like this:
|
357
|
+
#
|
358
|
+
# [:a , :bar, :one]
|
359
|
+
# [nil, nil , :two]
|
360
|
+
# [nil, :foo, :one]
|
361
|
+
#
|
362
|
+
def sparse_tuples
  tuples = to_a

  # Iterate the tuples already materialised above. Calling each_cons on self
  # would go through #each and rebuild every tuple a second time.
  rest = tuples.each_cons(2).map do |prev, cur|
    # Keep values from the first position where cur differs from prev.
    left = cur.zip(prev).drop_while { |c, p| c == p }
    Array.new(cur.size - left.size) + left.map(&:first)
  end

  [tuples.first] + rest
end
|
369
|
+
|
370
|
+
# Converts the index into a DataFrame with one column per level,
# keyed by the level names.
def to_df
  columns = to_a.transpose
  DaruLite::DataFrame.new(@name.zip(columns).to_h)
end
|
373
|
+
end
|
374
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module DaruLite
|
2
|
+
module IO
|
3
|
+
module CSV
|
4
|
+
# Extra field converters selectable by name when reading CSV files.
# Each one takes the field and a second, ignored argument.
CONVERTERS = {
  # 'true'/'false' (any case, surrounding whitespace ignored) become
  # booleans; every other field is returned untouched.
  boolean: lambda { |field, _info|
    { 'true' => true, 'false' => false }.fetch(field.downcase.strip, field)
  },
  # Identity converter: the field is returned as is.
  string: ->(field, _info) { field }
}.freeze
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,294 @@
|
|
1
|
+
module DaruLite
|
2
|
+
require_relative 'csv/converters'
|
3
|
+
module IOHelpers
|
4
|
+
class << self
|
5
|
+
# Converts the cells of a row: cells listed in `empty` become nil, all
# others go through try_string_to_number.
#
# @param row [#to_a] raw cells
# @param empty [Array] values to be treated as missing
# @return [Array]
def process_row(row, empty)
  row.to_a.map do |cell|
    # FIXME: As far as I can guess, it will never work.
    # It is called only inside `from_plaintext`, and there
    # data is split by `\s+` -- there is no chance that
    # "empty" (currently just '') will be between data?..
    next nil if empty.include?(cell)

    try_string_to_number(cell)
  end
end
|
18
|
+
|
19
|
+
# Opens an HTTP(S) URL through the URI object, or a local file otherwise.
#
# @param path [String] URL or file path
# @return an open IO-like object; the caller is responsible for closing it
def open_local_or_remote_file(path)
  uri = URI.parse(path)

  case uri
  when URI::HTTP, URI::HTTPS then uri.open
  else File.open(uri.path)
  end
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
# Anchored with \A / \z so the WHOLE string must be numeric; ^ / $ match
# per line and would accept e.g. "12\nabc".
INT_PATTERN = /\A[-+]?\d+\z/.freeze
FLOAT_PATTERN = /\A[-+]?\d+[,.]?\d*(e-?\d+)?\z/.freeze

# Converts a numeric-looking string to Integer or Float.
#
# A decimal comma is accepted and read as a decimal point.
#
# @param s [Object] candidate value
# @return [Integer, Float] when `s` is entirely an integer or float literal
# @return [Object] `s` itself otherwise
def try_string_to_number(s)
  case s
  when INT_PATTERN
    s.to_i
  when FLOAT_PATTERN
    s.tr(',', '.').to_f
  else
    s
  end
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
module IO
|
43
|
+
class << self
|
44
|
+
# Functions for loading/writing Excel files.
|
45
|
+
|
46
|
+
# Loads a worksheet into a DataFrame, one vector per header.
#
# @param path [String] location of the Excel file
# @param opts [Hash] :worksheet_id and :row_id, both defaulting to 0
# @return [DaruLite::DataFrame]
def from_excel(path, opts = {})
  defaults = { worksheet_id: 0, row_id: 0 }
  worksheet, headers = read_from_excel(path, defaults.merge(opts))

  headers.each_with_index.with_object(DaruLite::DataFrame.new({})) do |(header, position), frame|
    column = worksheet.column(position).to_a
    column.delete_at 0 # drop the first cell of the column
    frame[header] = column
  end
end
|
62
|
+
|
63
|
+
# Opens the workbook and picks the requested worksheet and header row.
#
# @param path [String] location of the Excel file
# @param opts [Hash] must provide :worksheet_id and :row_id
# @return [Array] the worksheet and its de-duplicated, symbolised headers
def read_from_excel(path, opts)
  optional_gem 'spreadsheet', '~>1.3.0'

  sheet_id, header_row = opts.values_at(:worksheet_id, :row_id)
  worksheet = Spreadsheet.open(path).worksheet(sheet_id)
  raw_headers = worksheet.row(header_row)

  [worksheet, ArrayHelper.recode_repeated(raw_headers).map(&:to_sym)]
end
|
74
|
+
|
75
|
+
# Writes the dataframe to an Excel file: a bold blue header row with the
# vector names, followed by one sheet row per dataframe row.
#
# @param dataframe [DaruLite::DataFrame] data to write
# @param path [String] destination file
# @param _opts [Hash] currently unused
def dataframe_write_excel(dataframe, path, _opts = {})
  book = Spreadsheet::Workbook.new
  sheet = book.create_worksheet
  header_format = Spreadsheet::Format.new color: :blue, weight: :bold

  column_names = dataframe.vectors.to_a.map(&:to_s) # Unfreeze strings
  sheet.row(0).concat(column_names)
  sheet.row(0).default_format = header_format

  next_row = 1
  dataframe.each_row do |row|
    sheet.row(next_row).concat(row.to_a)
    next_row += 1
  end

  book.write(path)
end
|
90
|
+
|
91
|
+
# Functions for loading/writing CSV files
|
92
|
+
# Reads a CSV file (or URL) into a DataFrame.
#
# @param path [String] file path or URL
# @param opts [Hash] DataFrame options (see DARU_OPT_KEYS) mixed with CSV options
# @return [DaruLite::DataFrame]
def from_csv(path, opts = {})
  daru_options, csv_options = from_csv_prepare_opts(opts)

  columns =
    if csv_options[:headers]
      from_csv_hash_with_headers(path, csv_options)
    else
      # Without :headers, repeated header names are detected and corrected,
      # and the resulting header order becomes the vector order.
      parsed = from_csv_hash(path, csv_options)
      daru_options[:order] = parsed.keys
      parsed
    end

  DaruLite::DataFrame.new(columns, daru_options)
end
|
105
|
+
|
106
|
+
# Writes the dataframe to a CSV file.
#
# @param dataframe [DaruLite::DataFrame] data to write
# @param path [String] destination file
# @param opts [Hash] options for CSV.open, plus:
#   * :convert_comma - when truthy, every value is written as a string with
#     '.' replaced by ','
#   * :headers - pass false to omit the header row
# @return [nil]
def dataframe_write_csv(dataframe, path, opts = {})
  options = {
    converters: :numeric
  }.merge(opts)
  # :convert_comma is our own flag. It must not reach CSV.open, which
  # rejects options it does not know.
  convert_comma = options.delete(:convert_comma)

  # Block form guarantees the file is closed even if a row fails to write.
  ::CSV.open(path, 'w', **options) do |writer|
    writer << dataframe.vectors.to_a unless options[:headers] == false

    dataframe.each_row do |row|
      writer << if convert_comma
                  row.map { |v| v.to_s.tr('.', ',') }
                else
                  row.to_a
                end
    end
  end

  nil
end
|
124
|
+
|
125
|
+
# Execute a query and create a data frame from the result
|
126
|
+
#
|
127
|
+
# @param db [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
|
128
|
+
# @param query [String] The query to be executed
|
129
|
+
#
|
130
|
+
# @return A dataframe containing the data resulting from the query
|
131
|
+
def from_sql(db, query)
  # Loaded lazily so the SQL data source is only required when used.
  require 'daru_lite/io/sql_data_source'

  SqlDataSource.make_dataframe(db, query)
end
|
135
|
+
|
136
|
+
# Inserts every row of the dataframe into a database table.
#
# NOTE(review): the table and column names are interpolated into the SQL
# text; only the values are bound as parameters. Do not pass untrusted names.
#
# @param ds [DaruLite::DataFrame] data to insert
# @param dbh [#prepare] database handle (presumably a DBI handle, given the require below)
# @param table [String] destination table name
# @return [true]
def dataframe_write_sql(ds, dbh, table)
  require 'dbi'
  query = "INSERT INTO #{table} (#{ds.vectors.to_a.join(',')}) VALUES (#{(['?'] * ds.vectors.size).join(',')})"
  sth = dbh.prepare(query)
  begin
    ds.each_row { |c| sth.execute(*c.to_a) }
  ensure
    # Release the prepared statement; guarded for handles without #finish.
    sth.finish if sth.respond_to?(:finish)
  end
  true
end
|
143
|
+
|
144
|
+
# Load dataframe from AR::Relation
|
145
|
+
#
|
146
|
+
# @param relation [ActiveRecord::Relation] A relation to be used to load the contents of dataframe
|
147
|
+
#
|
148
|
+
# @return A dataframe containing the data in the given relation
|
149
|
+
def from_activerecord(relation, *fields)
  # Default to every column of the relation's model.
  columns = (fields.empty? ? relation.klass.column_names : fields).map(&:to_sym)

  # pluck returns rows; the DataFrame is built from columns.
  values = relation.pluck(*columns).transpose
  frame = DaruLite::DataFrame.new(values, order: columns)
  frame.update
  frame
end
|
156
|
+
|
157
|
+
# Loading data from plain text files
|
158
|
+
|
159
|
+
# Loads a whitespace-separated plain text file into a DataFrame.
#
# @param filename [String] path of the file to read
# @param fields [Array] vector names, in column order
# @return [DaruLite::DataFrame]
def from_plaintext(filename, fields)
  ds = DaruLite::DataFrame.new({}, order: fields)

  # Block form so the file handle is always closed.
  File.open(filename, 'r') do |fp|
    fp.each_line do |line|
      row = DaruLite::IOHelpers.process_row(line.strip.split(/\s+/), [''])
      next if row == ["\x1A"] # skip a line holding only the 0x1A byte

      ds.add_row(row)
    end
  end

  ds.update
  fields.each { |f| ds[f].rename f }
  ds
end
|
172
|
+
|
173
|
+
# Loading and writing Marshalled DataFrame/Vector
|
174
|
+
# Marshals an object into the given file.
#
# @param klass [Object] object to serialise (e.g. a DataFrame or Vector)
# @param filename [String] destination path
# @return [nil]
def save(klass, filename)
  # Binary mode: Marshal output is raw bytes. The block form closes the
  # file even when dumping raises.
  File.open(filename, 'wb') { |fp| Marshal.dump(klass, fp) }
  nil
end

# Loads an object previously written by #save.
#
# NOTE: Marshal.load can instantiate arbitrary objects; only load files
# from trusted sources.
#
# @param filename [String] path of the marshalled file
# @return [Object] the loaded object, or false when the file does not exist
def load(filename)
  return false unless File.exist?(filename)

  File.open(filename, 'rb') { |fp| Marshal.load(fp) }
end
|
189
|
+
|
190
|
+
private
|
191
|
+
|
192
|
+
# Activates and requires an optional dependency; when it cannot be
# loaded, the failure is reported through DaruLite.error.
#
# @param name [String] gem (and require) name
# @param version [String] version requirement
def optional_gem(name, version)
  begin
    gem(name, version)
    require(name)
  rescue LoadError
    DaruLite.error "\nInstall the #{name} gem version #{version} for using #{name} functions."
  end
end
|
198
|
+
|
199
|
+
# Option keys meant for DaruLite::DataFrame.new rather than for the CSV parser.
DARU_OPT_KEYS = %i[clone order index name].freeze

# Applies CSV defaults and splits the options into DataFrame and CSV parts.
#
# The caller's hash is left untouched.
#
# @param opts [Hash] mixed DataFrame / CSV options
# @return [Array<Hash>] the DataFrame options and the CSV options, in that order
def from_csv_prepare_opts(opts)
  opts = opts.dup # do not mutate the caller's hash

  opts[:col_sep] ||= ','
  # `||= true` would override an explicit `skip_blanks: false`.
  opts[:skip_blanks] = true if opts[:skip_blanks].nil?
  opts[:converters] ||= [:numeric]

  opts[:converters] = from_csv_prepare_converters(opts[:converters])

  daru_options, csv_options =
    opts.partition { |key, _| DARU_OPT_KEYS.include?(key) }.map(&:to_h)
  [daru_options, csv_options]
end
|
213
|
+
|
214
|
+
# Resolves converter names: built-in CSV converters first, then the
# DaruLite ones; anything unknown is passed through unchanged.
#
# @param converters [Object, Array] one converter or a list of them
# @return [Array] resolved converters, flattened by one level
def from_csv_prepare_converters(converters)
  Array(converters).flat_map do |converter|
    ::CSV::Converters[converter] || DaruLite::IO::CSV::CONVERTERS[converter] || converter
  end
end
|
225
|
+
|
226
|
+
# Parses a CSV source that has a header row into a header => values hash.
#
# @param path [String] file path or URL
# @param opts [Hash] CSV options; :header_converters defaults to :symbol
# @yield [table] the parsed CSV table, when a block is given
# @return [Hash] column name => column values
def from_csv_hash_with_headers(path, opts)
  opts[:header_converters] ||= :symbol
  source = DaruLite::IOHelpers.open_local_or_remote_file(path)

  begin
    ::CSV
      .parse(source, **opts)
      .tap { |c| yield c if block_given? }
      .by_col.to_h { |col_name, values| [col_name, values] }
  ensure
    # The table is fully materialised by now; release the handle.
    source.close if source.respond_to?(:close)
  end
end
|
233
|
+
|
234
|
+
# Parses a CSV source without the :headers option; the first row supplies
# the headers, which are de-duplicated by ArrayHelper.recode_repeated.
#
# @param path [String] file path or URL
# @param opts [Hash] CSV options
# @yield [rows] the parsed rows, when a block is given
# @return [Hash] header => column values
def from_csv_hash(path, opts)
  source = DaruLite::IOHelpers.open_local_or_remote_file(path)

  csv_as_arrays =
    begin
      ::CSV
        .parse(source, **opts)
        .tap { |c| yield c if block_given? }
        .to_a
    ensure
      # Everything has been read; release the handle.
      source.close if source.respond_to?(:close)
    end

  headers = ArrayHelper.recode_repeated(csv_as_arrays.shift)
  csv_as_arrays = csv_as_arrays.transpose
  headers.each_with_index.to_h { |h, i| [h, csv_as_arrays[i]] }
end
|
244
|
+
|
245
|
+
# Extracts data rows, order (column headers) and index (row headers) from
# an HTML table node.
#
# @return [Hash, nil] :data, :index and :order, or nil when no usable
#   order is found or the index does not line up with the data rows
def html_parse_table(table)
  headers, headers_size = html_scrape_tag(table, 'th')
  cells, size = html_scrape_tag(table, 'td')
  # Only rows with the maximum number of <td> cells are kept.
  rows = cells.keep_if { |row| row.count == size }

  order = indice = nil
  order, indice = html_parse_hash(headers, size, headers_size) if headers_size >= size

  return if order.nil? || !order.count.positive?
  return unless indice.nil? || indice.count == rows.count

  { data: rows.compact, index: indice, order: order }
end
|
254
|
+
|
255
|
+
# Collects the stripped text of every `tag` cell, row by row.
#
# @param table [#search] HTML table node
# @param tag [String] cell tag to collect ('th' or 'td')
# @return [Array] the per-row texts and the largest row width
def html_scrape_tag(table, tag)
  rows = table.search('tr').map do |row|
    row.search(tag).map { |cell| cell.text.strip }
  end

  [rows, rows.map(&:count).max]
end
|
260
|
+
|
261
|
+
# Splits headers (all th tags) into order and index. Wherein,
|
262
|
+
# Order : All <th> tags on first proper row of HTML table
|
263
|
+
# index : All <th> tags on first proper column of HTML table
|
264
|
+
def html_parse_hash(headers, size, headers_size)
  # The first row holding the maximum number of <th> cells is the header row.
  header_row_at = headers.find_index { |row| row.count == headers_size }
  header_row = headers[header_row_at]

  # Leading cells beyond the data width sit above the index column.
  order = header_row[(header_row.count - size)..]

  # Every <th> after the header row is a row label.
  indice = headers.drop(header_row_at + 1).flatten
  indice = nil if indice.to_a.empty?

  [order, indice]
end
|
273
|
+
|
274
|
+
# Tells whether the table's string form contains `match`.
# With no `match`, every table qualifies.
def html_search(table, match = nil)
  return true if match.nil?

  table.to_s.include?(match)
end
|
277
|
+
|
278
|
+
# Allows user to override the scraped order / index / data
|
279
|
+
def html_decide_values(scraped_val = {}, user_val = {})
  # User-supplied values win; scraped ones only fill the gaps.
  %i[data index name order].each do |field|
    user_val[field] = scraped_val[field] unless user_val[field]
  end

  user_val
end
|
285
|
+
|
286
|
+
# Builds a DataFrame from a parsed table hash.
#
# @param table [Hash] with :data, :index, :order and :name entries
def html_table_to_dataframe(table)
  frame_opts = {
    index: table[:index],
    order: table[:order],
    name: table[:name]
  }

  DaruLite::DataFrame.rows(table[:data], **frame_opts)
end
|
292
|
+
end
|
293
|
+
end
|
294
|
+
end
|