daru_lite 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
module DaruLite
|
|
2
|
+
class MultiIndex < Index # rubocop:disable Metrics/ClassLength
|
|
3
|
+
def each(&block)
|
|
4
|
+
to_a.each(&block)
|
|
5
|
+
end
|
|
6
|
+
|
|
7
|
+
def map(&block)
|
|
8
|
+
to_a.map(&block)
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
attr_reader :labels, :name
|
|
12
|
+
|
|
13
|
+
# Returns the level values as plain arrays, one array per level, in the
# order they are stored in the internal value => position lookups.
def levels
  @levels.map { |lookup| lookup.keys }
end
|
|
16
|
+
|
|
17
|
+
# names and levels should be of same size. If size of Array `name` is less
|
|
18
|
+
# or greater than size of array `levels` then it raises `SizeError`.
|
|
19
|
+
# If the user doesn't want to name a particular level, then the user must put
|
|
20
|
+
# empty string in that index of Array `name`.
|
|
21
|
+
# For example, if there is a multi_index of 3 levels and the user doesn't want to name
|
|
22
|
+
# level 0, then do multi_index.name = ['', 'level1_name1', 'level2_name']
|
|
23
|
+
#
|
|
24
|
+
# @example
|
|
25
|
+
#
|
|
26
|
+
# # set the name during initialization
|
|
27
|
+
#
|
|
28
|
+
# mi = DaruLite::MultiIndex.new(
|
|
29
|
+
# levels: [[:a,:b,:c], [:one, :two]],
|
|
30
|
+
# labels: [[0,0,1,1,2,2], [0,1,0,1,0,1]], name: ['s1', 's2'])
|
|
31
|
+
#
|
|
32
|
+
# # =>
|
|
33
|
+
# # <DaruLite::MultiIndex(6x2)>
|
|
34
|
+
# # s1 s2
|
|
35
|
+
# # a one
|
|
36
|
+
# # two
|
|
37
|
+
# # b one
|
|
38
|
+
# # two
|
|
39
|
+
# # c one
|
|
40
|
+
# # two
|
|
41
|
+
#
|
|
42
|
+
# # set new name
|
|
43
|
+
#
|
|
44
|
+
# mi.name = ['k1', 'k2']
|
|
45
|
+
# => ["k1", "k2"]
|
|
46
|
+
#
|
|
47
|
+
# mi
|
|
48
|
+
# =>
|
|
49
|
+
# # #<DaruLite::MultiIndex(6x2)>
|
|
50
|
+
# # k1 k2
|
|
51
|
+
# # a one
|
|
52
|
+
# # two
|
|
53
|
+
# # b one
|
|
54
|
+
# # two
|
|
55
|
+
# # c one
|
|
56
|
+
# # two
|
|
57
|
+
#
|
|
58
|
+
# # access the name
|
|
59
|
+
#
|
|
60
|
+
# mi.name
|
|
61
|
+
# => ["k1", "k2"]
|
|
62
|
+
#
|
|
63
|
+
# # If you don't want to name level 0
|
|
64
|
+
#
|
|
65
|
+
# mi.name = ['', 'k2']
|
|
66
|
+
# => ["", "k2"]
|
|
67
|
+
#
|
|
68
|
+
# mi
|
|
69
|
+
# =>
|
|
70
|
+
# #<DaruLite::MultiIndex(6x2)>
|
|
71
|
+
# # k2
|
|
72
|
+
# # a one
|
|
73
|
+
# # two
|
|
74
|
+
# # b one
|
|
75
|
+
# # two
|
|
76
|
+
# # c one
|
|
77
|
+
# # two
|
|
78
|
+
#
|
|
79
|
+
# @param opts [Hash] requires :labels and :levels (arrays of equal size);
#   :name is optional and is validated through #name=.
# @raise [ArgumentError] when labels/levels are missing, differ in size, or
#   a level contains duplicate values
def initialize(opts = {})
  labels, levels = opts.values_at(:labels, :levels)

  raise ArgumentError, 'Must specify both labels and levels' if !labels || !levels
  raise ArgumentError, 'Labels and levels should be same size' unless labels.size == levels.size
  raise ArgumentError, 'Incorrect labels and levels' if incorrect_fields?(labels, levels)

  @labels = labels
  # Each level is stored as a value => position lookup.
  @levels = levels.map { |level| level.each_with_index.to_h }

  given_name = opts[:name]
  self.name = given_name unless given_name.nil?
end
|
|
91
|
+
|
|
92
|
+
def name=(names)
|
|
93
|
+
validate_name names, @labels
|
|
94
|
+
@name = names
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
# Reports whether any level contains duplicate values.
#
# NOTE: the result of the first statement is unused, but the call itself
# matters: when `levels` is empty it raises NoMethodError (nil has no
# #size). `#[]` rescues NoMethodError around tuple retrieval and turns it
# into an IndexError, and an empty selection reaches this method through
# from_tuples -> from_arrays -> new. Do not remove it without reworking
# that path.
def incorrect_fields?(_labels, levels)
  levels.first.size

  levels.any? { |level| level.uniq.size != level.size }
end
|
|
102
|
+
|
|
103
|
+
private :incorrect_fields?
|
|
104
|
+
|
|
105
|
+
# Builds a MultiIndex from one array per level (column-wise input).
#
# Levels are the unique values of each array, ordered by their string
# representation; labels are the positions of every value in its level.
#
# @param arrays [Array<Array>] one array of values per level
# @return [MultiIndex]
def self.from_arrays(arrays)
  levels = arrays.map { |values| values.uniq.sort_by(&:to_s) }

  labels = arrays.each_with_index.map do |values, depth|
    # One hash lookup per value instead of a linear Array#index scan, which
    # made this O(rows * level size) for every level.
    position_of = levels[depth].each_with_index.to_h
    values.map { |value| position_of[value] }
  end

  MultiIndex.new labels: labels, levels: levels
end
|
|
115
|
+
|
|
116
|
+
def self.from_tuples(tuples)
|
|
117
|
+
from_arrays tuples.transpose
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
# Builds a MultiIndex from +tuples+ only when the input looks like a list
# of tuples (responds to #first and its first element is an Array);
# returns nil otherwise.
def self.try_from_tuples(tuples)
  return unless tuples.respond_to?(:first)
  return unless tuples.first.is_a?(Array)

  from_tuples(tuples)
end
|
|
123
|
+
|
|
124
|
+
# Looks up positions/sub-indexes for a key.
#
# * a Range as first element is resolved by position,
# * a single Integer is treated as a level-0 value when one exists,
#   otherwise as a position,
# * anything else is resolved as a (possibly partial) tuple.
#
# @raise [IndexError] when the tuple cannot be resolved
def [](*key)
  key.flatten!
  head = key[0]

  return retrieve_from_range(head) if head.is_a?(Range)
  return try_retrieve_from_integer(head) if key.size == 1 && head.is_a?(Integer)

  begin
    retrieve_from_tuples key
  rescue NoMethodError
    raise IndexError, "Specified index #{key.inspect} do not exist"
  end
end
|
|
138
|
+
|
|
139
|
+
# True when #pos can resolve the given indexes, false when it raises
# IndexError.
# FIXME: This is perhaps not a good method
def valid?(*indexes)
  begin
    pos(*indexes)
  rescue IndexError
    return false
  end
  true
end
|
|
146
|
+
|
|
147
|
+
# Returns positions given indexes or positions
|
|
148
|
+
# @note If the argument is both a valid index and a valid position,
|
|
149
|
+
# it will be treated as a valid index
|
|
150
|
+
# @param indexes [Array<object>] indexes or positions
|
|
151
|
+
# @example
|
|
152
|
+
# idx = DaruLite::MultiIndex.from_tuples [[:a, :one], [:a, :two], [:b, :one], [:b, :two]]
|
|
153
|
+
# idx.pos :a
|
|
154
|
+
# # => [0, 1]
|
|
155
|
+
def pos(*indexes)
  head = indexes.first
  # Integers are taken as positions and handed back untouched.
  return(indexes.size == 1 ? head : indexes) if head.is_a?(Integer)

  found = self[indexes]
  # A full tuple resolves to one position; a partial one to a sub-index
  # whose tuples are resolved one by one.
  found.is_a?(Integer) ? found : found.map { |tuple| self[tuple] }
end
|
|
166
|
+
|
|
167
|
+
# Returns the sub-index selected by positions (when the first argument is
# an Integer) or by index values (conformed to the given key otherwise).
def subset(*indexes)
  return self[indexes].conform(indexes) unless indexes.first.is_a?(Integer)

  MultiIndex.from_tuples(indexes.map { |position| key(position) })
end
|
|
174
|
+
|
|
175
|
+
# Takes positional values and returns subset of the self
|
|
176
|
+
# capturing the indexes at mentioned positions
|
|
177
|
+
# @param positions [Array<Integer>] positional values
|
|
178
|
+
# @return [object] index object
|
|
179
|
+
# @example
|
|
180
|
+
# idx = DaruLite::MultiIndex.from_tuples [[:a, :one], [:a, :two], [:b, :one], [:b, :two]]
|
|
181
|
+
# idx.at 0, 1
|
|
182
|
+
# # => #<DaruLite::MultiIndex(2x2)>
|
|
183
|
+
# # a one
|
|
184
|
+
# # two
|
|
185
|
+
def at(*positions)
  positions = preprocess_positions(*positions)
  validate_positions(*positions)

  # A single position yields its tuple; several yield a new MultiIndex.
  return key(positions) if positions.is_a?(Integer)

  DaruLite::MultiIndex.from_tuples(positions.map { |position| key(position) })
end
|
|
194
|
+
|
|
195
|
+
def add(*indexes)
|
|
196
|
+
DaruLite::MultiIndex.from_tuples(to_a + [indexes])
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
def reorder(new_order)
|
|
200
|
+
from = to_a
|
|
201
|
+
MultiIndex.from_tuples(new_order.map { |i| from[i] })
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
# An Integer that is a value of the first level is resolved as an index
# value; any other Integer is returned unchanged (i.e. as a position).
def try_retrieve_from_integer(int)
  return int unless @levels[0].key?(int)

  retrieve_from_tuples([int])
end
|
|
207
|
+
|
|
208
|
+
def retrieve_from_range(range)
|
|
209
|
+
MultiIndex.from_tuples(range.map { |index| key(index) })
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
# Resolves a (possibly partial) tuple to the matching positions.
#
# @param key [Array] one value per level, starting at level 0
# @return [Integer] the position, when a full tuple matches exactly one row
# @return [MultiIndex] the matching rows otherwise
# @raise [IndexError] when a value is unknown to its level or when no row
#   matches the combination of values
def retrieve_from_tuples(key)
  chosen = []

  key.each_with_index do |k, depth|
    level_index = @levels[depth][k]
    raise IndexError, "Specified index #{key.inspect} do not exist" if level_index.nil?

    label = @labels[depth]
    chosen = find_all_indexes label, level_index, chosen

    # find_all_indexes treats an empty selection as "nothing filtered yet"
    # and would restart the search from every row at the next level,
    # returning rows that do not match the earlier key parts. Stop as soon
    # as the combination matches nothing.
    raise IndexError, "Specified index #{key.inspect} do not exist" if chosen.empty?
  end

  return chosen[0] if chosen.size == 1 && key.size == @levels.size

  multi_index_from_multiple_selections(chosen)
end
|
|
227
|
+
|
|
228
|
+
def multi_index_from_multiple_selections(chosen)
|
|
229
|
+
MultiIndex.from_tuples(chosen.map { |e| key(e) })
|
|
230
|
+
end
|
|
231
|
+
|
|
232
|
+
# Narrows +chosen+ (row positions) to those whose label equals
# +level_index+. An empty +chosen+ means "no filter applied yet": all
# matching positions of +label+ are returned. A non-empty +chosen+ is
# filtered in place and returned.
def find_all_indexes(label, level_index, chosen)
  return chosen.keep_if { |position| label[position] == level_index } unless chosen.empty?

  label.each_index.select { |position| label[position] == level_index }
end
|
|
240
|
+
|
|
241
|
+
# Removes the level at +layer_index+ (levels, labels and, when set, name)
# in place and returns the result of #coerce_index.
def remove_layer(layer_index)
  [@levels, @labels].each { |store| store.delete_at(layer_index) }
  @name.delete_at(layer_index) unless @name.nil?

  coerce_index
end
|
|
248
|
+
|
|
249
|
+
# With a single remaining level, converts to a flat index: an Index when
# all values are unique, a CategoricalIndex otherwise. With more levels,
# returns self.
def coerce_index
  return self unless @levels.size == 1

  elements = to_a.flatten
  all_unique = elements.uniq.length == elements.length
  index_class = all_unique ? DaruLite::Index : DaruLite::CategoricalIndex
  index_class.new(elements)
end
|
|
262
|
+
|
|
263
|
+
# Array `name` must have same length as levels and labels.
|
|
264
|
+
# @raise [SizeError] when +names+ and +levels+ differ in size; when names
#   are missing the message also suggests using empty strings
def validate_name(names, levels)
  size_report = "'names' and 'levels' should be of same size. Size of the " \
                "'name' array is #{names.size} and size of the MultiIndex 'levels' and " \
                "'labels' is #{labels.size}."
  hint = 'If you do not want to set name for particular level ' \
         "(say level 'i') then put empty string on index 'i' of the 'name' Array."

  case names.size <=> levels.size
  when 1 then raise SizeError, size_report
  when -1 then raise SizeError, "#{size_report}\n#{hint}"
  end
end
|
|
274
|
+
|
|
275
|
+
private :find_all_indexes, :multi_index_from_multiple_selections,
|
|
276
|
+
:retrieve_from_range, :retrieve_from_tuples, :validate_name
|
|
277
|
+
|
|
278
|
+
# Returns the tuple stored at position +index+.
# @raise [ArgumentError] when +index+ is not below the number of rows
def key(index)
  raise ArgumentError, "Key #{index} is too large" if index >= @labels[0].size

  Array.new(@labels.size) do |depth|
    @levels[depth].keys[@labels[depth][index]]
  end
end
|
|
285
|
+
|
|
286
|
+
def dup
|
|
287
|
+
MultiIndex.new levels: levels.dup, labels: labels.dup, name: @name&.dup
|
|
288
|
+
end
|
|
289
|
+
|
|
290
|
+
def drop_left_level(by = 1)
|
|
291
|
+
MultiIndex.from_arrays to_a.transpose[by..]
|
|
292
|
+
end
|
|
293
|
+
|
|
294
|
+
def |(other)
|
|
295
|
+
MultiIndex.from_tuples(to_a | other.to_a)
|
|
296
|
+
end
|
|
297
|
+
|
|
298
|
+
def &(other)
|
|
299
|
+
MultiIndex.from_tuples(to_a & other.to_a)
|
|
300
|
+
end
|
|
301
|
+
|
|
302
|
+
# True when there are no labels at all and every level is empty.
def empty?
  return false unless @labels.flatten.empty?

  @levels.all? { |level| level.empty? }
end
|
|
305
|
+
|
|
306
|
+
# Tells whether +tuple+ (full, or partial starting at level 0) is present.
#
# @param tuple [Enumerable] values, one per level; nested arrays are flattened
# @return [Boolean] false for non-Enumerable input, for tuples with more
#   values than the index has levels, and for tuples that match no row
def include?(tuple)
  return false unless tuple.is_a? Enumerable

  parts = tuple.flatten
  # A tuple wider than the index cannot match any row; without this guard
  # the lookup below would call #[] on a missing (nil) level.
  return false if parts.size > @levels.size

  wanted = parts.each_with_index.map { |part, depth| @levels[depth][part] }
  @labels[0...parts.size].transpose.include?(wanted)
end
|
|
313
|
+
|
|
314
|
+
def size
|
|
315
|
+
@labels[0].size
|
|
316
|
+
end
|
|
317
|
+
|
|
318
|
+
def width
|
|
319
|
+
@levels.size
|
|
320
|
+
end
|
|
321
|
+
|
|
322
|
+
def ==(other)
|
|
323
|
+
self.class == other.class &&
|
|
324
|
+
labels == other.labels &&
|
|
325
|
+
levels == other.levels
|
|
326
|
+
end
|
|
327
|
+
|
|
328
|
+
# Returns every tuple of the index, in positional order.
#
# The key list of each level is built once here instead of once per row
# and level (as happens when going through #key for every position).
#
# @return [Array<Array>]
def to_a
  level_keys = @levels.map(&:keys)

  (0...size).map do |position|
    @labels.each_with_index.map { |label, depth| level_keys[depth][label[position]] }
  end
end
|
|
331
|
+
|
|
332
|
+
# Positions of all rows: 0 up to size - 1.
def values
  (0...size).to_a
end
|
|
335
|
+
|
|
336
|
+
def inspect(threshold = 20)
|
|
337
|
+
"#<DaruLite::MultiIndex(#{size}x#{width})>\n" +
|
|
338
|
+
Formatters::Table.format([], headers: @name, row_headers: sparse_tuples, threshold: threshold)
|
|
339
|
+
end
|
|
340
|
+
|
|
341
|
+
def to_html
|
|
342
|
+
path = File.expand_path('../iruby/templates/multi_index.html.erb', __dir__)
|
|
343
|
+
ERB.new(File.read(path).strip).result(binding)
|
|
344
|
+
end
|
|
345
|
+
|
|
346
|
+
# Provide a MultiIndex for sub vector produced
|
|
347
|
+
#
|
|
348
|
+
# @param input_indexes [Array] the input by user to index the vector
|
|
349
|
+
# @return [Object] the MultiIndex object for sub vector produced
|
|
350
|
+
# Range input keeps the index as is; otherwise as many left levels are
# dropped as index values were supplied.
def conform(input_indexes)
  input_indexes[0].is_a?(Range) ? self : drop_left_level(input_indexes.size)
end
|
|
355
|
+
|
|
356
|
+
# Return tuples with nils in place of repeating values, like this:
|
|
357
|
+
#
|
|
358
|
+
# [:a , :bar, :one]
|
|
359
|
+
# [nil, nil , :two]
|
|
360
|
+
# [nil, :foo, :one]
|
|
361
|
+
#
|
|
362
|
+
# Returns the tuples with nil in place of leading values that repeat the
# previous row (used for display).
#
# Consecutive pairs are taken from the tuples already computed here rather
# than through #each_cons on self, which rebuilt the whole tuple list a
# second time.
#
# @return [Array<Array>] for an empty index the result is [nil]
def sparse_tuples
  tuples = to_a

  [tuples.first] + tuples.each_cons(2).map do |prev, cur|
    left = cur.zip(prev).drop_while { |c, p| c == p }
    Array.new(cur.size - left.size) + left.map(&:first)
  end
end
|
|
369
|
+
|
|
370
|
+
def to_df
|
|
371
|
+
DaruLite::DataFrame.new(@name.zip(to_a.transpose).to_h)
|
|
372
|
+
end
|
|
373
|
+
end
|
|
374
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
module DaruLite
  module IO
    module CSV
      # Additional field converters, keyed by the name usable in the
      # :converters option. Each one takes the field and a second
      # (ignored) argument.
      CONVERTERS = {
        # 'true'/'false' (any case, surrounding whitespace ignored) become
        # booleans; every other field is returned untouched.
        boolean: lambda { |field, _info|
          normalized = field.downcase.strip
          if normalized == 'true'
            true
          elsif normalized == 'false'
            false
          else
            field
          end
        },
        # Leaves the field as it is.
        string: ->(field, _info) { field }
      }.freeze
    end
  end
end
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
module DaruLite
|
|
2
|
+
require_relative 'csv/converters'
|
|
3
|
+
module IOHelpers
|
|
4
|
+
class << self
|
|
5
|
+
# Converts the cells of +row+: cells listed in +empty+ become nil, all
# others go through try_string_to_number.
#
# FIXME: As far as I can guess, the nil branch will never be taken. This
# is called only inside `from_plaintext`, where the data is split by
# `\s+` -- there is no chance that an "empty" value (currently just '')
# ends up between the data?..
def process_row(row, empty)
  row.to_a.map { |cell| empty.include?(cell) ? nil : try_string_to_number(cell) }
end
|
|
18
|
+
|
|
19
|
+
# Opens +path+ for reading: HTTP(S) URLs through URI#open, anything else
# as a local file.
#
# Local paths that are not valid URIs (for example paths containing
# spaces) used to fail in URI.parse; they are now opened directly.
#
# @param path [String] local path or http/https URL
# @return [IO, StringIO, Tempfile] an open handle; the caller must close it
def open_local_or_remote_file(path)
  uri = begin
    URI.parse(path)
  rescue URI::InvalidURIError
    nil
  end
  return File.open(path) if uri.nil?

  # URI::HTTPS is a subclass of URI::HTTP, so one check covers both.
  uri.is_a?(URI::HTTP) ? uri.open : File.open(uri.path)
end
|
|
23
|
+
|
|
24
|
+
private
|
|
25
|
+
|
|
26
|
+
INT_PATTERN = /^[-+]?\d+$/.freeze
|
|
27
|
+
FLOAT_PATTERN = /^[-+]?\d+[,.]?\d*(e-?\d+)?$/.freeze
|
|
28
|
+
|
|
29
|
+
# Turns +s+ into an Integer or Float when it matches INT_PATTERN or
# FLOAT_PATTERN (a decimal comma is accepted); returns +s+ unchanged
# otherwise.
def try_string_to_number(s)
  return s.to_i if INT_PATTERN === s
  return s.tr(',', '.').to_f if FLOAT_PATTERN === s

  s
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
module IO
|
|
43
|
+
class << self
|
|
44
|
+
# Functions for loading/writing Excel files.
|
|
45
|
+
|
|
46
|
+
# Builds a DataFrame with one vector per header returned by
# read_from_excel. :worksheet_id and :row_id default to 0.
def from_excel(path, opts = {})
  settings = { worksheet_id: 0, row_id: 0 }.merge(opts)
  worksheet, headers = read_from_excel(path, settings)

  frame = DaruLite::DataFrame.new({})
  headers.each.with_index do |header, position|
    values = worksheet.column(position).to_a
    values.delete_at(0) # the first cell of every column is discarded
    frame[header] = values
  end
  frame
end
|
|
62
|
+
|
|
63
|
+
def read_from_excel(path, opts)
|
|
64
|
+
optional_gem 'spreadsheet', '~>1.3.0'
|
|
65
|
+
|
|
66
|
+
worksheet_id = opts[:worksheet_id]
|
|
67
|
+
row_id = opts[:row_id]
|
|
68
|
+
book = Spreadsheet.open path
|
|
69
|
+
worksheet = book.worksheet worksheet_id
|
|
70
|
+
headers = ArrayHelper.recode_repeated(worksheet.row(row_id)).map(&:to_sym)
|
|
71
|
+
|
|
72
|
+
[worksheet, headers]
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
# Writes +dataframe+ to an Excel file at +path+: a bold blue header row
# with the vector names, followed by one row per DataFrame row.
#
# The spreadsheet gem is loaded here as well (same constraint as
# read_from_excel); previously writing only worked if a read had already
# loaded it.
#
# @param dataframe [DaruLite::DataFrame]
# @param path [String] target file
# @param _opts [Hash] currently unused
def dataframe_write_excel(dataframe, path, _opts = {})
  optional_gem 'spreadsheet', '~>1.3.0'

  book   = Spreadsheet::Workbook.new
  sheet  = book.create_worksheet
  format = Spreadsheet::Format.new color: :blue, weight: :bold

  sheet.row(0).concat(dataframe.vectors.to_a.map(&:to_s)) # Unfreeze strings
  sheet.row(0).default_format = format
  i = 1
  dataframe.each_row do |row|
    sheet.row(i).concat(row.to_a)
    i += 1
  end

  book.write(path)
end
|
|
90
|
+
|
|
91
|
+
# Functions for loading/writing CSV files
|
|
92
|
+
# Builds a DataFrame from the CSV at +path+ (local path or http(s) URL).
#
# @param path [String]
# @param opts [Hash] CSV options plus the DataFrame options :clone,
#   :order, :index and :name
# @yield [csv] the parsed CSV data, before it is turned into columns. The
#   block is now forwarded to the parsing helpers; before, it was silently
#   dropped and their `yield ... if block_given?` could never run.
# @return [DaruLite::DataFrame]
def from_csv(path, opts = {}, &block)
  daru_options, opts = from_csv_prepare_opts opts
  # Preprocess headers for detecting and correcting repetition in
  # case the :headers option is not specified.
  hsh =
    if opts[:headers]
      from_csv_hash_with_headers(path, opts, &block)
    else
      from_csv_hash(path, opts, &block)
        .tap { |hash| daru_options[:order] = hash.keys }
    end
  DaruLite::DataFrame.new(hsh, daru_options)
end
|
|
105
|
+
|
|
106
|
+
# Writes +dataframe+ as CSV to +path+.
#
# @param dataframe [DaruLite::DataFrame]
# @param path [String] target file
# @param opts [Hash] options for CSV.open, plus:
#   * :convert_comma - write every value as a string with '.' replaced by ','
#   * headers: false - do not write the header row
# @return [nil]
def dataframe_write_csv(dataframe, path, opts = {})
  options = { converters: :numeric }.merge(opts)
  # :convert_comma is our own option; CSV does not know it, so it must not
  # be passed on to CSV.open.
  convert_comma = options.delete(:convert_comma)

  # Block form: the file is closed even when a row fails to serialize.
  ::CSV.open(path, 'w', **options) do |writer|
    writer << dataframe.vectors.to_a unless options[:headers] == false

    dataframe.each_row do |row|
      writer << if convert_comma
                  row.map { |v| v.to_s.tr('.', ',') }
                else
                  row.to_a
                end
    end
  end

  nil
end
|
|
124
|
+
|
|
125
|
+
# Execute a query and create a data frame from the result
|
|
126
|
+
#
|
|
127
|
+
# @param db [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
|
|
128
|
+
# @param query [String] The query to be executed
|
|
129
|
+
#
|
|
130
|
+
# @return A dataframe containing the data resulting from the query
|
|
131
|
+
def from_sql(db, query)
|
|
132
|
+
require 'daru_lite/io/sql_data_source'
|
|
133
|
+
SqlDataSource.make_dataframe(db, query)
|
|
134
|
+
end
|
|
135
|
+
|
|
136
|
+
# Inserts every row of +ds+ into +table+ through a prepared statement.
#
# NOTE(review): the table name and the vector names are interpolated into
# the SQL text (identifiers cannot be bound as parameters); they must come
# from trusted input. Row values are bound through placeholders.
#
# @param ds [DaruLite::DataFrame] data to insert
# @param dbh [DBI::DatabaseHandle] open database handle
# @param table [String] target table
# @return [true]
def dataframe_write_sql(ds, dbh, table)
  require 'dbi'
  columns = ds.vectors.to_a.join(',')
  placeholders = (['?'] * ds.vectors.size).join(',')
  sth = dbh.prepare("INSERT INTO #{table} (#{columns}) VALUES (#{placeholders})")

  begin
    ds.each_row { |c| sth.execute(*c.to_a) }
  ensure
    # Release the statement handle even when an insert fails.
    sth.finish if sth.respond_to?(:finish)
  end
  true
end
|
|
143
|
+
|
|
144
|
+
# Load dataframe from AR::Relation
|
|
145
|
+
#
|
|
146
|
+
# @param relation [ActiveRecord::Relation] A relation to be used to load the contents of dataframe
|
|
147
|
+
#
|
|
148
|
+
# @return A dataframe containing the data in the given relation
|
|
149
|
+
# Loads the given fields (default: all columns of the relation's model)
# into a DataFrame.
#
# @param relation [ActiveRecord::Relation]
# @param fields [Array<Symbol, String>] columns to load
# @return [DaruLite::DataFrame]
def from_activerecord(relation, *fields)
  fields = relation.klass.column_names if fields.empty?
  fields = fields.map(&:to_sym)

  rows = relation.pluck(*fields)
  # With a single column, pluck returns a flat array of values instead of
  # an array of rows; wrap each value so that transpose works.
  rows = rows.map { |value| [value] } if fields.size == 1

  DaruLite::DataFrame.new(rows.transpose, order: fields).tap(&:update)
end
|
|
156
|
+
|
|
157
|
+
# Loading data from plain text files
|
|
158
|
+
|
|
159
|
+
# Reads a whitespace-separated text file into a DataFrame, one row per
# line, with +fields+ as vector names.
#
# @param filename [String] file to read
# @param fields [Array] vector names, in column order
# @return [DaruLite::DataFrame]
def from_plaintext(filename, fields)
  ds = DaruLite::DataFrame.new({}, order: fields)

  # File.foreach closes the file when done (the handle used to be left open).
  File.foreach(filename) do |line|
    row = DaruLite::IOHelpers.process_row(line.strip.split(/\s+/), [''])
    next if row == ["\x1A"] # a line holding only the SUB (Ctrl-Z) character

    ds.add_row(row)
  end

  ds.update
  fields.each { |f| ds[f].rename f }
  ds
end
|
|
172
|
+
|
|
173
|
+
# Loading and writing Marshalled DataFrame/Vector
|
|
174
|
+
# Marshals +klass+ (any marshallable object, e.g. a DataFrame or Vector)
# into +filename+.
#
# The file is opened in binary mode, since Marshal output is binary data,
# and in block form so that the handle is closed even if dumping fails.
#
# @param klass [Object] object to dump
# @param filename [String] target file
# @return [nil]
def save(klass, filename)
  File.open(filename, 'wb') { |fp| Marshal.dump(klass, fp) }
  nil
end
|
|
179
|
+
|
|
180
|
+
# Loads an object previously written with save.
#
# SECURITY: Marshal.load can instantiate arbitrary objects; only use this
# on files from a trusted source.
#
# @param filename [String] file to read
# @return [Object, false] the unmarshalled object, or false when the file
#   does not exist
def load(filename)
  return false unless File.exist?(filename)

  # Binary mode: marshalled data is not text.
  File.open(filename, 'rb') { |fp| Marshal.load(fp) }
end
|
|
189
|
+
|
|
190
|
+
private
|
|
191
|
+
|
|
192
|
+
# Activates and requires an optional dependency; when it cannot be loaded
# the problem is reported through DaruLite.error.
def optional_gem(name, version)
  begin
    gem name, version
    require name
  rescue LoadError
    DaruLite.error "\nInstall the #{name} gem version #{version} for using #{name} functions."
  end
end
|
|
198
|
+
|
|
199
|
+
DARU_OPT_KEYS = %i[clone order index name].freeze
|
|
200
|
+
|
|
201
|
+
# Applies the CSV defaults and separates DataFrame options from CSV options.
#
# Defaults: col_sep ',', skip_blanks true, converters [:numeric].
# An explicit `skip_blanks: false` is now respected (`||= true` used to
# override it), and the caller's hash is no longer modified.
#
# @param opts [Hash] options given to from_csv
# @return [Array(Hash, Hash)] DataFrame options (DARU_OPT_KEYS) and the
#   remaining CSV options
def from_csv_prepare_opts(opts)
  opts = opts.dup
  opts[:col_sep] ||= ','
  opts[:skip_blanks] = true if opts[:skip_blanks].nil?
  opts[:converters] = from_csv_prepare_converters(opts[:converters] || [:numeric])

  daru_options, csv_options =
    opts.partition { |key, _| DARU_OPT_KEYS.include?(key) }.map(&:to_h)
  [daru_options, csv_options]
end
|
|
213
|
+
|
|
214
|
+
# Resolves converter names: first against the standard CSV converters,
# then against DaruLite's own; anything unknown is passed through as is.
# Converters that resolve to a list are flattened into the result.
def from_csv_prepare_converters(converters)
  Array(converters).flat_map do |converter|
    ::CSV::Converters[converter] || DaruLite::IO::CSV::CONVERTERS[converter] || converter
  end
end
|
|
225
|
+
|
|
226
|
+
# Parses the CSV at +path+ using its header row and returns the columns.
# Header names are symbolized unless :header_converters is given.
#
# @param path [String] local path or http(s) URL
# @param opts [Hash] CSV options (with :headers set)
# @yield [table] the parsed table, before conversion to columns
# @return [Hash] column name => array of values
def from_csv_hash_with_headers(path, opts)
  opts[:header_converters] ||= :symbol
  source = DaruLite::IOHelpers.open_local_or_remote_file(path)

  begin
    ::CSV
      .parse(source, **opts)
      .tap { |c| yield c if block_given? }
      .by_col.to_h { |col_name, values| [col_name, values] }
  ensure
    # The opened file/stream was previously never closed.
    source.close unless source.closed?
  end
end
|
|
233
|
+
|
|
234
|
+
# Parses the CSV at +path+ without CSV header handling: the first row is
# taken as the header (repeated names are recoded by
# ArrayHelper.recode_repeated) and the remaining rows become the columns.
#
# @param path [String] local path or http(s) URL
# @param opts [Hash] CSV options
# @yield [rows] the parsed rows, before conversion to columns
# @return [Hash] header => array of column values
def from_csv_hash(path, opts)
  source = DaruLite::IOHelpers.open_local_or_remote_file(path)

  csv_as_arrays =
    begin
      ::CSV
        .parse(source, **opts)
        .tap { |c| yield c if block_given? }
        .to_a
    ensure
      # The opened file/stream was previously never closed.
      source.close unless source.closed?
    end

  headers       = ArrayHelper.recode_repeated(csv_as_arrays.shift)
  csv_as_arrays = csv_as_arrays.transpose
  headers.each_with_index.to_h { |h, i| [h, csv_as_arrays[i]] }
end
|
|
244
|
+
|
|
245
|
+
# Extracts data rows, column order and (optional) row index from an HTML
# table node.
#
# @param table [#search] table node; `search('tr')` must return row nodes
# @return [Hash, nil] { data:, index:, order: }, or nil when the table
#   cannot be turned into a frame (no rows, no usable header row, or an
#   index whose size does not match the data)
def html_parse_table(table)
  headers, headers_size = html_scrape_tag(table, 'th')
  data, size = html_scrape_tag(table, 'td')
  # A table without any <tr> yields nil sizes; comparing them below would
  # raise instead of skipping the table.
  return if headers_size.nil? || size.nil?

  # Only rows with the full number of cells are data rows.
  data = data.select { |cells| cells.count == size }
  order, indice = html_parse_hash(headers, size, headers_size) if headers_size >= size
  return unless (indice.nil? || indice.count == data.count) && !order.nil? && order.count.positive?

  { data: data.compact, index: indice, order: order }
end
|
|
254
|
+
|
|
255
|
+
# Collects the stripped text of every +tag+ cell, row by row, together
# with the largest number of such cells found in a row (nil without rows).
def html_scrape_tag(table, tag)
  rows = table.search('tr').map do |row|
    row.search(tag).map { |cell| cell.text.strip }
  end
  [rows, rows.map(&:count).max]
end
|
|
260
|
+
|
|
261
|
+
# Splits headers (all th tags) into order and index. Wherein,
|
|
262
|
+
# Order : All <th> tags on first proper row of HTML table
|
|
263
|
+
# index : All <th> tags on first proper column of HTML table
|
|
264
|
+
def html_parse_hash(headers, size, headers_size)
  # The first row with the maximum number of <th> cells is the header row.
  header_row = headers.index { |cells| cells.count == headers_size }
  full_order = headers[header_row]

  # Keep only the rightmost +size+ header cells.
  order = full_order[(full_order.count - size)..]

  # <th> cells of all following rows form the index, if there are any.
  indice = headers[(header_row + 1)..].flatten
  [order, indice.to_a.empty? ? nil : indice]
end
|
|
273
|
+
|
|
274
|
+
# True when no +match+ is requested, otherwise whether the table's string
# form contains +match+.
def html_search(table, match = nil)
  match.nil? || table.to_s.include?(match)
end
|
|
277
|
+
|
|
278
|
+
# Allows user to override the scraped order / index / data
|
|
279
|
+
# Fills +user_val+ (in place) with the scraped value for each of :data,
# :index, :name and :order that the user left unset, and returns it.
def html_decide_values(scraped_val = {}, user_val = {})
  %i[data index name order].each_with_object(user_val) do |field, merged|
    merged[field] ||= scraped_val[field]
  end
end
|
|
285
|
+
|
|
286
|
+
# Builds a DataFrame from the rows in table[:data], using table[:index],
# table[:order] and table[:name] as the frame's options.
def html_table_to_dataframe(table)
  frame_options = { index: table[:index], order: table[:order], name: table[:name] }
  DaruLite::DataFrame.rows(table[:data], **frame_options)
end
|
|
292
|
+
end
|
|
293
|
+
end
|
|
294
|
+
end
|