daru_lite 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,374 @@
1
+ module DaruLite
2
+ class MultiIndex < Index # rubocop:disable Metrics/ClassLength
3
# Iterates over the index tuples in positional order.
#
# @yield [tuple] each tuple produced by #to_a
# @return whatever Array#each returns for the materialized tuples
def each(&block)
  tuples = to_a
  tuples.each(&block)
end
6
+
7
# Maps the given block over the index tuples (see #to_a).
#
# @yield [tuple] each tuple of the index
# @return [Array] the block results
def map(&block)
  tuples = to_a
  tuples.map(&block)
end
10
+
11
+ attr_reader :labels, :name
12
+
13
# Returns the level values as arrays. Internally each level is stored as a
# Hash mapping value => position, so only the keys are exposed here.
#
# @return [Array<Array>] one array of level values per level
def levels
  @levels.map { |lookup| lookup.keys }
end
16
+
17
+ # names and levels should be of same size. If size of Array `name` is less
18
+ # or greater than size of array `levels` then it raises `SizeError`.
19
+ # If the user doesn't want to name a particular level, they must put
20
+ # an empty string at that index of the `name` Array.
21
+ # For example, if a multi_index has 3 levels and the user doesn't want to name
22
+ # level 0, then do multi_index.name = ['', 'level1_name1', 'level2_name']
23
+ #
24
+ # @example
25
+ #
26
+ # # set the name during initialization
27
+ #
28
+ # mi = DaruLite::MultiIndex.new(
29
+ # levels: [[:a,:b,:c], [:one, :two]],
30
+ # labels: [[0,0,1,1,2,2], [0,1,0,1,0,1]], name: ['s1', 's2'])
31
+ #
32
+ # # =>
33
+ # # <DaruLite::MultiIndex(6x2)>
34
+ # # s1 s2
35
+ # # a one
36
+ # # two
37
+ # # b one
38
+ # # two
39
+ # # c one
40
+ # # two
41
+ #
42
+ # # set new name
43
+ #
44
+ # mi.name = ['k1', 'k2']
45
+ # => ["k1", "k2"]
46
+ #
47
+ # mi
48
+ # =>
49
+ # # #<DaruLite::MultiIndex(6x2)>
50
+ # # k1 k2
51
+ # # a one
52
+ # # two
53
+ # # b one
54
+ # # two
55
+ # # c one
56
+ # # two
57
+ #
58
+ # # access the name
59
+ #
60
+ # mi.name
61
+ # => ["k1", "k2"]
62
+ #
63
+ # # If you don't want to name level 0
64
+ #
65
+ # mi.name = ['', 'k2']
66
+ # => ["", "k2"]
67
+ #
68
+ # mi
69
+ # =>
70
+ # #<DaruLite::MultiIndex(6x2)>
71
+ # # k2
72
+ # # a one
73
+ # # two
74
+ # # b one
75
+ # # two
76
+ # # c one
77
+ # # two
78
+ #
79
# Builds a MultiIndex from explicit levels and labels.
#
# @param opts [Hash]
# @option opts [Array<Array>] :levels unique values of every level
# @option opts [Array<Array<Integer>>] :labels per level, the position in
#   the matching level array of each row's value
# @option opts [Array] :name optional level names (validated by #name=)
# @raise [ArgumentError] when labels or levels are missing, differ in size,
#   or a level contains duplicate values
def initialize(opts = {})
  labels, levels = opts.values_at(:labels, :levels)

  raise ArgumentError, 'Must specify both labels and levels' unless labels && levels
  raise ArgumentError, 'Labels and levels should be same size' unless labels.size == levels.size
  raise ArgumentError, 'Incorrect labels and levels' if incorrect_fields?(labels, levels)

  @labels = labels
  # Store each level as value => position for constant-time lookups.
  @levels = levels.map { |level| level.each_with_index.to_h }
  given_name = opts[:name]
  self.name = given_name unless given_name.nil?
end
91
+
92
# Sets the level names after checking them with #validate_name against
# the current labels.
#
# @param names [Array] one name per level
def name=(names)
  validate_name(names, @labels)
  @name = names
end
96
+
97
# Tells whether any level contains duplicate values.
#
# @param _labels [Array] unused
# @param levels [Array<Array>] level values to check
# @return [Boolean] true when at least one level has a repeated value
def incorrect_fields?(_labels, levels)
  levels[0].size # FIXME: without this exact call some specs are failing

  !levels.all? { |level| level.uniq.size == level.size }
end
102
+
103
+ private :incorrect_fields?
104
+
105
# Builds a MultiIndex from one array per level (column-wise data).
#
# Levels are the unique values of each array, ordered by their string
# representation; labels are the positions of every value in its level.
# Positions are resolved through a value => position Hash instead of
# Array#index, which avoids a linear scan per element and uses the same
# hash equality as `uniq` (so e.g. 1 and 1.0 stay distinct).
#
# @param arrays [Array<Array>] one array of values per level
# @return [MultiIndex]
def self.from_arrays(arrays)
  levels = arrays.map { |e| e.uniq.sort_by(&:to_s) }

  labels = arrays.zip(levels).map do |arry, level|
    positions = level.each_with_index.to_h
    arry.map { |lvl| positions[lvl] }
  end

  MultiIndex.new labels: labels, levels: levels
end
115
+
116
# Builds a MultiIndex from row-wise tuples by transposing them into
# per-level arrays and delegating to .from_arrays.
#
# @param tuples [Array<Array>] one tuple per row
def self.from_tuples(tuples)
  per_level = tuples.transpose
  from_arrays(per_level)
end
119
+
120
# Builds a MultiIndex only when the argument looks like a list of tuples,
# i.e. it responds to #first and its first element is an Array.
#
# @return [MultiIndex, nil] nil when the input is not tuple-like
def self.try_from_tuples(tuples)
  return unless tuples.respond_to?(:first)
  return unless tuples.first.is_a?(Array)

  from_tuples(tuples)
end
123
+
124
# Looks up positions or sub-indexes.
#
# * a Range as first key     -> #retrieve_from_range
# * a single Integer         -> #try_retrieve_from_integer
# * anything else            -> #retrieve_from_tuples
#
# @raise [IndexError] when the tuple lookup fails with a NoMethodError
def [](*key)
  key.flatten!
  head = key[0]
  return retrieve_from_range(head) if head.is_a?(Range)
  return try_retrieve_from_integer(head) if head.is_a?(Integer) && key.size == 1

  begin
    retrieve_from_tuples key
  rescue NoMethodError
    raise IndexError, "Specified index #{key.inspect} do not exist"
  end
end
138
+
139
# Tells whether the given indexes can be resolved by #pos.
#
# @return [Boolean] false when #pos raises IndexError, true otherwise
def valid?(*indexes)
  # FIXME: This is perhaps not a good method
  begin
    pos(*indexes)
  rescue IndexError
    return false
  end
  true
end
146
+
147
+ # Returns positions given indexes or positions
148
+ # @note If the argument is both a valid index and a valid position,
149
+ # it will be treated as a valid index
150
+ # @param indexes [Array<object>] indexes or positions
151
+ # @example
152
+ # idx = DaruLite::MultiIndex.from_tuples [[:a, :one], [:a, :two], [:b, :one], [:b, :two]]
153
+ # idx.pos :a
154
+ # # => [0, 1]
155
# Resolves indexes to positions. Integer arguments are passed through
# untouched (a single one unwrapped, several as an Array); anything else
# is looked up with #[] and, when that yields a sub-index, each of its
# tuples is resolved to its position in self.
def pos(*indexes)
  head = indexes.first
  return(indexes.size == 1 ? head : indexes) if head.is_a?(Integer)

  found = self[indexes]
  found.is_a?(Integer) ? found : found.map { |tuple| self[tuple] }
end
166
+
167
# Returns the sub-index for the given positions or index values.
# Integer arguments are treated as positions; otherwise the lookup result
# of #[] is conformed to the requested indexes.
def subset(*indexes)
  unless indexes.first.is_a?(Integer)
    return self[indexes].conform(indexes)
  end

  tuples = indexes.map { |position| key(position) }
  MultiIndex.from_tuples(tuples)
end
174
+
175
+ # Takes positional values and returns subset of the self
176
+ # capturing the indexes at mentioned positions
177
+ # @param positions [Array<Integer>] positional values
178
+ # @return [object] index object
179
+ # @example
180
+ # idx = DaruLite::MultiIndex.from_tuples [[:a, :one], [:a, :two], [:b, :one], [:b, :two]]
181
+ # idx.at 0, 1
182
+ # # => #<DaruLite::MultiIndex(2x2)>
183
+ # # a one
184
+ # # two
185
# Returns the tuple at a single position, or a MultiIndex made of the
# tuples at several positions. The positions are normalized and validated
# by #preprocess_positions and #validate_positions, which are not defined
# in this class.
def at(*positions)
  positions = preprocess_positions(*positions)
  validate_positions(*positions)
  return key(positions) if positions.is_a?(Integer)

  tuples = positions.map { |position| key(position) }
  DaruLite::MultiIndex.from_tuples(tuples)
end
194
+
195
# Returns a new MultiIndex with one extra tuple appended.
#
# @param indexes [Array] the values of the new tuple, one per level
def add(*indexes)
  extended = [*to_a, indexes]
  DaruLite::MultiIndex.from_tuples(extended)
end
198
+
199
# Returns a new MultiIndex whose tuples follow the given positional order.
#
# @param new_order [Array<Integer>] positions into the current tuples
def reorder(new_order)
  current = to_a
  rearranged = new_order.map { |position| current[position] }
  MultiIndex.from_tuples(rearranged)
end
203
+
204
# Treats an Integer as an index value when the first level contains it,
# otherwise returns it unchanged (i.e. as a position).
def try_retrieve_from_integer(int)
  if @levels[0].key?(int)
    retrieve_from_tuples([int])
  else
    int
  end
end
207
+
208
# Builds a MultiIndex from the tuples located at every position of range.
def retrieve_from_range(range)
  tuples = range.map { |position| key(position) }
  MultiIndex.from_tuples(tuples)
end
211
+
212
# Resolves a (possibly partial) tuple to a position or to a sub-index.
#
# The candidate positions are narrowed level by level. A full-width key
# matching exactly one row returns that position; anything else returns a
# MultiIndex of the matching rows.
#
# The selection is checked after every level: #find_all_indexes treats an
# empty selection as "nothing selected yet", so carrying an empty result
# into the next level would restart the search and could match an
# unrelated row on indexes with three or more levels.
#
# @param key [Array] values for the leftmost levels
# @return [Integer, MultiIndex]
# @raise [IndexError] when a value is unknown to its level or no row
#   matches the key
def retrieve_from_tuples(key)
  chosen = []

  key.each_with_index do |k, depth|
    level_index = @levels[depth][k]
    raise IndexError, "Specified index #{key.inspect} do not exist" if level_index.nil?

    label = @labels[depth]
    chosen = find_all_indexes label, level_index, chosen
    raise IndexError, "Specified index #{key.inspect} do not exist" if chosen.empty?
  end

  return chosen[0] if chosen.size == 1 && key.size == @levels.size

  multi_index_from_multiple_selections(chosen)
end
227
+
228
# Builds a MultiIndex from the tuples found at the chosen positions.
def multi_index_from_multiple_selections(chosen)
  tuples = chosen.map { |position| key(position) }
  MultiIndex.from_tuples(tuples)
end
231
+
232
# Narrows a selection of positions to those whose label equals level_index.
# With an empty selection, every matching position of `label` is returned;
# otherwise `chosen` is filtered in place.
def find_all_indexes(label, level_index, chosen)
  return chosen.keep_if { |position| label[position] == level_index } unless chosen.empty?

  label.each_index.select { |position| label[position] == level_index }
end
240
+
241
# Removes one level (values, labels and, when present, its name) in place
# and returns the result of #coerce_index.
#
# @param layer_index [Integer] position of the level to drop
def remove_layer(layer_index)
  [@levels, @labels, @name].compact.each do |per_level|
    per_level.delete_at(layer_index)
  end

  coerce_index
end
248
+
249
# Downgrades a single-level MultiIndex to a flat index: a DaruLite::Index
# when all values are unique, a DaruLite::CategoricalIndex otherwise.
# Indexes with any other number of levels are returned unchanged.
def coerce_index
  return self unless @levels.size == 1

  elements = to_a.flatten
  all_unique = elements.uniq.length == elements.length
  index_class = all_unique ? DaruLite::Index : DaruLite::CategoricalIndex
  index_class.new(elements)
end
262
+
263
+ # Array `name` must have same length as levels and labels.
264
# Checks that `names` has exactly one entry per level.
#
# @param names [Array] candidate level names
# @param levels [Array] collection whose size is the expected name count
# @raise [SizeError] when the sizes differ; a hint about using empty
#   strings is appended when too few names were given
def validate_name(names, levels)
  return if names.size == levels.size

  error_msg = "'names' and 'levels' should be of same size. Size of the " \
              "'name' array is #{names.size} and size of the MultiIndex 'levels' and " \
              "'labels' is #{labels.size}."
  raise SizeError, error_msg if names.size > levels.size

  suggestion_msg = 'If you do not want to set name for particular level ' \
                   "(say level 'i') then put empty string on index 'i' of the 'name' Array."
  raise SizeError, "#{error_msg}\n#{suggestion_msg}"
end
274
+
275
+ private :find_all_indexes, :multi_index_from_multiple_selections,
276
+ :retrieve_from_range, :retrieve_from_tuples, :validate_name
277
+
278
# Returns the tuple stored at a position.
#
# @param index [Integer] position of the row
# @return [Array] one value per level
# @raise [ArgumentError] when index is not smaller than the number of rows
def key(index)
  raise ArgumentError, "Key #{index} is too large" if index >= @labels[0].size

  @labels.zip(@levels).map do |codes, lookup|
    # codes[index] is the position of the row's value within its level
    lookup.keys[codes[index]]
  end
end
285
+
286
# Returns a new MultiIndex built from shallow copies of the levels,
# labels and (when set) names of this one.
def dup
  copied = { levels: levels.dup, labels: labels.dup, name: @name&.dup }
  MultiIndex.new(**copied)
end
289
+
290
# Returns a MultiIndex without the leftmost level(s).
#
# @param by [Integer] number of levels to drop from the left
def drop_left_level(by = 1)
  per_level = to_a.transpose
  MultiIndex.from_arrays(per_level[by..])
end
293
+
294
# Union: a MultiIndex of the tuples present in self or in other,
# without duplicates.
def |(other)
  merged = to_a | other.to_a
  MultiIndex.from_tuples(merged)
end
297
+
298
# Intersection: a MultiIndex of the tuples present in both self and other.
def &(other)
  shared = to_a & other.to_a
  MultiIndex.from_tuples(shared)
end
301
+
302
# True when there are neither labels nor level values.
def empty?
  no_labels = @labels.flatten.empty?
  no_labels && @levels.all? { |lookup| lookup.empty? }
end
305
+
306
# Tells whether a (possibly partial) tuple occurs in the index.
#
# The tuple values are translated to their level positions and compared
# with the label rows of the leftmost levels.
#
# @param tuple [Enumerable] values for the leftmost levels
# @return [Boolean] false for any non-Enumerable argument
def include?(tuple)
  return false unless tuple.is_a? Enumerable

  wanted = tuple.flatten.each_with_index.map { |value, depth| @levels[depth][value] }
  label_rows = @labels[0...tuple.flatten.size].transpose
  label_rows.include?(wanted)
end
313
+
314
# Number of rows, taken from the label array of the first level.
def size
  first_level_labels = @labels[0]
  first_level_labels.size
end
317
+
318
# Number of levels.
def width
  @levels.length
end
321
+
322
# Two indexes are equal when they are of the same class and have equal
# labels and levels (names are not compared).
def ==(other)
  return false unless self.class == other.class

  labels == other.labels && levels == other.levels
end
327
+
328
# Materializes every tuple of the index, in positional order.
#
# @return [Array<Array>]
def to_a
  Array.new(size) { |position| key(position) }
end
331
+
332
# Returns the positions 0...size as an Array.
def values
  (0...size).to_a
end
335
+
336
# Human-readable representation: a header line with the dimensions,
# followed by a table of the sparse tuples (see #sparse_tuples).
#
# @param threshold [Integer] forwarded to Formatters::Table.format
def inspect(threshold = 20)
  header = "#<DaruLite::MultiIndex(#{size}x#{width})>\n"
  table = Formatters::Table.format([], headers: @name, row_headers: sparse_tuples, threshold: threshold)
  header + table
end
340
+
341
# Renders the index through the iruby multi_index ERB template, evaluated
# with this method's binding.
def to_html
  path = File.expand_path('../iruby/templates/multi_index.html.erb', __dir__)
  template = File.read(path).strip
  ERB.new(template).result(binding)
end
345
+
346
+ # Provide a MultiIndex for sub vector produced
347
+ #
348
+ # @param input_indexes [Array] the input by user to index the vector
349
+ # @return [Object] the MultiIndex object for sub vector produced
350
# Adapts the index to the indexes a caller used for a lookup: a Range
# keeps the index as is, otherwise one left level is dropped per index
# supplied.
def conform(input_indexes)
  input_indexes[0].is_a?(Range) ? self : drop_left_level(input_indexes.size)
end
355
+
356
+ # Return tuples with nils in place of repeating values, like this:
357
+ #
358
+ # [:a , :bar, :one]
359
+ # [nil, nil , :two]
360
+ # [nil, :foo, :one]
361
+ #
362
# Returns the tuples with nils in place of leading values that repeat the
# previous tuple, like this:
#
#   [:a , :bar, :one]
#   [nil, nil , :two]
#   [nil, :foo, :one]
#
# The tuples are materialized once and paired up from that array, rather
# than calling each_cons on self and rebuilding every tuple a second time.
def sparse_tuples
  tuples = to_a
  sparse = tuples.each_cons(2).map do |prev, cur|
    # keep everything from the first position that differs from prev
    left = cur.zip(prev).drop_while { |c, p| c == p }
    Array.new(cur.size - left.size) + left.map(&:first)
  end
  [tuples.first] + sparse
end
369
+
370
# Converts the index into a DataFrame with one column per level, keyed by
# the level names. Calls @name.zip, so names must have been set.
def to_df
  columns = to_a.transpose
  named_columns = Hash[@name.zip(columns)]
  DaruLite::DataFrame.new(named_columns)
end
373
+ end
374
+ end
@@ -0,0 +1,21 @@
1
module DaruLite
  module IO
    module CSV
      # Extra CSV field converters, keyed by name. Each takes the field and
      # a second argument that is ignored.
      #
      # * :boolean - 'true' / 'false' (case-insensitive, surrounding
      #   whitespace ignored) become booleans; other fields pass through
      #   unchanged
      # * :string  - returns the field untouched
      CONVERTERS = {
        boolean: lambda { |field, _info|
          normalized = field.downcase.strip
          if normalized == 'true'
            true
          elsif normalized == 'false'
            false
          else
            field
          end
        },
        string: ->(field, _info) { field }
      }.freeze
    end
  end
end
@@ -0,0 +1,294 @@
1
+ module DaruLite
2
+ require_relative 'csv/converters'
3
# Low-level helpers shared by the IO readers.
module IOHelpers
  class << self
    # Converts the cells of a row: cells listed in `empty` become nil,
    # numeric-looking strings become Integer/Float, the rest is untouched.
    #
    # @param row [#to_a] the raw cells
    # @param empty [Array] cell values to be treated as missing
    # @return [Array]
    def process_row(row, empty)
      row.to_a.map do |c|
        if empty.include?(c)
          # FIXME: As far as I can guess, this branch never runs. The
          # method is only called from `from_plaintext`, where the line is
          # split on `\s+`, so an "empty" value (currently just '') cannot
          # appear between the data cells.
          nil
        else
          try_string_to_number(c)
        end
      end
    end

    # Opens an http(s) URL or a local file.
    #
    # Strings that are not valid URIs (for instance local paths containing
    # spaces) are opened directly as local files instead of failing with
    # URI::InvalidURIError.
    #
    # NOTE(review): `uri.open` is provided by open-uri, which is not
    # required in this file -- presumably it is loaded elsewhere; confirm.
    #
    # @param path [String] URL or local path
    # @return an open IO-like object; the caller is responsible for closing it
    def open_local_or_remote_file(path)
      uri = begin
        URI.parse(path)
      rescue URI::InvalidURIError
        nil
      end
      return File.open(path) if uri.nil?

      # URI::HTTPS is a subclass of URI::HTTP, so one check covers both.
      uri.is_a?(URI::HTTP) ? uri.open : File.open(uri.path)
    end

    private

    INT_PATTERN = /^[-+]?\d+$/.freeze
    FLOAT_PATTERN = /^[-+]?\d+[,.]?\d*(e-?\d+)?$/.freeze

    # Converts integer- and float-looking strings (a comma is accepted as
    # decimal separator); anything else is returned unchanged.
    def try_string_to_number(s)
      case s
      when INT_PATTERN
        s.to_i
      when FLOAT_PATTERN
        s.tr(',', '.').to_f
      else
        s
      end
    end
  end
end
41
+
42
+ module IO
43
+ class << self
44
+ # Functions for loading/writing Excel files.
45
+
46
# Loads an Excel worksheet into a DataFrame, one vector per header.
#
# @param path [String] path of the spreadsheet
# @param opts [Hash] :worksheet_id and :row_id (both default to 0) are
#   forwarded to #read_from_excel
# @return [DaruLite::DataFrame]
def from_excel(path, opts = {})
  settings = { worksheet_id: 0, row_id: 0 }.merge(opts)
  worksheet, headers = read_from_excel(path, settings)

  headers.each_with_index.with_object(DaruLite::DataFrame.new({})) do |(header, position), frame|
    values = worksheet.column(position).to_a
    values.shift # the first cell of every column is dropped (header row)
    frame[header] = values
  end
end
62
+
63
# Opens a workbook and extracts the requested worksheet and its headers.
# Repeated header names are recoded by ArrayHelper.recode_repeated and
# all headers are converted to symbols.
#
# @param opts [Hash] must provide :worksheet_id and :row_id
# @return [Array] the worksheet and the array of header symbols
def read_from_excel(path, opts)
  optional_gem 'spreadsheet', '~>1.3.0'

  worksheet = Spreadsheet.open(path).worksheet(opts[:worksheet_id])
  header_row = worksheet.row(opts[:row_id])
  headers = ArrayHelper.recode_repeated(header_row).map(&:to_sym)

  [worksheet, headers]
end
74
+
75
# Writes a DataFrame to an Excel file: a bold blue header row with the
# vector names, followed by one row per DataFrame row.
#
# The spreadsheet gem is loaded here as well (as #read_from_excel does),
# so writing works even when nothing has been read from Excel before.
#
# @param dataframe the frame to export (uses #vectors and #each_row)
# @param path [String] destination file
# @param _opts [Hash] unused
def dataframe_write_excel(dataframe, path, _opts = {})
  optional_gem 'spreadsheet', '~>1.3.0'

  book = Spreadsheet::Workbook.new
  sheet = book.create_worksheet
  format = Spreadsheet::Format.new color: :blue, weight: :bold

  sheet.row(0).concat(dataframe.vectors.to_a.map(&:to_s)) # Unfreeze strings
  sheet.row(0).default_format = format
  i = 1
  dataframe.each_row do |row|
    sheet.row(i).concat(row.to_a)
    i += 1
  end

  book.write(path)
end
90
+
91
+ # Functions for loading/writing CSV files
92
# Loads a CSV file (local path or http(s) URL) into a DataFrame.
#
# When the :headers option is not given, the first row supplies the
# vector names (repetitions are detected and corrected) and their order
# is passed on as the DataFrame's :order.
#
# @param path [String]
# @param opts [Hash] CSV options plus the DataFrame options listed in
#   DARU_OPT_KEYS
def from_csv(path, opts = {})
  daru_options, csv_options = from_csv_prepare_opts(opts)

  if csv_options[:headers]
    columns = from_csv_hash_with_headers(path, csv_options)
  else
    columns = from_csv_hash(path, csv_options)
    daru_options[:order] = columns.keys
  end

  DaruLite::DataFrame.new(columns, daru_options)
end
105
+
106
# Writes a DataFrame to a CSV file.
#
# @param dataframe the frame to export (uses #vectors and #each_row)
# @param path [String] destination file
# @param opts [Hash] options for ::CSV.open, plus:
#   * :headers - pass false to skip the header row
#   * :convert_comma - when truthy, every value is written as a string
#     with '.' replaced by ','
# @return [nil]
def dataframe_write_csv(dataframe, path, opts = {})
  options = {
    converters: :numeric
  }.merge(opts)
  # :convert_comma is our own flag, not a CSV option; CSV rejects unknown
  # keywords, so it must not be forwarded.
  convert_comma = options.delete(:convert_comma)

  # The block form closes the file even when writing a row raises.
  ::CSV.open(path, 'w', **options) do |writer|
    writer << dataframe.vectors.to_a unless options[:headers] == false

    dataframe.each_row do |row|
      writer << if convert_comma
                  row.map { |v| v.to_s.tr('.', ',') }
                else
                  row.to_a
                end
    end
  end

  nil
end
124
+
125
+ # Execute a query and create a data frame from the result
126
+ #
127
+ # @param db [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
128
+ # @param query [String] The query to be executed
129
+ #
130
+ # @return A dataframe containing the data resulting from the query
131
# Runs a query and builds a DataFrame from its result. The SQL data
# source is required lazily, on first use.
#
# @param db a database connection or a path to a SQLite3 database
# @param query [String] the query to execute
def from_sql(db, query)
  require 'daru_lite/io/sql_data_source'

  SqlDataSource.make_dataframe(db, query)
end
135
+
136
# Inserts every row of a DataFrame into a database table using a prepared
# INSERT statement with one placeholder per vector.
#
# NOTE(review): the table and column names are interpolated into the SQL
# text as-is; only pass trusted identifiers.
#
# @param ds the frame to export (uses #vectors and #each_row)
# @param dbh a database handle responding to #prepare
# @param table [String] destination table name
# @return [true]
def dataframe_write_sql(ds, dbh, table)
  require 'dbi'

  columns = ds.vectors.to_a.join(',')
  placeholders = Array.new(ds.vectors.size, '?').join(',')
  statement = dbh.prepare("INSERT INTO #{table} (#{columns}) VALUES (#{placeholders})")
  ds.each_row { |row| statement.execute(*row.to_a) }
  true
end
143
+
144
+ # Load dataframe from AR::Relation
145
+ #
146
+ # @param relation [ActiveRecord::Relation] A relation to be used to load the contents of dataframe
147
+ #
148
+ # @return A dataframe containing the data in the given relation
149
# Loads a DataFrame from a relation.
#
# @param relation a relation responding to #klass and #pluck
# @param fields [Array] columns to load; defaults to all column names of
#   the relation's class
# @return [DaruLite::DataFrame] ordered by the (symbolized) fields
def from_activerecord(relation, *fields)
  column_names = fields.empty? ? relation.klass.column_names : fields
  column_names = column_names.map(&:to_sym)

  # pluck returns rows; transpose turns them into one array per column
  columns = relation.pluck(*column_names).transpose
  frame = DaruLite::DataFrame.new(columns, order: column_names)
  frame.update
  frame
end
156
+
157
+ # Loading data from plain text files
158
+
159
# Loads a whitespace-separated plain text file into a DataFrame.
#
# Every line is split on whitespace and converted by
# IOHelpers.process_row; a line consisting only of the \x1A character is
# skipped. The file is opened in block form so that it is always closed.
#
# @param filename [String] file to read
# @param fields [Array] vector names, in column order
# @return [DaruLite::DataFrame]
def from_plaintext(filename, fields)
  ds = DaruLite::DataFrame.new({}, order: fields)
  File.open(filename, 'r') do |fp|
    fp.each_line do |line|
      row = DaruLite::IOHelpers.process_row(line.strip.split(/\s+/), [''])
      next if row == ["\x1A"]

      ds.add_row(row)
    end
  end
  ds.update
  fields.each { |f| ds[f].rename f }
  ds
end
172
+
173
+ # Loading and writing Marshalled DataFrame/Vector
174
# Serializes an object (DataFrame/Vector) to a file with Marshal.
#
# The file is opened in binary mode, because Marshal output is binary
# data, and in block form so that it is closed even if dumping fails.
#
# @param klass the object to dump
# @param filename [String] destination file
# @return [nil]
def save(klass, filename)
  File.open(filename, 'wb') { |fp| Marshal.dump(klass, fp) }
  nil
end
179
+
180
# Loads an object previously written by #save.
#
# The file is read in binary mode, because Marshal data is binary.
#
# SECURITY: Marshal.load can instantiate arbitrary objects; only load
# files from trusted sources.
#
# @param filename [String] file to read
# @return the unmarshalled object, or false when the file does not exist
def load(filename)
  return false unless File.exist?(filename)

  File.open(filename, 'rb') { |fp| Marshal.load(fp) }
end
189
+
190
+ private
191
+
192
# Activates and requires an optional dependency. When it cannot be
# loaded, the problem is reported through DaruLite.error instead of
# letting the LoadError propagate.
#
# @param name [String] gem (and feature) name
# @param version [String] version requirement
def optional_gem(name, version)
  begin
    gem(name, version)
    require(name)
  rescue LoadError
    DaruLite.error "\nInstall the #{name} gem version #{version} for using #{name} functions."
  end
end
198
+
199
# Options that belong to the DataFrame constructor rather than to CSV.
DARU_OPT_KEYS = %i[clone order index name].freeze

# Applies the CSV defaults and splits the options into DataFrame options
# and CSV options.
#
# Defaults: col_sep ',', skip_blanks true, converters [:numeric]. An
# explicit `skip_blanks: false` is respected. The caller's hash is left
# untouched; all changes are made on a copy.
#
# @param opts [Hash] user options given to from_csv
# @return [Array<Hash>] the DataFrame options and the CSV options
def from_csv_prepare_opts(opts)
  csv_options = opts.dup
  csv_options[:col_sep] ||= ','
  # `||=` would turn an explicit false back into true
  csv_options[:skip_blanks] = true if csv_options[:skip_blanks].nil?
  csv_options[:converters] ||= [:numeric]

  csv_options[:converters] = from_csv_prepare_converters(csv_options[:converters])

  daru_options, csv_options =
    csv_options.partition { |k, _| DARU_OPT_KEYS.include?(k) }.map(&:to_h)
  [daru_options, csv_options]
end
213
+
214
# Resolves converter names: the standard ::CSV::Converters are looked up
# first, then DaruLite's own CONVERTERS; anything unknown (e.g. a lambda)
# is kept as is. Entries that resolve to arrays are flattened one level.
#
# @param converters a converter or an array of converters
# @return [Array]
def from_csv_prepare_converters(converters)
  Array(converters).flat_map do |converter|
    ::CSV::Converters[converter] || DaruLite::IO::CSV::CONVERTERS[converter] || converter
  end
end
225
+
226
# Parses a CSV source using its own header handling and returns the
# columns as a Hash of header => values. Headers are symbolized unless
# :header_converters is given. When a block is passed, the parsed table
# is yielded to it before conversion.
def from_csv_hash_with_headers(path, opts)
  opts[:header_converters] ||= :symbol

  source = DaruLite::IOHelpers.open_local_or_remote_file(path)
  table = ::CSV.parse(source, **opts)
  yield table if block_given?

  table.by_col.to_h { |col_name, values| [col_name, values] }
end
233
+
234
# Parses a CSV source without CSV header handling: the first row becomes
# the keys (repetitions recoded by ArrayHelper.recode_repeated) and the
# remaining rows are transposed into one array per column. When a block
# is passed, the parsed rows are yielded to it first.
#
# @return [Hash] header => column values
def from_csv_hash(path, opts)
  source = DaruLite::IOHelpers.open_local_or_remote_file(path)
  parsed = ::CSV.parse(source, **opts)
  yield parsed if block_given?

  rows = parsed.to_a
  headers = ArrayHelper.recode_repeated(rows.shift)
  columns = rows.transpose
  headers.each_with_index.to_h { |header, position| [header, columns[position]] }
end
244
+
245
# Extracts data, index and order from an HTML table node.
#
# Data rows that do not have the maximum number of <td> cells are
# discarded. Headers are only interpreted when the widest <th> row is at
# least as wide as the data.
#
# @return [Hash, nil] { data:, index:, order: }, or nil when no usable
#   order was found or the index length does not match the data
def html_parse_table(table)
  headers, headers_size = html_scrape_tag(table, 'th')
  data, size = html_scrape_tag(table, 'td')
  data = data.keep_if { |cells| cells.count == size }

  order, indice =
    if headers_size >= size
      html_parse_hash(headers, size, headers_size)
    end

  index_fits = indice.nil? || indice.count == data.count
  return unless index_fits && !order.nil? && order.count.positive?

  { data: data.compact, index: indice, order: order }
end
254
+
255
# Collects, for every <tr> of the table, the stripped text of the cells
# with the given tag.
#
# @return [Array] the per-row cell texts and the largest row width
def html_scrape_tag(table, tag)
  rows = table.search('tr').map do |row|
    row.search(tag).map { |cell| cell.text.strip }
  end
  widest = rows.map(&:count).max
  [rows, widest]
end
260
+
261
+ # Splits headers (all th tags) into order and index. Wherein,
262
+ # Order : All <th> tags on first proper row of HTML table
263
+ # index : All <th> tags on first proper column of HTML table
264
# Splits the scraped <th> texts into order and index.
#
# The first row as wide as headers_size is the header row; its last
# `size` cells become the order. All <th> cells of the rows after it are
# flattened into the index (nil when there are none).
#
# @return [Array] order and index
def html_parse_hash(headers, size, headers_size)
  header_row_at = headers.find_index { |row| row.count == headers_size }
  header_row = headers[header_row_at]
  order = header_row[(header_row.count - size)..]

  indice = headers[(header_row_at + 1)..].flatten
  indice = nil if indice.to_a.empty?
  [order, indice]
end
273
+
274
# Tells whether the table's text contains `match`; every table matches
# when no match string is given.
def html_search(table, match = nil)
  return true if match.nil?

  table.to_s.include?(match)
end
277
+
278
+ # Allows user to override the scraped order / index / data
279
# Lets user-supplied values take precedence over the scraped ones: for
# :data, :index, :name and :order, the scraped value is used only where
# the user's value is missing or falsy. user_val is updated in place and
# returned.
def html_decide_values(scraped_val = {}, user_val = {})
  %I[data index name order].each_with_object(user_val) do |key, decided|
    decided[key] = scraped_val[key] unless decided[key]
  end
end
285
+
286
# Builds a DataFrame from a parsed table Hash (:data rows plus :index,
# :order and :name).
def html_table_to_dataframe(table)
  data, index, order, name = table.values_at(:data, :index, :order, :name)
  DaruLite::DataFrame.rows(data, index: index, order: order, name: name)
end
292
+ end
293
+ end
294
+ end