daru_lite 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,34 @@
1
module DaruLite
  # Display settings for DaruLite together with the helpers used to change
  # and restore them. Meant to be mixed in with +extend+.
  module Configuration
    # Names of the options; each one is exposed through a reader and a writer.
    # In the upstream annotations +max_rows+ is tagged "Jupyter" and +spacing+
    # is tagged "Terminal".
    INSPECT_OPTIONS_KEYS = %i[max_rows spacing].freeze

    # Value +max_rows+ is reset to (Jupyter).
    DEFAULT_MAX_ROWS = 30

    # Value +spacing+ is reset to (Terminal).
    DEFAULT_SPACING = 10

    class << self
      # Hook invoked when something extends Configuration: the extending
      # object immediately gets every option set to its default.
      def extended(base)
        base.reset_options
      end
    end

    INSPECT_OPTIONS_KEYS.each { |option| attr_accessor option }

    # Yields the receiver so options can be assigned inside a block.
    #
    # @example
    #   DaruLite.configure { |config| config.max_rows = 100 }
    def configure
      yield(self)
    end

    # Puts every option back to its default value.
    def reset_options
      self.max_rows = DEFAULT_MAX_ROWS
      self.spacing = DEFAULT_SPACING
    end
  end

  extend Configuration
end
@@ -0,0 +1,403 @@
1
module DaruLite
  module Core
    # Result of grouping the rows of a dataframe by the values found in one or
    # more of its vectors. Internally the groups are kept as a Hash mapping each
    # group key (an Array of values, one per grouping vector) to the row
    # *positions* belonging to it; index *labels* are derived lazily from that.
    class GroupBy
      class << self
        extend Gem::Deprecate

        # @private
        # Builds a Hash that maps every distinct index to the list of positions
        # it was paired with, preserving first-seen order unless +sort+ is set.
        #
        # @param indexes_with_positions [#each] yields [index, position] pairs
        # @param sort [Boolean] order the keys with TUPLE_SORTER when true
        # @return [Hash] index => Array of positions
        def group_by_index_to_positions(indexes_with_positions, sort: false)
          index_to_positions = {}

          indexes_with_positions.each do |idx, position|
            (index_to_positions[idx] ||= []) << position
          end

          if sort # TODO: maybe add a more "stable" sorting option?
            sorted_keys = index_to_positions.keys.sort(&DaruLite::Core::GroupBy::TUPLE_SORTER)
            index_to_positions = sorted_keys.to_h { |k| [k, index_to_positions[k]] }
          end

          index_to_positions
        end
        alias get_positions_group_map_on group_by_index_to_positions
        deprecate :get_positions_group_map_on, :group_by_index_to_positions, 2019, 10

        # @private
        # Groups the positions of a multi index after removing one of its layers.
        #
        # @param multi_index [DaruLite::MultiIndex]
        # @param level [Integer] layer handed to MultiIndex#remove_layer
        # @raise [RuntimeError] when +multi_index+ is not a DaruLite::MultiIndex
        def get_positions_group_for_aggregation(multi_index, level = -1)
          raise 'multi_index must be a DaruLite::MultiIndex' unless multi_index.is_a?(DaruLite::MultiIndex)

          new_index = multi_index.dup
          new_index.remove_layer(level) # TODO: recheck code of DaruLite::MultiIndex#remove_layer

          group_by_index_to_positions(new_index.each_with_index)
        end

        # @private
        # Groups the row positions of +df+ by the values its rows hold in the
        # +group_by_keys+ vectors.
        def get_positions_group_map_for_df(df, group_by_keys, sort: true)
          indexes_with_positions = df[*group_by_keys].to_df.each_row.map(&:to_a).each_with_index

          group_by_index_to_positions(indexes_with_positions, sort: sort)
        end

        # @private
        # Converts a group => positions map into a group => index labels map by
        # looking every position up with +index.at+.
        def group_map_from_positions_to_indexes(positions_group_map, index)
          positions_group_map.transform_values { |positions| positions.map { |pos| index.at(pos) } }
        end

        # @private
        # Builds the "grouped" dataframe: the +remaining_vectors+ of +df+ with
        # rows reordered group by group and a multi index made of the group key
        # followed by each value of the group map.
        #
        # @return [DaruLite::DataFrame, nil] nil when there are no groups
        def df_from_group_map(df, group_map, remaining_vectors, from_position: true)
          return nil if group_map == {}

          new_index = group_map.flat_map { |group, values| values.map { |val| group + [val] } }
          new_index = DaruLite::MultiIndex.from_tuples(new_index)

          return DaruLite::DataFrame.new({}, index: new_index) if remaining_vectors == []

          new_rows_order = group_map.values.flatten
          new_df = df[*remaining_vectors].to_df.get_sub_dataframe(new_rows_order, by_position: from_position)
          new_df.index = new_index

          new_df
        end
      end

      # The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors
      attr_reader :group_vectors, :non_group_vectors

      # Lazily computed Hash of group key => index labels of the rows in the group.
      def groups
        @groups ||= GroupBy.group_map_from_positions_to_indexes(@groups_by_pos, @context.index)
      end
      alias groups_by_idx groups

      # Lazily computed grouped dataframe (see GroupBy.df_from_group_map).
      def df
        @df ||= GroupBy.df_from_group_map(@context, @groups_by_pos, @non_group_vectors)
      end
      alias grouped_df df

      # Iterate over each group created by group_by. A DataFrame is yielded in
      # block. Returns an Enumerator when no block is given.
      def each_group
        return to_enum(:each_group) unless block_given?

        groups.each_key do |k|
          yield get_group(k)
        end
      end

      # Comparator used to order group keys. A missing (nil/false) key sorts
      # after a present one; nil elements are dropped from both keys before
      # comparing; keys of equal length are compared with Array#<=> (treated as
      # equal when incomparable); otherwise the shorter key sorts first.
      TUPLE_SORTER = lambda do |left, right|
        return -1 unless right
        return 1 unless left

        left = left.compact
        right = right.compact
        return left <=> right || 0 if left.length == right.length

        left.length <=> right.length
      end

      # @param context [DaruLite::DataFrame] dataframe being grouped
      # @param names [Array] names of the vectors to group by
      def initialize(context, names)
        @group_vectors = names
        @non_group_vectors = context.vectors.to_a - names

        @context = context # TODO: maybe rename in @original_df

        # FIXME: It feels like we don't want to sort here. Ruby's #group_by
        # never sorts:
        #
        #   ['test', 'me', 'please'].group_by(&:size)
        #   # => {4=>["test"], 2=>["me"], 6=>["please"]}
        #
        # - zverok, 2016-09-12
        @groups_by_pos = GroupBy.get_positions_group_map_for_df(@context, @group_vectors, sort: true)
      end

      # Get a DaruLite::Vector of the size of each group.
      def size
        index = get_grouped_index

        values = @groups_by_pos.values.map(&:size)
        DaruLite::Vector.new(values, index: index, name: :size)
      end

      # Get the first row of every group (same as head(1)).
      def first
        head(1)
      end

      # Get the last row of every group (same as tail(1)).
      def last
        tail(1)
      end

      # Get the first 'n' rows of every group.
      # @param quantity [Integer] (5) The number of rows taken from each group.
      # @example Usage of head
      #   df = DaruLite::DataFrame.new({
      #     a: %w{foo bar foo bar foo bar foo foo},
      #     b: %w{one one two three two two one three},
      #     c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
      #     d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
      #   })
      #   df.group_by([:a, :b]).head(1)
      #   # =>
      #   # #<DaruLite::DataFrame:82745170 @name = d7003f75-5eb9-4967-9303-c08dd9160224 @size = 6>
      #   #             a          b          c          d
      #   #  1        bar        one          2         22
      #   #  3        bar      three          1         44
      #   #  5        bar        two          6         66
      #   #  0        foo        one          1         11
      #   #  7        foo      three          8         88
      #   #  2        foo        two          3         33
      def head(quantity = 5)
        select_groups_from :first, quantity
      end

      # Get the last 'n' rows of every group.
      # @param quantity [Integer] (5) The number of rows taken from each group.
      # @example Usage of tail
      #   df = DaruLite::DataFrame.new({
      #     a: %w{foo bar foo bar foo bar foo foo},
      #     b: %w{one one two three two two one three},
      #     c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
      #     d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
      #   })
      #   df.group_by([:a, :b]).tail(1)
      #   # =>
      #   # #<DaruLite::DataFrame:82378270 @name = 0623db46-5425-41bd-a843-99baac3d1d9a @size = 6>
      #   #             a          b          c          d
      #   #  1        bar        one          2         22
      #   #  3        bar      three          1         44
      #   #  5        bar        two          6         66
      #   #  6        foo        one          3         77
      #   #  7        foo      three          8         88
      #   #  4        foo        two          3         55
      def tail(quantity = 5)
        select_groups_from :last, quantity
      end

      # Calculate mean of numeric groups, excluding missing values.
      # @example Usage of mean
      #   df = DaruLite::DataFrame.new({
      #     a: %w{foo bar foo bar foo bar foo foo},
      #     b: %w{one one two three two two one three},
      #     c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
      #     d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
      #   })
      #   df.group_by([:a, :b]).mean
      #   # =>
      #   # #<DaruLite::DataFrame:81097450 @name = 0c32983f-3e06-451f-a9c9-051cadfe7371 @size = 6>
      #   #                         c          d
      #   # ["bar", "one"]          2         22
      #   # ["bar", "three"]        1         44
      #   # ["bar", "two"]          6         66
      #   # ["foo", "one"]        2.0       44.0
      #   # ["foo", "three"]        8         88
      #   # ["foo", "two"]        3.0       44.0
      def mean
        apply_method :numeric, :mean
      end

      # Calculate the median of numeric groups, excluding missing values.
      def median
        apply_method :numeric, :median
      end

      # Calculate sum of numeric groups, excluding missing values.
      def sum
        apply_method :numeric, :sum
      end

      # Number of rows in each group, reported once per non-group vector.
      # The counts come from #size, i.e. they are the group sizes themselves.
      # @example Using count
      #   df = DaruLite::DataFrame.new({
      #     a: %w{foo bar foo bar foo bar foo foo},
      #     b: %w{one one two three two two one three},
      #     c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
      #     d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
      #   })
      #   df.group_by([:a, :b]).count
      #   # =>
      #   # #<DaruLite::DataFrame:76900210 @name = 7b9cf55d-17f8-48c7-b03a-2586c6e5ec5a @size = 6>
      #   #                           c          d
      #   # ["bar", "one"]            1          1
      #   # ["bar", "three"]          1          1
      #   # ["bar", "two"]            1          1
      #   # ["foo", "one"]            2          2
      #   # ["foo", "three"]          1          1
      #   # ["foo", "two"]            2          2
      def count
        width = @non_group_vectors.size
        DaruLite::DataFrame.new([size] * width, order: @non_group_vectors)
      end

      # Calculate sample standard deviation of numeric vector groups, excluding
      # missing values.
      def std
        apply_method :numeric, :std
      end

      # Find the max element of each numeric vector group.
      def max
        apply_method :numeric, :max
      end

      # Find the min element of each numeric vector group.
      def min
        apply_method :numeric, :min
      end

      # Returns one of the selected groups as a DataFrame.
      # @param group [Array] The group that is to be selected from those grouped.
      #
      # @example Getting a group
      #
      #   df = DaruLite::DataFrame.new({
      #     a: %w{foo bar foo bar foo bar foo foo},
      #     b: %w{one one two three two two one three},
      #     c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
      #     d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
      #   })
      #   df.group_by([:a, :b]).get_group ['bar','two']
      #   #=>
      #   ##<DaruLite::DataFrame:83258980 @name = 687ee3f6-8874-4899-97fa-9b31d84fa1d5 @size = 1>
      #   #                    a          b          c          d
      #   #         5        bar        two          6         66
      def get_group(group)
        # Rows are fetched by *position* (the transposed data is a plain Array),
        # while the resulting dataframe is indexed with the matching *labels*.
        # Using labels for the lookup only works for a default 0..n-1 index.
        positions = @groups_by_pos[group]
        labels = groups_by_idx[group]
        row_values = @context.each_vector.map(&:to_a).transpose
        rows = positions.map { |pos| row_values[pos] }

        DaruLite::DataFrame.rows(
          rows, index: labels, order: @context.vectors
        )
      end

      # Iteratively applies a function to the values in a group and accumulates the result.
      # @param init (nil) The initial value of the accumulator.
      # @yieldparam block [Proc] A proc or lambda that accepts two arguments.  The first argument
      #                          is the accumulated result.  The second argument is a DataFrame row.
      # @example Usage of reduce
      #   df = DaruLite::DataFrame.new({
      #     a: ['a','b'] * 3,
      #     b: [1,2,3] * 2,
      #     c: 'A'..'F'
      #   })
      #   df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
      #   # =>
      #   # #<DaruLite::Vector:70343147159900 @name = nil @size = 2 >
      #   #     nil
      #   #   a ACE
      #   #   b BDF
      def reduce(init = nil)
        # groups_by_idx already holds index labels, which is what the row
        # accessor expects; they must not be looked up in the index a second time.
        result_hash = groups_by_idx.each_with_object({}) do |(group, labels), h|
          grouped_result = init
          labels.each do |label|
            grouped_result = yield(grouped_result, @context.row[label])
          end

          h[group] = grouped_result
        end

        index = get_grouped_index(result_hash.keys)

        DaruLite::Vector.new(result_hash.values, index: index)
      end

      # Inspects the grouped dataframe (see #df).
      def inspect
        grouped_df.inspect
      end

      # Function to use for aggregating the data.
      # `group_by` is using DaruLite::DataFrame#aggregate
      #
      # @param options [Hash] options for column, you want in resultant dataframe
      #
      # @return [DaruLite::DataFrame]
      #
      # @example
      #
      #   df = DaruLite::DataFrame.new(
      #     name: ['Ram','Krishna','Ram','Krishna','Krishna'],
      #     visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore'])
      #
      #   => #<DaruLite::DataFrame(5x2)>
      #                 name   visited
      #          0       Ram Hyderabad
      #          1   Krishna     Delhi
      #          2       Ram    Mumbai
      #          3   Krishna    Raipur
      #          4   Krishna  Banglore
      #
      #   df.group_by(:name)
      #   => #<DaruLite::DataFrame(5x1)>
      #                          visited
      #       Krishna         1     Delhi
      #                       3    Raipur
      #                       4  Banglore
      #           Ram         0 Hyderabad
      #                       2    Mumbai
      #
      #   df.group_by(:name).aggregate(visited: -> (vec){vec.to_a.join(',')})
      #   => #<DaruLite::DataFrame(2x1)>
      #                  visited
      #       Krishna Delhi,Raipur,Banglore
      #           Ram Hyderabad,Mumbai
      #
      def aggregate(options = {})
        new_index = get_grouped_index

        @context.aggregate(options) { [@groups_by_pos.values, new_index] }
      end

      private

      # Builds a dataframe out of the first/last +quantity+ rows of every group.
      #
      # @param method [Symbol] :first or :last, sent to each group's label list
      # @param quantity [Integer] number of rows taken from each group
      def select_groups_from(method, quantity)
        selection = @context
        rows = []
        indexes = []

        groups_by_idx.each_value do |index|
          index.public_send(method, quantity).each do |idx|
            rows << selection.row[idx].to_a
            indexes << idx
          end
        end
        # NOTE(review): a no-op for scalar labels; Array-valued labels would be
        # flattened here - confirm whether that case can occur.
        indexes.flatten!

        DaruLite::DataFrame.rows(rows, order: @context.vectors, index: indexes)
      end

      # Names of the non-group vectors whose type is :numeric.
      def select_numeric_non_group_vectors
        @non_group_vectors.select { |ngvec| @context[ngvec].type == :numeric }
      end

      # Aggregates every numeric non-group vector with +method+.
      #
      # @raise [RuntimeError] for any +method_type+ other than :numeric
      def apply_method(method_type, method)
        raise 'To implement' if method_type != :numeric

        aggregation_options = select_numeric_non_group_vectors.to_h { |k| [k, method] }

        aggregate(aggregation_options)
      end

      # Index for per-group results: a MultiIndex when grouping on several
      # vectors, a flat Index otherwise. Defaults to the keys of all groups.
      def get_grouped_index(index_tuples = nil)
        index_tuples = @groups_by_pos.keys if index_tuples.nil?

        if multi_indexed_grouping?
          DaruLite::MultiIndex.from_tuples(index_tuples)
        else
          DaruLite::Index.new(index_tuples.flatten)
        end
      end

      # True when the first group key has more than one element; false when
      # there are no groups.
      def multi_indexed_grouping?
        return false unless @groups_by_pos.keys[0]

        @groups_by_pos.keys[0].size > 1
      end
    end
  end
end
@@ -0,0 +1,270 @@
1
module DaruLite
  module Core
    # Joins two dataframes on a list of shared vectors. Both dataframes are
    # turned into arrays of row hashes sorted by the join keys, and the two
    # arrays are then consumed from the front, one result row per step.
    class MergeFrame
      # Placeholder substituted for nil values in merge keys. It reports itself
      # as nil, is never == to anything (so rows with nil keys never match each
      # other), and orders before any non-nil value.
      class NilSorter
        include Comparable

        def nil?
          true
        end

        def ==(_other)
          false
        end

        def <=>(other)
          other.nil? ? 0 : -1
        end
      end

      # quick-fix for issue #171
      #
      # @param left_df [DaruLite::DataFrame]
      # @param right_df [DaruLite::DataFrame]
      # @param opts [Hash] :on (Array of vector names), :how (one of the keys of
      #   LEFT_RIGHT_COMBINATIONS) and optionally :indicator (name of a vector
      #   recording where each row came from)
      # @raise [ArgumentError] on an unknown :how or a key missing from either side
      def initialize(left_df, right_df, opts = {})
        init_opts(opts)
        validate_on!(left_df, right_df)
        key_sanitizer = ->(h) { sanitize_merge_keys(h.values_at(*on)) }

        @left = df_to_a(left_df)
        @left.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
        @left_key_values = @left.map(&key_sanitizer)

        @right = df_to_a(right_df)
        @right.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
        @right_key_values = @right.map(&key_sanitizer)

        @left_keys, @right_keys = merge_keys(left_df, right_df, on)
      end

      # Runs the merge, consuming the internal row lists.
      #
      # @return [DaruLite::DataFrame]
      def join
        res = []

        until left.empty? && right.empty?
          lkey = first_left_key
          rkey = first_right_key

          # row returns nil for an unmatched row that the join type drops.
          row(lkey, rkey).tap { |r| res << r if r }
        end

        DaruLite::DataFrame.new(res, order: dataframe_vector_names)
      end

      private

      attr_reader :on, :indicator,
                  :left, :left_key_values, :keep_left, :left_keys,
                  :right, :right_key_values, :keep_right, :right_keys

      attr_accessor :merge_key

      # Join type => [keep unmatched left rows, keep unmatched right rows]
      LEFT_RIGHT_COMBINATIONS = {
        #       left   right
        inner: [false, false],
        left: [true, false],
        right: [false, true],
        outer: [true, true]
      }.freeze

      # Reads the :on, :how and :indicator options into instance state.
      def init_opts(opts)
        @on = opts[:on]
        @keep_left, @keep_right = extract_left_right(opts[:how])
        @indicator = opts[:indicator]
      end

      # Vector order of the result: left-only vectors, join keys, right-only
      # vectors, then the indicator vector when one was requested.
      def dataframe_vector_names
        left_keys.values + on + right_keys.values + Array(indicator)
      end

      # @raise [ArgumentError] when +how+ is not a known join type
      def extract_left_right(how)
        LEFT_RIGHT_COMBINATIONS.fetch(how) do
          raise ArgumentError, "Unrecognized join option: #{how}"
        end
      end

      # Replaces nil key values with NilSorter instances.
      def sanitize_merge_keys(merge_keys)
        merge_keys.map { |v| v.nil? ? NilSorter.new : v }
      end

      # Converts a dataframe into an Array of { vector name => value } hashes,
      # one per row.
      def df_to_a(df)
        # FIXME: this is much faster than the "native" DataFrame#to_a, which it
        # should not be.
        h = df.to_h
        keys = h.keys
        h.values.map(&:to_a).transpose.map { |r| keys.zip(r).to_h }
      end

      # Computes, for each side, a Hash of original vector name => name in the
      # result. Non-key vectors present on both sides get a _1 / _2 suffix.
      def merge_keys(df1, df2, on)
        duplicates =
          (df1.vectors.to_a + df2.vectors.to_a - on)
          .group_by(&:itself)
          .select { |_, g| g.count == 2 }
          .keys

        [
          guard_keys(df1.vectors.to_a - on, duplicates, 1),
          guard_keys(df2.vectors.to_a - on, duplicates, 2)
        ]
      end

      def guard_keys(keys, duplicates, num)
        keys.to_h { |v| [v, guard_duplicate(v, duplicates, num)] }
      end

      def guard_duplicate(val, duplicates, num)
        duplicates.include?(val) ? :"#{val}_#{num}" : val
      end

      # Produces the next result row (or nil) from the heads of both row lists
      # and advances whichever list(s) that row consumed.
      def row(lkey, rkey)
        # :nocov:
        # It's just an impossibility handler, can't be covered :)
        raise 'Unexpected condition met during merge' if !lkey && !rkey

        # :nocov:
        if lkey == rkey
          self.merge_key = lkey
          add_indicator(merge_matching_rows, :both)
        elsif !rkey || lt(lkey, rkey)
          add_indicator(left_row_missing_right, :left_only)
        else # !lkey || lt(rkey, lkey)
          add_indicator(right_row_missing_left, :right_only)
        end
      end

      # Stores +indicator_value+ under the indicator vector; a no-op when no
      # indicator was requested. +row+ may be nil in that case.
      def add_indicator(row, indicator_value)
        return row unless indicator

        row[indicator] = indicator_value
        row
      end

      # Merges the rows at the current merge key, distinguishing the cases where
      # neither side, exactly one side, or both sides repeat the key.
      def merge_matching_rows
        if one_to_one_merge?
          merge_rows(one_to_one_left_row, one_to_one_right_row)
        elsif one_to_many_merge?
          result = merge_rows(left.first, right.first)
          one_to_many_shift
          result
        else
          result = cartesian_product.shift
          end_cartesian_product if cartesian_product.empty?
          result
        end
      end

      # Advances the side(s) whose *other* side has no further row with the
      # current key.
      def one_to_many_shift
        shift_left = first_right_key != next_right_key
        shift_right = first_left_key != next_left_key
        one_to_one_left_row if shift_left
        one_to_one_right_row if shift_right
      end

      def one_to_one_merge?
        merge_key != next_left_key && merge_key != next_right_key
      end

      def one_to_many_merge?
        !(merge_key == next_left_key && merge_key == next_right_key)
      end

      # Removes and returns the first left row (and drops its key).
      def one_to_one_left_row
        left_key_values.shift
        left.shift
      end

      # Removes and returns the first right row (and drops its key).
      def one_to_one_right_row
        right_key_values.shift
        right.shift
      end

      # Consumes an unmatched left row; returns it only when the join keeps them.
      def left_row_missing_right
        val = one_to_one_left_row
        expand_row(val, left_keys) if keep_left
      end

      # Consumes an unmatched right row; returns it only when the join keeps them.
      def right_row_missing_left
        val = one_to_one_right_row
        expand_row(val, right_keys) if keep_right
      end

      def lt(k1, k2)
        (k1 <=> k2) == -1
      end

      # Result row for a matching pair; key values are taken from the left row.
      def merge_rows(lrow, rrow)
        left_keys
          .to_h { |from, to| [to, lrow[from]] }
          .merge(on.to_h { |col| [col, lrow[col]] })
          .merge(indicator ? { indicator => nil } : {})
          .merge(right_keys.to_h { |from, to| [to, rrow[from]] })
      end

      # Result row for an unmatched +row+ from one side.
      def expand_row(row, renamings)
        renamings
          .to_h { |from, to| [to, row[from]] }
          .merge(on.to_h { |col| [col, row[col]] })
          .merge(indicator ? { indicator => nil } : {})
      end

      # Key of the first remaining right row, nil when there is none.
      def first_right_key
        right_key_values.first
      end

      def next_right_key
        right_key_values[1]
      end

      # Key of the first remaining left row, nil when there is none.
      def first_left_key
        left_key_values.first
      end

      def next_left_key
        left_key_values[1]
      end

      def left_rows_at_merge_key
        left.take_while { |arr| sanitize_merge_keys(arr.values_at(*on)) == merge_key }
      end

      def right_rows_at_merge_key
        right.take_while { |arr| sanitize_merge_keys(arr.values_at(*on)) == merge_key }
      end

      # Memoized list of merged rows for every left/right pair at the current
      # merge key; emptied one row at a time by merge_matching_rows.
      def cartesian_product
        @cartesian_product ||= left_rows_at_merge_key.product(right_rows_at_merge_key).map do |left_row, right_row|
          merge_rows(left_row, right_row)
        end
      end

      # Drops all rows at the current merge key from both sides and resets the
      # memoized product.
      def end_cartesian_product
        left_size = left_rows_at_merge_key.size
        left_key_values.shift(left_size)
        left.shift(left_size)

        right_size = right_rows_at_merge_key.size
        right_key_values.shift(right_size)
        right.shift(right_size)
        @cartesian_product = nil
      end

      # @raise [ArgumentError] when a join key is missing from either dataframe
      def validate_on!(left_df, right_df)
        @on.each do |key|
          next if left_df.has_vector?(key) && right_df.has_vector?(key)

          raise ArgumentError, "Both dataframes expected to have #{key.inspect} field"
        end
      end

      # Compares two key arrays element by element, ordering nil before any
      # other value. Stops at the first column that differs, so later columns
      # are neither compared nor required to be comparable.
      #
      # @return [Integer] -1, 0 or 1
      def safe_compare(left_array, right_array)
        left_array.zip(right_array).each do |l, r|
          cmp =
            if l.nil? && r.nil?
              0
            elsif r.nil?
              1
            elsif l.nil?
              -1
            else
              l <=> r
            end

          return cmp unless cmp.zero?
        end

        0
      end
    end

    # Entry point for joining two dataframes.
    module Merge
      class << self
        # @param df1 [DaruLite::DataFrame] left dataframe
        # @param df2 [DaruLite::DataFrame] right dataframe
        # @param opts [Hash] see MergeFrame#initialize
        # @return [DaruLite::DataFrame]
        def join(df1, df2, opts = {})
          MergeFrame.new(df1, df2, opts).join
        end
      end
    end
  end
end