daru_lite 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,34 @@
1
module DaruLite
  # Defines the constants and methods that hold DaruLite's inspect options.
  # The module is extended into DaruLite at the bottom of this block, so the
  # options are read and written as DaruLite.max_rows / DaruLite.spacing.
  module Configuration
    # Option names exposed as accessors: max_rows, and spacing (terminal).
    INSPECT_OPTIONS_KEYS = %i[max_rows spacing].freeze

    # Jupyter
    DEFAULT_MAX_ROWS = 30

    # Terminal
    DEFAULT_SPACING = 10

    attr_accessor(*INSPECT_OPTIONS_KEYS)

    # Hook run when an object extends this module: loads the defaults.
    def self.extended(base)
      base.reset_options
    end

    # Yields the receiver so options can be assigned inside a block.
    def configure
      yield self
    end

    # Sets every option back to its default value.
    def reset_options
      self.max_rows = DEFAULT_MAX_ROWS
      self.spacing = DEFAULT_SPACING
    end
  end

  extend Configuration
end
@@ -0,0 +1,403 @@
1
+ module DaruLite
2
+ module Core
3
+ class GroupBy
4
+ class << self
5
+ extend Gem::Deprecate
6
+
7
# @private
# Collects, for each distinct key, the positions at which it occurs.
#
# @param indexes_with_positions [Enumerable] yields (key, position) pairs
# @param sort [Boolean] when true, the keys of the result are ordered with
#   DaruLite::Core::GroupBy::TUPLE_SORTER; otherwise first-seen order is kept
# @return [Hash] key => array of positions, in encounter order per key
def group_by_index_to_positions(indexes_with_positions, sort: false)
  positions_by_key = indexes_with_positions.each_with_object({}) do |(key, position), acc|
    (acc[key] ||= []) << position
  end
  return positions_by_key unless sort # TODO: maybe add a more "stable" sorting option?

  ordered_keys = positions_by_key.keys.sort(&DaruLite::Core::GroupBy::TUPLE_SORTER)
  ordered_keys.to_h { |key| [key, positions_by_key[key]] }
end
22
+ alias get_positions_group_map_on group_by_index_to_positions
23
+ deprecate :get_positions_group_map_on, :group_by_index_to_positions, 2019, 10
24
+
25
# @private
# Groups the positions of a multi index by its entries after one layer has
# been removed via MultiIndex#remove_layer.
#
# @param multi_index [DaruLite::MultiIndex] index to regroup (a dup is modified)
# @param level [Integer] layer handed to #remove_layer (default: -1)
# @return [Hash] the result of group_by_index_to_positions for the reduced index
# @raise [RuntimeError] when multi_index is not a DaruLite::MultiIndex
def get_positions_group_for_aggregation(multi_index, level = -1)
  # An argument-less `raise` would re-raise `$!` if this method were called
  # while another exception is being handled, and otherwise produce a
  # message-less "unhandled exception"; raise an explicit RuntimeError instead.
  unless multi_index.is_a?(DaruLite::MultiIndex)
    raise "Expected a DaruLite::MultiIndex, got #{multi_index.class}"
  end

  new_index = multi_index.dup
  # NOTE(review): the return value is discarded, so this relies on
  # #remove_layer modifying new_index in place -- confirm.
  new_index.remove_layer(level) # TODO: recheck code of DaruLite::MultiIndex#remove_layer

  group_by_index_to_positions(new_index.each_with_index)
end
34
+
35
# @private
# Builds the group => positions map for a dataframe: each row of the
# group_by_keys vectors (as an array) is paired with its row position and
# handed to group_by_index_to_positions.
#
# @param df the dataframe being grouped
# @param group_by_keys [Array] names of the vectors to group on
# @param sort [Boolean] forwarded to group_by_index_to_positions
def get_positions_group_map_for_df(df, group_by_keys, sort: true)
  key_rows = df[*group_by_keys].to_df.each_row.map(&:to_a)

  group_by_index_to_positions(key_rows.each_with_index, sort: sort)
end
41
+
42
# @private
# Converts a group => positions map into a group => index-entries map by
# looking every position up with index.at.
#
# @param positions_group_map [Hash] group => array of positions
# @param index object responding to #at(position)
# @return [Hash] group => array of index entries
def group_map_from_positions_to_indexes(positions_group_map, index)
  positions_group_map.each_with_object({}) do |(group, positions), by_label|
    by_label[group] = positions.map { |position| index.at(position) }
  end
end
46
+
47
# @private
# Builds the grouped dataframe: rows are reordered group by group and
# indexed by a MultiIndex whose tuples are the group key plus the row value
# taken from the group map.
#
# @param df the original dataframe
# @param group_map [Hash] group (Array) => array of row references
# @param remaining_vectors [Array] vectors to keep in the result
# @param from_position [Boolean] passed as by_position: to get_sub_dataframe
# @return [DaruLite::DataFrame, nil] nil when group_map is empty
def df_from_group_map(df, group_map, remaining_vectors, from_position: true)
  return nil if group_map == {}

  tuples = []
  group_map.each do |group, values|
    values.each { |val| tuples << (group + [val]) }
  end
  grouped_index = DaruLite::MultiIndex.from_tuples(tuples)

  # Nothing left besides the grouping vectors: return an index-only frame.
  return DaruLite::DataFrame.new({}, index: grouped_index) if remaining_vectors == []

  row_order = group_map.values.flatten
  reordered = df[*remaining_vectors].to_df.get_sub_dataframe(row_order, by_position: from_position)
  reordered.index = grouped_index

  reordered
end
62
+ end
63
+
64
+ # The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors
65
+ attr_reader :group_vectors, :non_group_vectors
66
+
67
+ # lazy accessor/attr_reader for the attribute groups
68
+ def groups
69
+ @groups ||= GroupBy.group_map_from_positions_to_indexes(@groups_by_pos, @context.index)
70
+ end
71
+ alias groups_by_idx groups
72
+
73
+ # lazy accessor/attr_reader for the attribute df
74
+ def df
75
+ @df ||= GroupBy.df_from_group_map(@context, @groups_by_pos, @non_group_vectors)
76
+ end
77
+ alias grouped_df df
78
+
79
+ # Iterate over each group created by group_by. A DataFrame is yielded in
80
+ # block.
81
+ def each_group
82
+ return to_enum(:each_group) unless block_given?
83
+
84
+ groups.each_key do |k|
85
+ yield get_group(k)
86
+ end
87
+ end
88
+
89
# Comparator for group tuples. A missing (nil/false) right side sorts the
# left first, a missing left side sorts it last. Otherwise nils are dropped
# from both tuples; shorter tuples come first, and equal-length tuples use
# Array#<=>, with incomparable tuples treated as equal (0).
TUPLE_SORTER = lambda do |left, right|
  if !right
    -1
  elsif !left
    1
  else
    left_values = left.compact
    right_values = right.compact

    if left_values.length == right_values.length
      (left_values <=> right_values) || 0
    else
      left_values.length <=> right_values.length
    end
  end
end
99
+
100
+ def initialize(context, names)
101
+ @group_vectors = names
102
+ @non_group_vectors = context.vectors.to_a - names
103
+
104
+ @context = context # TODO: maybe rename in @original_df
105
+
106
+ # FIXME: It feels like we don't want to sort here. Ruby's #group_by
107
+ # never sorts:
108
+ #
109
+ # ['test', 'me', 'please'].group_by(&:size)
110
+ # # => {4=>["test"], 2=>["me"], 6=>["please"]}
111
+ #
112
+ # - zverok, 2016-09-12
113
+ @groups_by_pos = GroupBy.get_positions_group_map_for_df(@context, @group_vectors, sort: true)
114
+ end
115
+
116
+ # Get a DaruLite::Vector of the size of each group.
117
+ def size
118
+ index = get_grouped_index
119
+
120
+ values = @groups_by_pos.values.map(&:size)
121
+ DaruLite::Vector.new(values, index: index, name: :size)
122
+ end
123
+
124
+ # Get the first group
125
+ def first
126
+ head(1)
127
+ end
128
+
129
+ # Get the last group
130
+ def last
131
+ tail(1)
132
+ end
133
+
134
+ # Get the top 'n' groups
135
+ # @param quantity [Fixnum] (5) The number of groups.
136
+ # @example Usage of head
137
+ # df = DaruLite::DataFrame.new({
138
+ # a: %w{foo bar foo bar foo bar foo foo},
139
+ # b: %w{one one two three two two one three},
140
+ # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
141
+ # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
142
+ # })
143
+ # df.group_by([:a, :b]).head(1)
144
+ # # =>
145
+ # # #<DaruLite::DataFrame:82745170 @name = d7003f75-5eb9-4967-9303-c08dd9160224 @size = 6>
146
+ # # a b c d
147
+ # # 1 bar one 2 22
148
+ # # 3 bar three 1 44
149
+ # # 5 bar two 6 66
150
+ # # 0 foo one 1 11
151
+ # # 7 foo three 8 88
152
+ # # 2 foo two 3 33
153
+ def head(quantity = 5)
154
+ select_groups_from :first, quantity
155
+ end
156
+
157
+ # Get the bottom 'n' groups
158
+ # @param quantity [Fixnum] (5) The number of groups.
159
+ # @example Usage of tail
160
+ # df = DaruLite::DataFrame.new({
161
+ # a: %w{foo bar foo bar foo bar foo foo},
162
+ # b: %w{one one two three two two one three},
163
+ # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
164
+ # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
165
+ # })
166
+ # # df.group_by([:a, :b]).tail(1)
167
+ # # =>
168
+ # # #<DaruLite::DataFrame:82378270 @name = 0623db46-5425-41bd-a843-99baac3d1d9a @size = 6>
169
+ # # a b c d
170
+ # # 1 bar one 2 22
171
+ # # 3 bar three 1 44
172
+ # # 5 bar two 6 66
173
+ # # 6 foo one 3 77
174
+ # # 7 foo three 8 88
175
+ # # 4 foo two 3 55
176
+ def tail(quantity = 5)
177
+ select_groups_from :last, quantity
178
+ end
179
+
180
+ # Calculate mean of numeric groups, excluding missing values.
181
+ # @example Usage of mean
182
+ # df = DaruLite::DataFrame.new({
183
+ # a: %w{foo bar foo bar foo bar foo foo},
184
+ # b: %w{one one two three two two one three},
185
+ # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
186
+ # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
+ # })
187
+ # df.group_by([:a, :b]).mean
188
+ # # =>
189
+ # # #<DaruLite::DataFrame:81097450 @name = 0c32983f-3e06-451f-a9c9-051cadfe7371 @size = 6>
190
+ # # c d
191
+ # # ["bar", "one"] 2 22
192
+ # # ["bar", "three"] 1 44
193
+ # # ["bar", "two"] 6 66
194
+ # # ["foo", "one"] 2.0 44.0
195
+ # # ["foo", "three"] 8 88
196
+ # # ["foo", "two"] 3.0 44.0
197
+ def mean
198
+ apply_method :numeric, :mean
199
+ end
200
+
201
+ # Calculate the median of numeric groups, excluding missing values.
202
+ def median
203
+ apply_method :numeric, :median
204
+ end
205
+
206
+ # Calculate sum of numeric groups, excluding missing values.
207
+ def sum
208
+ apply_method :numeric, :sum
209
+ end
210
+
211
+ # Count groups, excludes missing values.
212
+ # @example Using count
213
+ # df = DaruLite::DataFrame.new({
214
+ # a: %w{foo bar foo bar foo bar foo foo},
215
+ # b: %w{one one two three two two one three},
216
+ # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
217
+ # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
218
+ # })
219
+ # df.group_by([:a, :b]).count
220
+ # # =>
221
+ # # #<DaruLite::DataFrame:76900210 @name = 7b9cf55d-17f8-48c7-b03a-2586c6e5ec5a @size = 6>
222
+ # # c d
223
+ # # ["bar", "one"] 1 1
224
+ # # ["bar", "three"] 1 1
225
+ # # ["bar", "two"] 1 1
226
+ # # ["foo", "one"] 2 2
227
+ # # ["foo", "three"] 1 1
228
+ # # ["foo", "two"] 2 2
229
+ def count
230
+ width = @non_group_vectors.size
231
+ DaruLite::DataFrame.new([size] * width, order: @non_group_vectors)
232
+ end
233
+
234
+ # Calculate sample standard deviation of numeric vector groups, excluding
235
+ # missing values.
236
+ def std
237
+ apply_method :numeric, :std
238
+ end
239
+
240
+ # Find the max element of each numeric vector group.
241
+ def max
242
+ apply_method :numeric, :max
243
+ end
244
+
245
+ # Find the min element of each numeric vector group.
246
+ def min
247
+ apply_method :numeric, :min
248
+ end
249
+
250
+ # Returns one of the selected groups as a DataFrame.
251
+ # @param group [Array] The group that is to be selected from those grouped.
252
+ #
253
+ # @example Getting a group
254
+ #
255
+ # df = DaruLite::DataFrame.new({
256
+ # a: %w{foo bar foo bar foo bar foo foo},
257
+ # b: %w{one one two three two two one three},
258
+ # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
259
+ # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
260
+ # })
261
+ # df.group_by([:a, :b]).get_group ['bar','two']
262
+ # #=>
263
+ # ##<DaruLite::DataFrame:83258980 @name = 687ee3f6-8874-4899-97fa-9b31d84fa1d5 @size = 1>
264
+ # # a b c d
265
+ # # 5 bar two 6 66
266
def get_group(group)
  # @groups_by_pos stores row *positions*, while groups_by_idx stores the
  # index *labels* derived from them. The transposed row array below can only
  # be addressed by position, so positions select the rows and the labels are
  # used solely as the index of the returned frame. (Using labels for the
  # lookup only worked when the index happened to be 0..n-1.)
  positions = @groups_by_pos[group]
  labels = positions.map { |pos| @context.index.at(pos) }

  all_rows = @context.each_vector.map(&:to_a).transpose
  rows = positions.map { |pos| all_rows[pos] }

  DaruLite::DataFrame.rows(
    rows, index: labels, order: @context.vectors
  )
end
276
+
277
+ # Iteratively applies a function to the values in a group and accumulates the result.
278
+ # @param init (nil) The initial value of the accumulator.
279
+ # @yieldparam block [Proc] A proc or lambda that accepts two arguments. The first argument
280
+ # is the accumulated result. The second argument is a DataFrame row.
281
+ # @example Usage of reduce
282
+ # df = DaruLite::DataFrame.new({
283
+ # a: ['a','b'] * 3,
284
+ # b: [1,2,3] * 2,
285
+ # c: 'A'..'F'
286
+ # })
287
+ # df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
288
+ # # =>
289
+ # # #<DaruLite::Vector:70343147159900 @name = nil @size = 2 >
290
+ # # nil
291
+ # # a ACE
292
+ # # b BDF
293
def reduce(init = nil)
  index = @context.index

  # Iterate over the position map and translate each position to its index
  # label exactly once. The label map (groups_by_idx) must not be run through
  # the index again: its entries are already labels, so a second lookup is
  # only correct when the index is the default 0..n-1 range.
  result_hash = @groups_by_pos.each_with_object({}) do |(group, positions), h|
    # Every group starts from the same init object.
    grouped_result = init
    positions.each do |pos|
      grouped_result = yield(grouped_result, @context.row[index.at(pos)])
    end

    h[group] = grouped_result
  end

  grouped_index = get_grouped_index(result_hash.keys)

  DaruLite::Vector.new(result_hash.values, index: grouped_index)
end
309
+
310
+ def inspect
311
+ grouped_df.inspect
312
+ end
313
+
314
+ # Function to use for aggregating the data.
315
+ # `group_by` is using DaruLite::DataFrame#aggregate
316
+ #
317
+ # @param options [Hash] options for column, you want in resultant dataframe
318
+ #
319
+ # @return [DaruLite::DataFrame]
320
+ #
321
+ # @example
322
+ #
323
+ # df = DaruLite::DataFrame.new(
324
+ # name: ['Ram','Krishna','Ram','Krishna','Krishna'],
325
+ # visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore'])
326
+ #
327
+ # => #<DaruLite::DataFrame(5x2)>
328
+ # name visited
329
+ # 0 Ram Hyderabad
330
+ # 1 Krishna Delhi
331
+ # 2 Ram Mumbai
332
+ # 3 Krishna Raipur
333
+ # 4 Krishna Banglore
334
+ #
335
+ # df.group_by(:name)
336
+ # => #<DaruLite::DataFrame(5x1)>
337
+ # visited
338
+ # Krishna 1 Delhi
339
+ # 3 Raipur
340
+ # 4 Banglore
341
+ # Ram 0 Hyderabad
342
+ # 2 Mumbai
343
+ #
344
+ # df.group_by(:name).aggregate(visited: -> (vec){vec.to_a.join(',')})
345
+ # => #<DaruLite::DataFrame(2x1)>
346
+ # visited
347
+ # Krishna Delhi,Raipur,Banglore
348
+ # Ram Hyderabad,Mumbai
349
+ #
350
+ def aggregate(options = {})
351
+ new_index = get_grouped_index
352
+
353
+ @context.aggregate(options) { [@groups_by_pos.values, new_index] }
354
+ end
355
+
356
+ private
357
+
358
# Picks up to `quantity` entries from every group using `method`
# (:first or :last, as called by #head and #tail) and returns those rows as
# one DataFrame, indexed by the flattened list of picked index entries.
def select_groups_from(method, quantity)
  picked = groups_by_idx.each_value.flat_map { |labels| labels.send(method, quantity) }
  rows = picked.map { |label| @context.row[label].to_a }

  DaruLite::DataFrame.rows(rows, order: @context.vectors, index: picked.flatten)
end
373
+
374
+ def select_numeric_non_group_vectors
375
+ @non_group_vectors.select { |ngvec| @context[ngvec].type == :numeric }
376
+ end
377
+
378
# Aggregates every numeric non-group vector with `method`.
# Only method_type :numeric is supported; anything else raises.
def apply_method(method_type, method)
  raise 'To implement' unless method_type == :numeric

  aggregation_options = select_numeric_non_group_vectors.to_h { |vector_name| [vector_name, method] }

  aggregate(aggregation_options)
end
385
+
386
# Builds the index for a per-group result.
# Uses the keys of @groups_by_pos when no tuples are given; returns a
# MultiIndex for multi-key groupings and a flat Index otherwise.
def get_grouped_index(index_tuples = nil)
  tuples = index_tuples.nil? ? @groups_by_pos.keys : index_tuples

  return DaruLite::MultiIndex.from_tuples(tuples) if multi_indexed_grouping?

  DaruLite::Index.new(tuples.flatten)
end
395
+
396
# True when the first group key has more than one element; false when
# there are no groups at all.
def multi_indexed_grouping?
  first_key = @groups_by_pos.keys[0]
  return false unless first_key

  first_key.size > 1
end
401
+ end
402
+ end
403
+ end
@@ -0,0 +1,270 @@
1
+ module DaruLite
2
+ module Core
3
+ class MergeFrame
4
# Placeholder substituted for nil merge keys (see sanitize_merge_keys).
class NilSorter
  include Comparable

  # Reports itself as nil.
  def nil?
    true
  end

  # Never equal to anything, not even to itself.
  def ==(_other)
    false
  end

  # Ties with any object whose #nil? is true; sorts before everything else.
  def <=>(other)
    return 0 if other.nil?

    -1
  end
end
19
+
20
+ # quick-fix for issue #171
21
+ def initialize(left_df, right_df, opts = {})
22
+ init_opts(opts)
23
+ validate_on!(left_df, right_df)
24
+ key_sanitizer = ->(h) { sanitize_merge_keys(h.values_at(*on)) }
25
+
26
+ @left = df_to_a(left_df)
27
+ @left.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
28
+ @left_key_values = @left.map(&key_sanitizer)
29
+
30
+ @right = df_to_a(right_df)
31
+ @right.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
32
+ @right_key_values = @right.map(&key_sanitizer)
33
+
34
+ @left_keys, @right_keys = merge_keys(left_df, right_df, on)
35
+ end
36
+
37
# Runs the merge: repeatedly asks #row for the next output row until both
# the left and the right row lists are empty, skipping nil results, and
# wraps the collected rows in a DataFrame ordered by dataframe_vector_names.
def join
  merged = []

  until left.empty? && right.empty?
    lkey = first_left_key
    rkey = first_right_key

    merged_row = row(lkey, rkey)
    merged << merged_row if merged_row
  end

  DaruLite::DataFrame.new(merged, order: dataframe_vector_names)
end
49
+
50
+ private
51
+
52
+ attr_reader :on, :indicator,
53
+ :left, :left_key_values, :keep_left, :left_keys,
54
+ :right, :right_key_values, :keep_right, :right_keys
55
+
56
+ attr_accessor :merge_key
57
+
58
+ LEFT_RIGHT_COMBINATIONS = {
59
+ # left right
60
+ inner: [false, false],
61
+ left: [true, false],
62
+ right: [false, true],
63
+ outer: [true, true]
64
+ }.freeze
65
+
66
+ def init_opts(opts)
67
+ @on = opts[:on]
68
+ @keep_left, @keep_right = extract_left_right(opts[:how])
69
+ @indicator = opts[:indicator]
70
+ end
71
+
72
+ def dataframe_vector_names
73
+ left_keys.values + on + right_keys.values + Array(indicator)
74
+ end
75
+
76
# Looks up the [keep_left, keep_right] pair for a join type.
# @raise [ArgumentError] when `how` is not a key of LEFT_RIGHT_COMBINATIONS
def extract_left_right(how)
  combination = LEFT_RIGHT_COMBINATIONS[how]
  return combination if combination

  raise ArgumentError, "Unrecognized join option: #{how}"
end
80
+
81
# Replaces every nil key value with a fresh NilSorter; other values
# (including false) pass through unchanged.
def sanitize_merge_keys(merge_keys)
  merge_keys.map do |value|
    next value unless value.nil?

    NilSorter.new
  end
end
84
+
85
# Converts a dataframe into an array of row hashes ({vector_name => value}).
def df_to_a(df)
  # FIXME: much faster than "native" DataFrame#to_a. Should not be.
  columns = df.to_h
  names = columns.keys
  row_values = columns.values.map(&:to_a).transpose

  row_values.map { |cells| names.zip(cells).to_h }
end
91
+
92
# Computes the output names for the non-key vectors of both frames.
# A name counted exactly twice across both frames (join keys excluded) is
# treated as a duplicate and renamed by guard_keys.
#
# @return [Array] two results of guard_keys: one for df1 (suffix number 1)
#   and one for df2 (suffix number 2)
def merge_keys(df1, df2, on)
  left_names = df1.vectors.to_a - on
  right_names = df2.vectors.to_a - on

  occurrences = (left_names + right_names).each_with_object(Hash.new(0)) do |name, seen|
    seen[name] += 1
  end
  duplicates = occurrences.select { |_, times| times == 2 }.keys

  [
    guard_keys(left_names, duplicates, 1),
    guard_keys(right_names, duplicates, 2)
  ]
end
103
+
104
+ def guard_keys(keys, duplicates, num)
105
+ keys.to_h { |v| [v, guard_duplicate(v, duplicates, num)] }
106
+ end
107
+
108
# Returns val unchanged unless it is listed in duplicates, in which case
# the symbol :"<val>_<num>" is returned.
def guard_duplicate(val, duplicates, num)
  return val unless duplicates.include?(val)

  :"#{val}_#{num}"
end
111
+
112
+ def row(lkey, rkey)
113
+ # :nocov:
114
+ # It's just an impossibility handler, can't be covered :)
115
+ raise 'Unexpected condition met during merge' if !lkey && !rkey
116
+
117
+ # :nocov:
118
+ if lkey == rkey
119
+ self.merge_key = lkey
120
+ add_indicator(merge_matching_rows, :both)
121
+ elsif !rkey || lt(lkey, rkey)
122
+ add_indicator(left_row_missing_right, :left_only)
123
+ else # !lkey || lt(rkey, lkey)
124
+ add_indicator(right_row_missing_left, :right_only)
125
+ end
126
+ end
127
+
128
+ def add_indicator(row, indicator_value)
129
+ return row unless indicator
130
+
131
+ row[indicator] = indicator_value
132
+ row
133
+ end
134
+
135
+ def merge_matching_rows
136
+ if one_to_one_merge?
137
+ merge_rows(one_to_one_left_row, one_to_one_right_row)
138
+ elsif one_to_many_merge?
139
+ result = merge_rows(left.first, right.first)
140
+ one_to_many_shift
141
+ result
142
+ else
143
+ result = cartesian_product.shift
144
+ end_cartesian_product if cartesian_product.empty?
145
+ result
146
+ end
147
+ end
148
+
149
+ def one_to_many_shift
150
+ shift_left = first_right_key != next_right_key
151
+ shift_right = first_left_key != next_left_key
152
+ one_to_one_left_row if shift_left
153
+ one_to_one_right_row if shift_right
154
+ end
155
+
156
+ def one_to_one_merge?
157
+ merge_key != next_left_key && merge_key != next_right_key
158
+ end
159
+
160
+ def one_to_many_merge?
161
+ !(merge_key == next_left_key && merge_key == next_right_key)
162
+ end
163
+
164
+ def one_to_one_left_row
165
+ left_key_values.shift
166
+ left.shift
167
+ end
168
+
169
+ def one_to_one_right_row
170
+ right_key_values.shift
171
+ right.shift
172
+ end
173
+
174
+ def left_row_missing_right
175
+ val = one_to_one_left_row
176
+ expand_row(val, left_keys) if keep_left
177
+ end
178
+
179
+ def right_row_missing_left
180
+ val = one_to_one_right_row
181
+ expand_row(val, right_keys) if keep_right
182
+ end
183
+
184
# True only when k1 <=> k2 is exactly -1; incomparable keys (nil result)
# therefore count as "not less than".
def lt(k1, k2)
  ordering = k1 <=> k2
  ordering == -1
end
187
+
188
+ def merge_rows(lrow, rrow)
189
+ left_keys
190
+ .to_h { |from, to| [to, lrow[from]] }
191
+ .merge(on.to_h { |col| [col, lrow[col]] })
192
+ .merge(indicator ? { indicator => nil } : {})
193
+ .merge(right_keys.to_h { |from, to| [to, rrow[from]] })
194
+ end
195
+
196
+ def expand_row(row, renamings)
197
+ renamings
198
+ .to_h { |from, to| [to, row[from]] }
199
+ .merge(on.to_h { |col| [col, row[col]] })
200
+ .merge(indicator ? { indicator => nil } : {})
201
+ end
202
+
203
+ def first_right_key
204
+ right_key_values.empty? ? nil : right_key_values.first
205
+ end
206
+
207
+ def next_right_key
208
+ right_key_values[1]
209
+ end
210
+
211
+ def first_left_key
212
+ left_key_values.empty? ? nil : left_key_values.first
213
+ end
214
+
215
+ def next_left_key
216
+ left_key_values[1]
217
+ end
218
+
219
+ def left_rows_at_merge_key
220
+ left.take_while { |arr| sanitize_merge_keys(arr.values_at(*on)) == merge_key }
221
+ end
222
+
223
+ def right_rows_at_merge_key
224
+ right.take_while { |arr| sanitize_merge_keys(arr.values_at(*on)) == merge_key }
225
+ end
226
+
227
+ def cartesian_product
228
+ @cartesian_product ||= left_rows_at_merge_key.product(right_rows_at_merge_key).map do |left_row, right_row|
229
+ merge_rows(left_row, right_row)
230
+ end
231
+ end
232
+
233
+ def end_cartesian_product
234
+ left_size = left_rows_at_merge_key.size
235
+ left_key_values.shift(left_size)
236
+ left.shift(left_size)
237
+
238
+ right_size = right_rows_at_merge_key.size
239
+ right_key_values.shift(right_size)
240
+ right.shift(right_size)
241
+ @cartesian_product = nil
242
+ end
243
+
244
# Checks that every join key in @on is a vector of both dataframes.
# @raise [ArgumentError] naming the first key missing from either frame
def validate_on!(left_df, right_df)
  @on.each do |field|
    next if left_df.has_vector?(field) && right_df.has_vector?(field)

    raise ArgumentError, "Both dataframes expected to have #{field.inspect} field"
  end
end
250
+
251
# Nil-tolerant, column-by-column comparison of two key arrays, used as the
# sort block for the merge rows.
#
# Pairs are compared left to right and the first non-zero result wins:
# two nils tie, a nil sorts before any non-nil value, everything else uses
# <=>. Comparison stops at the deciding pair, so later columns are never
# evaluated once the order is known.
#
# @param left_array [Array] key values of the left row
# @param right_array [Array] key values of the right row
# @return [Integer] -1, 0 or 1 (or whatever non-zero value <=> returned)
# @raise [ArgumentError] when the deciding pair cannot be compared
def safe_compare(left_array, right_array)
  left_array.zip(right_array).each do |l, r|
    next if l.nil? && r.nil?
    return 1 if r.nil?
    return -1 if l.nil?

    order = l <=> r
    # <=> returns nil for incomparable values; report that explicitly
    # rather than failing later with NoMethodError on nil.
    raise ArgumentError, "comparison of #{l.inspect} with #{r.inspect} failed" if order.nil?

    return order unless order.zero?
  end

  0
end
260
+ end
261
+
262
# Module-level entry point for joining two dataframes.
module Merge
  # Builds a MergeFrame from the arguments and returns the result of its #join.
  def self.join(df1, df2, opts = {})
    frame = MergeFrame.new(df1, df2, opts)
    frame.join
  end
end
269
+ end
270
+ end