red_amber 0.4.2 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.devcontainer/Dockerfile +75 -0
  3. data/.devcontainer/devcontainer.json +38 -0
  4. data/.devcontainer/onCreateCommand.sh +22 -0
  5. data/.rubocop.yml +11 -5
  6. data/CHANGELOG.md +141 -17
  7. data/Gemfile +5 -6
  8. data/README.ja.md +271 -0
  9. data/README.md +52 -31
  10. data/Rakefile +55 -0
  11. data/benchmark/group.yml +12 -5
  12. data/doc/Dev_Containers.ja.md +290 -0
  13. data/doc/Dev_Containers.md +292 -0
  14. data/doc/qmd/examples_of_red_amber.qmd +4596 -0
  15. data/doc/qmd/red-amber.qmd +90 -0
  16. data/docker/Dockerfile +2 -2
  17. data/docker/Gemfile +8 -3
  18. data/docker/docker-compose.yml +1 -1
  19. data/docker/readme.md +5 -5
  20. data/lib/red_amber/data_frame.rb +78 -4
  21. data/lib/red_amber/data_frame_combinable.rb +147 -119
  22. data/lib/red_amber/data_frame_displayable.rb +7 -6
  23. data/lib/red_amber/data_frame_loadsave.rb +1 -1
  24. data/lib/red_amber/data_frame_selectable.rb +51 -2
  25. data/lib/red_amber/data_frame_variable_operation.rb +6 -6
  26. data/lib/red_amber/group.rb +476 -127
  27. data/lib/red_amber/helper.rb +26 -0
  28. data/lib/red_amber/subframes.rb +18 -11
  29. data/lib/red_amber/vector.rb +45 -25
  30. data/lib/red_amber/vector_aggregation.rb +26 -0
  31. data/lib/red_amber/vector_selectable.rb +124 -40
  32. data/lib/red_amber/vector_string_function.rb +279 -0
  33. data/lib/red_amber/vector_unary_element_wise.rb +4 -0
  34. data/lib/red_amber/vector_updatable.rb +28 -0
  35. data/lib/red_amber/version.rb +1 -1
  36. data/lib/red_amber.rb +2 -1
  37. data/red_amber.gemspec +3 -3
  38. metadata +19 -14
  39. data/docker/Gemfile.lock +0 -80
  40. data/docker/example +0 -74
  41. data/docker/notebook/examples_of_red_amber.ipynb +0 -8562
  42. data/docker/notebook/red-amber.ipynb +0 -188
@@ -4,6 +4,7 @@ module RedAmber
4
4
  # Group class
5
5
  class Group
6
6
  include Enumerable # This feature is experimental
7
+ include Helper
7
8
 
8
9
  using RefineArrowTable
9
10
 
@@ -25,12 +26,7 @@ module RedAmber
25
26
  private
26
27
 
27
28
  # @!macro [attach] define_group_aggregation
28
- # @!method $1(*summary_keys)
29
- # Group aggregation function `$1`.
30
- # @param summary_keys [Array<Symbol, String>]
31
- # summary keys.
32
- # @return [DataFrame]
33
- # aggregated DataFrame
29
+ # Returns aggregated DataFrame.
34
30
  #
35
31
  def define_group_aggregation(function)
36
32
  define_method(function) do |*summary_keys|
@@ -54,7 +50,7 @@ module RedAmber
54
50
  # @param group_keys [Array<Symbol, String>]
55
51
  # keys for grouping.
56
52
  # @return [Group]
57
- # Group object.
53
+ # Group object. Its `inspect` shows the group key columns and the count of each group.
58
54
  # @example
59
55
  # Group.new(penguins, :species)
60
56
  #
@@ -78,13 +74,93 @@ module RedAmber
78
74
  @group = @dataframe.table.group(*@group_keys)
79
75
  end
80
76
 
81
- define_group_aggregation(:count)
77
+ # @!macro group_aggregation
78
+ # @param group_keys [Array<Symbol, String>]
79
+ # keys for grouping.
80
+ # @return [DataFrame]
81
+ # aggregated DataFrame
82
+
83
+ # Whether all elements in each group evaluate to true.
84
+ #
85
+ # @!method all(*group_keys)
86
+ # @macro group_aggregation
87
+ # @example For boolean columns by default.
88
+ # dataframe
89
+ #
90
+ # # =>
91
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000230dc>
92
+ # x y z
93
+ # <uint8> <string> <boolean>
94
+ # 0 1 A false
95
+ # 1 2 A true
96
+ # 2 3 B false
97
+ # 3 4 B (nil)
98
+ # 4 5 B true
99
+ # 5 6 C false
100
+ #
101
+ # dataframe.group(:y).all
102
+ #
103
+ # # =>
104
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000fc08>
105
+ # y all(z)
106
+ # <string> <boolean>
107
+ # 0 A false
108
+ # 1 B false
109
+ # 2 C false
110
+ #
111
+ define_group_aggregation :all
112
+
113
+ # Whether any elements in each group evaluate to true.
114
+ #
115
+ # @!method any(*group_keys)
116
+ # @macro group_aggregation
117
+ # @example For boolean columns by default.
118
+ # dataframe.group(:y).any
119
+ #
120
+ # # =>
121
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000000117ec>
122
+ # y any(z)
123
+ # <string> <boolean>
124
+ # 0 A true
125
+ # 1 B true
126
+ # 2 C false
127
+ #
128
+ define_group_aggregation :any
129
+
130
+ # Count the number of non-nil values in each group.
131
+ # If counts are the same (and do not include NaN or nil),
132
+ # columns for counts are unified.
133
+ #
134
+ # @!method count(*group_keys)
135
+ # @macro group_aggregation
136
+ # @example Show counts for each group.
137
+ # dataframe.group(:y).count
138
+ #
139
+ # # =>
140
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000011ea04>
141
+ # y count(x) count(z)
142
+ # <string> <int64> <int64>
143
+ # 0 A 2 2
144
+ # 1 B 3 2
145
+ # 2 C 1 1
146
+ #
147
+ # dataframe.group(:z).count
148
+ # # same as dataframe.group(:z).count(:x, :y)
149
+ #
150
+ # # =>
151
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000122834>
152
+ # z count
153
+ # <boolean> <int64>
154
+ # 0 false 3
155
+ # 1 true 2
156
+ # 2 (nil) 1
157
+ #
158
+ define_group_aggregation :count
82
159
  alias_method :__count, :count
83
160
  private :__count
84
161
 
85
- def count(*summary_keys)
86
- df = __count(summary_keys)
87
- # if counts are the same (and do not include NaN or nil), aggregate count columns.
162
+ def count(*group_keys)
163
+ df = __count(group_keys)
88
164
  if df.pick(@group_keys.size..).to_h.values.uniq.size == 1
89
165
  df.pick(0..@group_keys.size).rename { [keys[-1], :count] }
90
166
  else
@@ -92,19 +168,213 @@ module RedAmber
92
168
  end
93
169
  end
94
170
 
95
- define_group_aggregation(:sum)
171
+ # Returns each record group size as a DataFrame.
172
+ #
173
+ # @return [DataFrame]
174
+ # DataFrame consists of:
175
+ # - Group key columns.
176
+ # - Result columns by group aggregation.
177
+ # @example
178
+ # penguins.group(:species).group_count
179
+ #
180
+ # # =>
181
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
182
+ # species group_count
183
+ # <string> <uint8>
184
+ # 0 Adelie 152
185
+ # 1 Chinstrap 68
186
+ # 2 Gentoo 124
187
+ #
188
+ def group_count
189
+ DataFrame.create(group_table)
190
+ end
191
+ alias_method :count_all, :group_count
96
192
 
97
- define_group_aggregation(:product)
193
+ # Count the unique values in each group.
194
+ #
195
+ # @!method count_uniq(*group_keys)
196
+ # @macro group_aggregation
197
+ # @example Show counts for each group.
198
+ # dataframe.group(:y).count_uniq
199
+ #
200
+ # # =>
201
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000011ea04>
202
+ # y count_uniq(x)
203
+ # <string> <int64>
204
+ # 0 A 2
205
+ # 1 B 3
206
+ # 2 C 1
207
+ #
208
+ define_group_aggregation :count_distinct
209
+ def count_uniq(*group_keys)
210
+ df = count_distinct(*group_keys)
211
+ df.rename do
212
+ keys_org = keys.select { _1.start_with?('count_distinct') }
213
+ keys_renamed = keys_org.map { _1.to_s.gsub('distinct', 'uniq') }
214
+ keys_org.zip keys_renamed
215
+ end
216
+ end
98
217
 
99
- define_group_aggregation(:mean)
218
+ # Compute maximum of values in each group for numeric columns.
219
+ #
220
+ # @!method max(*group_keys)
221
+ # @macro group_aggregation
222
+ # @example
223
+ # dataframe.group(:y).max
224
+ #
225
+ # # =>
226
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000014ae74>
227
+ # y max(x)
228
+ # <string> <uint8>
229
+ # 0 A 2
230
+ # 1 B 5
231
+ # 2 C 6
232
+ #
233
+ define_group_aggregation :max
100
234
 
101
- define_group_aggregation(:min)
235
+ # Compute mean of values in each group for numeric columns.
236
+ #
237
+ # @!method mean(*group_keys)
238
+ # @macro group_aggregation
239
+ # @example
240
+ # dataframe.group(:y).mean
241
+ #
242
+ # # =>
243
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000138a8>
244
+ # y mean(x)
245
+ # <string> <double>
246
+ # 0 A 1.5
247
+ # 1 B 4.0
248
+ # 2 C 6.0
249
+ #
250
+ define_group_aggregation :mean
102
251
 
103
- define_group_aggregation(:max)
252
+ # Compute median of values in each group for numeric columns.
253
+ #
254
+ # @!method median(*group_keys)
255
+ # @macro group_aggregation
256
+ # @example
257
+ # dataframe.group(:y).median
258
+ #
259
+ # # =>
260
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000138a8>
261
+ # y median(x)
262
+ # <string> <double>
263
+ # 0 A 1.5
264
+ # 1 B 4.0
265
+ # 2 C 6.0
266
+ #
267
+ define_group_aggregation :approximate_median
268
+ def median(*group_keys)
269
+ df = approximate_median(*group_keys)
270
+ df.rename do
271
+ keys_org = keys.select { _1.start_with?('approximate_') }
272
+ keys_renamed = keys_org.map { _1.to_s.delete_prefix('approximate_') }
273
+ keys_org.zip keys_renamed
274
+ end
275
+ end
104
276
 
105
- define_group_aggregation(:stddev)
277
+ # Compute minimum of values in each group for numeric columns.
278
+ #
279
+ # @!method min(*group_keys)
280
+ # @macro group_aggregation
281
+ # @example
282
+ # dataframe.group(:y).min
283
+ #
284
+ # # =>
285
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000018f38>
286
+ # y min(x)
287
+ # <string> <uint8>
288
+ # 0 A 1
289
+ # 1 B 3
290
+ # 2 C 6
291
+ #
292
+ define_group_aggregation :min
106
293
 
107
- define_group_aggregation(:variance)
294
+ # Get one value from each group.
295
+ #
296
+ # @!method one(*group_keys)
297
+ # @macro group_aggregation
298
+ # @example
299
+ # dataframe.group(:y).one
300
+ #
301
+ # # =>
302
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000002885c>
303
+ # y one(x)
304
+ # <string> <uint8>
305
+ # 0 A 1
306
+ # 1 B 3
307
+ # 2 C 6
308
+ #
309
+ define_group_aggregation :one
310
+
311
+ # Compute product of values in each group for numeric columns.
312
+ #
313
+ # @!method product(*group_keys)
314
+ # @macro group_aggregation
315
+ # @example
316
+ # dataframe.group(:y).product
317
+ #
318
+ # # =>
319
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000021a84>
320
+ # y product(x)
321
+ # <string> <uint64>
322
+ # 0 A 2
323
+ # 1 B 60
324
+ # 2 C 6
325
+ #
326
+ define_group_aggregation :product
327
+
328
+ # Compute standard deviation of values in each group for numeric columns.
329
+ #
330
+ # @!method stddev(*group_keys)
331
+ # @macro group_aggregation
332
+ # @example
333
+ # dataframe.group(:y).stddev
334
+ #
335
+ # # =>
336
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000002be6c>
337
+ # y stddev(x)
338
+ # <string> <double>
339
+ # 0 A 0.5
340
+ # 1 B 0.816
341
+ # 2 C 0.0
342
+ #
343
+ define_group_aggregation :stddev
344
+
345
+ # Compute sum of values in each group for numeric columns.
346
+ #
347
+ # @!method sum(*group_keys)
348
+ # @macro group_aggregation
349
+ # @example
350
+ # dataframe.group(:y).sum
351
+ #
352
+ # # =>
353
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000032a14>
354
+ # y sum(x)
355
+ # <string> <uint64>
356
+ # 0 A 3
357
+ # 1 B 12
358
+ # 2 C 6
359
+ #
360
+ define_group_aggregation :sum
361
+
362
+ # Compute variance of values in each group for numeric columns.
363
+ #
364
+ # @!method variance(*group_keys)
365
+ # @macro group_aggregation
366
+ # @example
367
+ # dataframe.group(:y).variance
368
+ #
369
+ # # =>
370
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000003b1dc>
371
+ # y variance(x)
372
+ # <string> <double>
373
+ # 0 A 0.25
374
+ # 1 B 0.667
375
+ # 2 C 0.0
376
+ #
377
+ define_group_aggregation :variance
108
378
 
109
379
  # Returns an Enumerator of boolean filters, one per group, to select the records of each group.
110
380
  #
@@ -114,15 +384,27 @@ module RedAmber
114
384
  #
115
385
  def filters
116
386
  @filters ||= begin
117
- first, *others = @group_keys.map do |key|
118
- vector = @dataframe[key]
119
- vector.uniq.each.map { |u| u.nil? ? vector.is_nil : vector == u }
120
- end
121
-
122
- if others.empty?
123
- first.select(&:any?)
124
- else
125
- first.product(*others).map { |a| a.reduce(&:&) }.select(&:any?)
387
+ group_values = group_table[group_keys].each_record.map(&:to_a)
388
+
389
+ Enumerator.new(group_table.n_rows) do |yielder|
390
+ group_values.each do |values|
391
+ booleans =
392
+ values.map.with_index do |value, i|
393
+ column = @dataframe[group_keys[i]].data
394
+ if value.nil?
395
+ Arrow::Function.find('is_null').execute([column])
396
+ elsif value.is_a?(Float) && value.nan?
397
+ Arrow::Function.find('is_nan').execute([column])
398
+ else
399
+ Arrow::Function.find('equal').execute([column, value])
400
+ end
401
+ end
402
+ filter =
403
+ booleans.reduce do |result, datum|
404
+ Arrow::Function.find('and_kleene').execute([result, datum])
405
+ end
406
+ yielder << Vector.create(filter.value)
407
+ end
126
408
  end
127
409
  end
128
410
  end
@@ -147,119 +429,174 @@ module RedAmber
147
429
  # group size.
148
430
  #
149
431
  def each
150
- filters
151
432
  return enum_for(:each) unless block_given?
152
433
 
153
- @filters.each do |filter|
154
- yield @dataframe[filter]
434
+ filters.each do |filter|
435
+ yield @dataframe.filter(filter)
155
436
  end
156
437
  @filters.size
157
438
  end
158
439
 
159
- # Returns each record group size as a DataFrame.
440
+ # String representation of self.
160
441
  #
161
- # @return [DataFrame]
162
- # DataFrame consists of:
163
- # - Group key columns.
164
- # - Result columns by group aggregation.
442
+ # @return [String]
443
+ # show information of self as a String.
165
444
  # @example
166
- # penguins.group(:species).group_count
445
+ # puts penguins.group(:species).inspect
167
446
  #
168
447
  # # =>
169
- # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
448
+ # #<RedAmber::Group : 0x0000000000003a98>
170
449
  # species group_count
171
450
  # <string> <uint8>
172
451
  # 0 Adelie 152
173
452
  # 1 Chinstrap 68
174
453
  # 2 Gentoo 124
175
454
  #
176
- def group_count
177
- DataFrame.create(add_columns_to_table(base_table, [:group_count], [group_counts]))
455
+ def inspect
456
+ "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
178
457
  end
179
458
 
180
- # String representation of self.
459
+ # Summarize Group by aggregation functions from the block.
181
460
  #
182
- # @return [String]
183
- # show information of self as a String.
184
- # @example
185
- # puts penguins.group(:species).inspect
461
+ # @overload summarize
462
+ # Summarize by a function.
463
+ # @yieldparam group [Group]
464
+ # passes group object self.
465
+ # @yieldreturn [DataFrame]
466
+ # @yieldreturn [DataFrame, Array<DataFrame>, Hash{Symbol, String => DataFrame}]
467
+ # an aggregated DataFrame or an array of aggregated DataFrames.
468
+ # @return [DataFrame]
469
+ # summarized DataFrame.
470
+ # @example Single function and single variable
471
+ # group = penguins.group(:species)
472
+ # group
186
473
  #
187
- # # =>
188
- # #<RedAmber::Group : 0x0000000000003a98>
189
- # species count
190
- # <string> <uint8>
191
- # 0 Adelie 152
192
- # 1 Chinstrap 68
193
- # 2 Gentoo 124
474
+ # # =>
475
+ # #<RedAmber::Group : 0x000000000000c314>
476
+ # species group_count
477
+ # <string> <uint8>
478
+ # 0 Adelie 152
479
+ # 1 Chinstrap 68
480
+ # 2 Gentoo 124
194
481
  #
195
- def inspect
196
- "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{count(@group_keys)}"
197
- end
198
-
199
- # Summarize Group by aggregation functions from the block.
482
+ # group.summarize { mean(:bill_length_mm) }
200
483
  #
201
- # @yieldparam group [Group]
202
- # passes group object self.
203
- # @yieldreturn [DataFrame, Array<DataFrame>]
204
- # an aggregated DataFrame or an array of aggregated DataFrames.
205
- # @return [DataFrame]
206
- # summarized DataFrame.
207
- # @example Single function and single variable
208
- # group = penguins.group(:species)
209
- # group
484
+ # # =>
485
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
486
+ # species mean(bill_length_mm)
487
+ # <string> <double>
488
+ # 0 Adelie 38.79
489
+ # 1 Chinstrap 48.83
490
+ # 2 Gentoo 47.5
210
491
  #
211
- # # =>
212
- # #<RedAmber::Group : 0x000000000000c314>
213
- # species count
214
- # <string> <uint8>
215
- # 0 Adelie 152
216
- # 1 Chinstrap 68
217
- # 2 Gentoo 124
492
+ # @example Single function only
493
+ # group.summarize { mean }
218
494
  #
219
- # group.summarize { mean(:bill_length_mm) }
495
+ # # =>
496
+ # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
497
+ # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
498
+ # <string> <double> <double> ... <double>
499
+ # 0 Adelie 38.79 18.35 ... 2008.01
500
+ # 1 Chinstrap 48.83 18.42 ... 2007.97
501
+ # 2 Gentoo 47.5 14.98 ... 2008.08
220
502
  #
221
- # # =>
222
- # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
223
- # species mean(bill_length_mm)
224
- # <string> <double>
225
- # 0 Adelie 38.79
226
- # 1 Chinstrap 48.83
227
- # 2 Gentoo 47.5
503
+ # @overload summarize
504
+ # Summarize by multiple functions given as an Array.
228
505
  #
229
- # @example Single function only
230
- # group.summarize { mean }
506
+ # @yieldparam group [Group]
507
+ # passes group object self.
508
+ # @yieldreturn [Array<DataFrame>]
509
+ # an array of aggregated DataFrames.
510
+ # @return [DataFrame]
511
+ # summarized DataFrame.
512
+ # @example Multiple functions
513
+ # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
231
514
  #
232
- # # =>
233
- # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
234
- # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
235
- # <string> <double> <double> ... <double>
236
- # 0 Adelie 38.79 18.35 ... 2008.01
237
- # 1 Chinstrap 48.83 18.42 ... 2007.97
238
- # 2 Gentoo 47.5 14.98 ... 2008.08
515
+ # # =>
516
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
517
+ # species min(bill_length_mm) max(bill_length_mm)
518
+ # <string> <double> <double>
519
+ # 0 Adelie 32.1 46.0
520
+ # 1 Chinstrap 40.9 58.0
521
+ # 2 Gentoo 40.9 59.6
239
522
  #
240
- # @example Multiple functions
241
- # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
523
+ # @overload summarize
524
+ # Summarize by functions given as a Hash of new column name => aggregation.
242
525
  #
243
- # # =>
244
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
245
- # species min(bill_length_mm) max(bill_length_mm)
246
- # <string> <double> <double>
247
- # 0 Adelie 32.1 46.0
248
- # 1 Chinstrap 40.9 58.0
249
- # 2 Gentoo 40.9 59.6
250
- #
251
- def summarize(&block)
252
- agg = instance_eval(&block)
526
+ # @yieldparam group [Group]
527
+ # passes group object self.
528
+ # @yieldreturn [Hash{Symbol, String => DataFrame}]
529
+ # a Hash mapping each new column name to an aggregated DataFrame.
530
+ # The DataFrame must return only one aggregated column.
531
+ # @return [DataFrame]
532
+ # summarized DataFrame.
533
+ # @example Rename column name by Hash
534
+ # group.summarize {
535
+ # {
536
+ # min_bill_length_mm: min(:bill_length_mm),
537
+ # max_bill_length_mm: max(:bill_length_mm),
538
+ # }
539
+ # }
540
+ #
541
+ # # =>
542
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
543
+ # species min_bill_length_mm max_bill_length_mm
544
+ # <string> <double> <double>
545
+ # 0 Adelie 32.1 46.0
546
+ # 1 Chinstrap 40.9 58.0
547
+ # 2 Gentoo 40.9 59.6
548
+ #
549
+ def summarize(*args, &block)
550
+ if block
551
+ agg = instance_eval(&block)
552
+ unless args.empty?
553
+ agg = [agg] if agg.is_a?(DataFrame)
554
+ agg = args.zip(agg).to_h
555
+ end
556
+ else
557
+ agg = args
558
+ end
559
+
253
560
  case agg
254
561
  when DataFrame
255
562
  agg
256
563
  when Array
257
- agg.reduce { |aggregated, df| aggregated.assign(df.to_h) }
564
+ aggregations =
565
+ agg.map do |df|
566
+ v = df.vectors[-1]
567
+ [v.key, v]
568
+ end
569
+ agg[0].assign(aggregations)
570
+ when Hash
571
+ aggregations =
572
+ agg.map do |key, df|
573
+ aggregated_keys = df.keys - @group_keys
574
+ if aggregated_keys.size > 1
575
+ message =
576
+ "accept only one column from the Hash: #{aggregated_keys.join(', ')}"
577
+ raise GroupArgumentError, message
578
+ end
579
+
580
+ v = df.vectors[-1]
581
+ [key, v]
582
+ end
583
+ agg.values[-1].drop(-1).assign(aggregations)
258
584
  else
259
585
  raise GroupArgumentError, "Unknown argument: #{agg}"
260
586
  end
261
587
  end
262
588
 
589
+ # Return grouped DataFrame only for group keys.
590
+ #
591
+ # @return [DataFrame]
592
+ # grouped DataFrame projected only for group_keys.
593
+ # @since 0.5.0
594
+ #
595
+ def grouped_frame
596
+ DataFrame.create(group_table[group_keys])
597
+ end
598
+ alias_method :none, :grouped_frame
599
+
263
600
  # Aggregating summary.
264
601
  #
265
602
  # @api private
@@ -270,37 +607,49 @@ module RedAmber
270
607
 
271
608
  private
272
609
 
273
- def build_aggregation_keys(function_name, summary_keys)
274
- if summary_keys.empty?
275
- [function_name]
276
- else
277
- summary_keys.map { |key| "#{function_name}(#{key})" }
278
- end
279
- end
280
-
281
- # @note `@group_counts.sum == @dataframe.size``
282
- def group_counts
283
- @group_counts ||= filters.map(&:sum)
610
+ def group_table
611
+ @group_table ||= build_aggregated_table
284
612
  end
285
613
 
286
- def base_table
287
- @base_table ||= begin
288
- indexes = filters.map { |filter| filter.index(true) }
289
- @dataframe.table[@group_keys].take(indexes)
614
+ def build_aggregated_table
615
+ keys = @group_keys
616
+ key = keys[0]
617
+ table = @dataframe.table
618
+
619
+ plan = Arrow::ExecutePlan.new
620
+ source_node = plan.build_source_node(table)
621
+
622
+ aggregate_node =
623
+ plan.build_aggregate_node(source_node, {
624
+ aggregations: [{ function: 'hash_count',
625
+ input: key }], keys: keys
626
+ })
627
+ expressions = keys.map { |k| Arrow::FieldExpression.new(k) }
628
+ null_count = Arrow::Function.find('is_null').execute([table[key]]).value.sum
629
+ count_field = Arrow::FieldExpression.new("count(#{key})")
630
+ if null_count.zero?
631
+ expressions << count_field
632
+ else
633
+ is_zero =
634
+ Arrow::CallExpression.new('equal', [count_field, Arrow::Int64Scalar.new(0)])
635
+ null_count_scalar = Arrow::Int64Scalar.new(null_count)
636
+ expressions <<
637
+ Arrow::CallExpression.new('if_else', [
638
+ is_zero, null_count_scalar, count_field
639
+ ])
290
640
  end
291
- end
641
+ options = Arrow::ProjectNodeOptions.new(expressions, keys + [:group_count])
642
+ project_node = plan.build_project_node(aggregate_node, options)
292
643
 
293
- def add_columns_to_table(table, keys, data_arrays)
294
- fields = table.schema.fields
295
- arrays = table.columns.map(&:data)
644
+ sink_and_start_plan(plan, project_node)
645
+ end
296
646
 
297
- keys.zip(data_arrays).each do |key, array|
298
- data = Arrow::ChunkedArray.new([array])
299
- fields << Arrow::Field.new(key, data.value_data_type)
300
- arrays << data
647
+ def build_aggregation_keys(function_name, summary_keys)
648
+ if summary_keys.empty?
649
+ [function_name]
650
+ else
651
+ summary_keys.map { |key| "#{function_name}(#{key})" }
301
652
  end
302
-
303
- Arrow::Table.new(Arrow::Schema.new(fields), arrays)
304
653
  end
305
654
 
306
655
  # Call Vector aggregating function and return an array of arrays: