red_amber 0.4.2 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.devcontainer/Dockerfile +75 -0
  3. data/.devcontainer/devcontainer.json +38 -0
  4. data/.devcontainer/onCreateCommand.sh +22 -0
  5. data/.rubocop.yml +11 -5
  6. data/CHANGELOG.md +141 -17
  7. data/Gemfile +5 -6
  8. data/README.ja.md +271 -0
  9. data/README.md +52 -31
  10. data/Rakefile +55 -0
  11. data/benchmark/group.yml +12 -5
  12. data/doc/Dev_Containers.ja.md +290 -0
  13. data/doc/Dev_Containers.md +292 -0
  14. data/doc/qmd/examples_of_red_amber.qmd +4596 -0
  15. data/doc/qmd/red-amber.qmd +90 -0
  16. data/docker/Dockerfile +2 -2
  17. data/docker/Gemfile +8 -3
  18. data/docker/docker-compose.yml +1 -1
  19. data/docker/readme.md +5 -5
  20. data/lib/red_amber/data_frame.rb +78 -4
  21. data/lib/red_amber/data_frame_combinable.rb +147 -119
  22. data/lib/red_amber/data_frame_displayable.rb +7 -6
  23. data/lib/red_amber/data_frame_loadsave.rb +1 -1
  24. data/lib/red_amber/data_frame_selectable.rb +51 -2
  25. data/lib/red_amber/data_frame_variable_operation.rb +6 -6
  26. data/lib/red_amber/group.rb +476 -127
  27. data/lib/red_amber/helper.rb +26 -0
  28. data/lib/red_amber/subframes.rb +18 -11
  29. data/lib/red_amber/vector.rb +45 -25
  30. data/lib/red_amber/vector_aggregation.rb +26 -0
  31. data/lib/red_amber/vector_selectable.rb +124 -40
  32. data/lib/red_amber/vector_string_function.rb +279 -0
  33. data/lib/red_amber/vector_unary_element_wise.rb +4 -0
  34. data/lib/red_amber/vector_updatable.rb +28 -0
  35. data/lib/red_amber/version.rb +1 -1
  36. data/lib/red_amber.rb +2 -1
  37. data/red_amber.gemspec +3 -3
  38. metadata +19 -14
  39. data/docker/Gemfile.lock +0 -80
  40. data/docker/example +0 -74
  41. data/docker/notebook/examples_of_red_amber.ipynb +0 -8562
  42. data/docker/notebook/red-amber.ipynb +0 -188
@@ -4,6 +4,7 @@ module RedAmber
4
4
  # Group class
5
5
  class Group
6
6
  include Enumerable # This feature is experimental
7
+ include Helper
7
8
 
8
9
  using RefineArrowTable
9
10
 
@@ -25,12 +26,7 @@ module RedAmber
25
26
  private
26
27
 
27
28
  # @!macro [attach] define_group_aggregation
28
- # @!method $1(*summary_keys)
29
- # Group aggregation function `$1`.
30
- # @param summary_keys [Array<Symbol, String>]
31
- # summary keys.
32
- # @return [DataFrame]
33
- # aggregated DataFrame
29
+ # Returns aggregated DataFrame.
34
30
  #
35
31
  def define_group_aggregation(function)
36
32
  define_method(function) do |*summary_keys|
@@ -54,7 +50,7 @@ module RedAmber
54
50
  # @param group_keys [Array<Symbol, String>]
55
51
  # keys for grouping.
56
52
  # @return [Group]
57
- # Group object.
53
+ # Group object. Inspecting it shows the grouped columns and their counts.
58
54
  # @example
59
55
  # Group.new(penguins, :species)
60
56
  #
@@ -78,13 +74,93 @@ module RedAmber
78
74
  @group = @dataframe.table.group(*@group_keys)
79
75
  end
80
76
 
81
- define_group_aggregation(:count)
77
+ # @!macro group_aggregation
78
+ # @param group_keys [Array<Symbol, String>]
79
+ # keys of the columns to aggregate (not the keys used for grouping).
80
+ # @return [DataFrame]
81
+ # aggregated DataFrame
82
+
83
+ # Whether all elements in each group evaluate to true.
84
+ #
85
+ # @!method all(*group_keys)
86
+ # @macro group_aggregation
87
+ # @example For boolean columns by default.
88
+ # dataframe
89
+ #
90
+ # # =>
91
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000230dc>
92
+ # x y z
93
+ # <uint8> <string> <boolean>
94
+ # 0 1 A false
95
+ # 1 2 A true
96
+ # 2 3 B false
97
+ # 3 4 B (nil)
98
+ # 4 5 B true
99
+ # 5 6 C false
100
+ #
101
+ # dataframe.group(:y).all
102
+ #
103
+ # # =>
104
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000fc08>
105
+ # y all(z)
106
+ # <string> <boolean>
107
+ # 0 A false
108
+ # 1 B false
109
+ # 2 C false
110
+ #
111
+ define_group_aggregation :all
112
+
113
+ # Whether any elements in each group evaluate to true.
114
+ #
115
+ # @!method any(*group_keys)
116
+ # @macro group_aggregation
117
+ # @example For boolean columns by default.
118
+ # dataframe.group(:y).any
119
+ #
120
+ # # =>
121
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000000117ec>
122
+ # y any(z)
123
+ # <string> <boolean>
124
+ # 0 A true
125
+ # 1 B true
126
+ # 2 C false
127
+ #
128
+ define_group_aggregation :any
129
+
130
+ # Count the number of non-nil values in each group.
131
+ # If counts are the same (and do not include NaN or nil),
132
+ # columns for counts are unified.
133
+ #
134
+ # @!method count(*group_keys)
135
+ # @macro group_aggregation
136
+ # @example Show counts for each group.
137
+ # dataframe.group(:y).count
138
+ #
139
+ # # =>
140
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000011ea04>
141
+ # y count(x) count(z)
142
+ # <string> <int64> <int64>
143
+ # 0 A 2 2
144
+ # 1 B 3 2
145
+ # 2 C 1 1
146
+ #
147
+ # dataframe.group(:z).count
148
+ # # same as dataframe.group(:z).count(:x, :y)
149
+ #
150
+ # # =>
151
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000122834>
152
+ # z count
153
+ # <boolean> <int64>
154
+ # 0 false 3
155
+ # 1 true 2
156
+ # 2 (nil) 1
157
+ #
158
+ define_group_aggregation :count
82
159
  alias_method :__count, :count
83
160
  private :__count
84
161
 
85
- def count(*summary_keys)
86
- df = __count(summary_keys)
87
- # if counts are the same (and do not include NaN or nil), aggregate count columns.
162
+ def count(*group_keys)
163
+ df = __count(group_keys)
88
164
  if df.pick(@group_keys.size..).to_h.values.uniq.size == 1
89
165
  df.pick(0..@group_keys.size).rename { [keys[-1], :count] }
90
166
  else
@@ -92,19 +168,213 @@ module RedAmber
92
168
  end
93
169
  end
94
170
 
95
- define_group_aggregation(:sum)
171
+ # Returns each record group size as a DataFrame.
172
+ #
173
+ # @return [DataFrame]
174
+ # DataFrame consists of:
175
+ # - Group key columns.
176
+ # - Result columns by group aggregation.
177
+ # @example
178
+ # penguins.group(:species).group_count
179
+ #
180
+ # # =>
181
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
182
+ # species group_count
183
+ # <string> <uint8>
184
+ # 0 Adelie 152
185
+ # 1 Chinstrap 68
186
+ # 2 Gentoo 124
187
+ #
188
+ def group_count
189
+ DataFrame.create(group_table)
190
+ end
191
+ alias_method :count_all, :group_count
96
192
 
97
- define_group_aggregation(:product)
193
+ # Count the unique values in each group.
194
+ #
195
+ # @!method count_uniq(*group_keys)
196
+ # @macro group_aggregation
197
+ # @example Show counts for each group.
198
+ # dataframe.group(:y).count_uniq
199
+ #
200
+ # # =>
201
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000011ea04>
202
+ # y count_uniq(x)
203
+ # <string> <int64>
204
+ # 0 A 2
205
+ # 1 B 3
206
+ # 2 C 1
207
+ #
208
+ define_group_aggregation :count_distinct
209
+ def count_uniq(*group_keys)
210
+ df = count_distinct(*group_keys)
211
+ df.rename do
212
+ keys_org = keys.select { _1.start_with?('count_distinct') }
213
+ keys_renamed = keys_org.map { _1.to_s.gsub('distinct', 'uniq') }
214
+ keys_org.zip keys_renamed
215
+ end
216
+ end
98
217
 
99
- define_group_aggregation(:mean)
218
+ # Compute maximum of values in each group for numeric columns.
219
+ #
220
+ # @!method max(*group_keys)
221
+ # @macro group_aggregation
222
+ # @example
223
+ # dataframe.group(:y).max
224
+ #
225
+ # # =>
226
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000014ae74>
227
+ # y max(x)
228
+ # <string> <uint8>
229
+ # 0 A 2
230
+ # 1 B 5
231
+ # 2 C 6
232
+ #
233
+ define_group_aggregation :max
100
234
 
101
- define_group_aggregation(:min)
235
+ # Compute mean of values in each group for numeric columns.
236
+ #
237
+ # @!method mean(*group_keys)
238
+ # @macro group_aggregation
239
+ # @example
240
+ # dataframe.group(:y).mean
241
+ #
242
+ # # =>
243
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000138a8>
244
+ # y mean(x)
245
+ # <string> <double>
246
+ # 0 A 1.5
247
+ # 1 B 4.0
248
+ # 2 C 6.0
249
+ #
250
+ define_group_aggregation :mean
102
251
 
103
- define_group_aggregation(:max)
252
+ # Compute median of values in each group for numeric columns.
253
+ #
254
+ # @!method median(*group_keys)
255
+ # @macro group_aggregation
256
+ # @example
257
+ # dataframe.group(:y).median
258
+ #
259
+ # # =>
260
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000138a8>
261
+ # y median(x)
262
+ # <string> <double>
263
+ # 0 A 1.5
264
+ # 1 B 4.0
265
+ # 2 C 6.0
266
+ #
267
+ define_group_aggregation :approximate_median
268
+ def median(*group_keys)
269
+ df = approximate_median(*group_keys)
270
+ df.rename do
271
+ keys_org = keys.select { _1.start_with?('approximate_') }
272
+ keys_renamed = keys_org.map { _1.to_s.delete_prefix('approximate_') }
273
+ keys_org.zip keys_renamed
274
+ end
275
+ end
104
276
 
105
- define_group_aggregation(:stddev)
277
+ # Compute minimum of values in each group for numeric columns.
278
+ #
279
+ # @!method min(*group_keys)
280
+ # @macro group_aggregation
281
+ # @example
282
+ # dataframe.group(:y).min
283
+ #
284
+ # # =>
285
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000018f38>
286
+ # y min(x)
287
+ # <string> <uint8>
288
+ # 0 A 1
289
+ # 1 B 3
290
+ # 2 C 6
291
+ #
292
+ define_group_aggregation :min
106
293
 
107
- define_group_aggregation(:variance)
294
+ # Get one value from each group.
295
+ #
296
+ # @!method one(*group_keys)
297
+ # @macro group_aggregation
298
+ # @example
299
+ # dataframe.group(:y).one
300
+ #
301
+ # # =>
302
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000002885c>
303
+ # y one(x)
304
+ # <string> <uint8>
305
+ # 0 A 1
306
+ # 1 B 3
307
+ # 2 C 6
308
+ #
309
+ define_group_aggregation :one
310
+
311
+ # Compute product of values in each group for numeric columns.
312
+ #
313
+ # @!method product(*group_keys)
314
+ # @macro group_aggregation
315
+ # @example
316
+ # dataframe.group(:y).product
317
+ #
318
+ # # =>
319
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000021a84>
320
+ # y product(x)
321
+ # <string> <uint64>
322
+ # 0 A 2
323
+ # 1 B 60
324
+ # 2 C 6
325
+ #
326
+ define_group_aggregation :product
327
+
328
+ # Compute standard deviation of values in each group for numeric columns.
329
+ #
330
+ # @!method stddev(*group_keys)
331
+ # @macro group_aggregation
332
+ # @example
333
+ # dataframe.group(:y).stddev
334
+ #
335
+ # # =>
336
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000002be6c>
337
+ # y stddev(x)
338
+ # <string> <double>
339
+ # 0 A 0.5
340
+ # 1 B 0.816
341
+ # 2 C 0.0
342
+ #
343
+ define_group_aggregation :stddev
344
+
345
+ # Compute sum of values in each group for numeric columns.
346
+ #
347
+ # @!method sum(*group_keys)
348
+ # @macro group_aggregation
349
+ # @example
350
+ # dataframe.group(:y).sum
351
+ #
352
+ # # =>
353
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000032a14>
354
+ # y sum(x)
355
+ # <string> <uint64>
356
+ # 0 A 3
357
+ # 1 B 12
358
+ # 2 C 6
359
+ #
360
+ define_group_aggregation :sum
361
+
362
+ # Compute variance of values in each group for numeric columns.
363
+ #
364
+ # @!method variance(*group_keys)
365
+ # @macro group_aggregation
366
+ # @example
367
+ # dataframe.group(:y).variance
368
+ #
369
+ # # =>
370
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000003b1dc>
371
+ # y variance(x)
372
+ # <string> <double>
373
+ # 0 A 0.25
374
+ # 1 B 0.667
375
+ # 2 C 0.0
376
+ #
377
+ define_group_aggregation :variance
108
378
 
109
379
  # Returns an Enumerator of boolean filters, one per group, each selecting the records of that group.
110
380
  #
@@ -114,15 +384,27 @@ module RedAmber
114
384
  #
115
385
  def filters
116
386
  @filters ||= begin
117
- first, *others = @group_keys.map do |key|
118
- vector = @dataframe[key]
119
- vector.uniq.each.map { |u| u.nil? ? vector.is_nil : vector == u }
120
- end
121
-
122
- if others.empty?
123
- first.select(&:any?)
124
- else
125
- first.product(*others).map { |a| a.reduce(&:&) }.select(&:any?)
387
+ group_values = group_table[group_keys].each_record.map(&:to_a)
388
+
389
+ Enumerator.new(group_table.n_rows) do |yielder|
390
+ group_values.each do |values|
391
+ booleans =
392
+ values.map.with_index do |value, i|
393
+ column = @dataframe[group_keys[i]].data
394
+ if value.nil?
395
+ Arrow::Function.find('is_null').execute([column])
396
+ elsif value.is_a?(Float) && value.nan?
397
+ Arrow::Function.find('is_nan').execute([column])
398
+ else
399
+ Arrow::Function.find('equal').execute([column, value])
400
+ end
401
+ end
402
+ filter =
403
+ booleans.reduce do |result, datum|
404
+ Arrow::Function.find('and_kleene').execute([result, datum])
405
+ end
406
+ yielder << Vector.create(filter.value)
407
+ end
126
408
  end
127
409
  end
128
410
  end
@@ -147,119 +429,174 @@ module RedAmber
147
429
  # group size.
148
430
  #
149
431
  def each
150
- filters
151
432
  return enum_for(:each) unless block_given?
152
433
 
153
- @filters.each do |filter|
154
- yield @dataframe[filter]
434
+ filters.each do |filter|
435
+ yield @dataframe.filter(filter)
155
436
  end
156
437
  @filters.size
157
438
  end
158
439
 
159
- # Returns each record group size as a DataFrame.
440
+ # String representation of self.
160
441
  #
161
- # @return [DataFrame]
162
- # DataFrame consists of:
163
- # - Group key columns.
164
- # - Result columns by group aggregation.
442
+ # @return [String]
443
+ # show information of self as a String.
165
444
  # @example
166
- # penguins.group(:species).group_count
445
+ # puts penguins.group(:species).inspect
167
446
  #
168
447
  # # =>
169
- # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
448
+ # #<RedAmber::Group : 0x0000000000003a98>
170
449
  # species group_count
171
450
  # <string> <uint8>
172
451
  # 0 Adelie 152
173
452
  # 1 Chinstrap 68
174
453
  # 2 Gentoo 124
175
454
  #
176
- def group_count
177
- DataFrame.create(add_columns_to_table(base_table, [:group_count], [group_counts]))
455
+ def inspect
456
+ "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
178
457
  end
179
458
 
180
- # String representation of self.
459
+ # Summarize Group by aggregation functions from the block.
181
460
  #
182
- # @return [String]
183
- # show information of self as a String.
184
- # @example
185
- # puts penguins.group(:species).inspect
461
+ # @overload summarize
462
+ # Summarize by a function.
463
+ # @yieldparam group [Group]
464
+ # passes group object self.
465
+ # @yieldreturn [DataFrame]
466
+ # @yieldreturn [DataFrame, Array<DataFrame>, Hash{Symbol, String => DataFrame}]
467
+ # an aggregated DataFrame or an array of aggregated DataFrames.
468
+ # @return [DataFrame]
469
+ # summarized DataFrame.
470
+ # @example Single function and single variable
471
+ # group = penguins.group(:species)
472
+ # group
186
473
  #
187
- # # =>
188
- # #<RedAmber::Group : 0x0000000000003a98>
189
- # species count
190
- # <string> <uint8>
191
- # 0 Adelie 152
192
- # 1 Chinstrap 68
193
- # 2 Gentoo 124
474
+ # # =>
475
+ # #<RedAmber::Group : 0x000000000000c314>
476
+ # species group_count
477
+ # <string> <uint8>
478
+ # 0 Adelie 152
479
+ # 1 Chinstrap 68
480
+ # 2 Gentoo 124
194
481
  #
195
- def inspect
196
- "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{count(@group_keys)}"
197
- end
198
-
199
- # Summarize Group by aggregation functions from the block.
482
+ # group.summarize { mean(:bill_length_mm) }
200
483
  #
201
- # @yieldparam group [Group]
202
- # passes group object self.
203
- # @yieldreturn [DataFrame, Array<DataFrame>]
204
- # an aggregated DataFrame or an array of aggregated DataFrames.
205
- # @return [DataFrame]
206
- # summarized DataFrame.
207
- # @example Single function and single variable
208
- # group = penguins.group(:species)
209
- # group
484
+ # # =>
485
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
486
+ # species mean(bill_length_mm)
487
+ # <string> <double>
488
+ # 0 Adelie 38.79
489
+ # 1 Chinstrap 48.83
490
+ # 2 Gentoo 47.5
210
491
  #
211
- # # =>
212
- # #<RedAmber::Group : 0x000000000000c314>
213
- # species count
214
- # <string> <uint8>
215
- # 0 Adelie 152
216
- # 1 Chinstrap 68
217
- # 2 Gentoo 124
492
+ # @example Single function only
493
+ # group.summarize { mean }
218
494
  #
219
- # group.summarize { mean(:bill_length_mm) }
495
+ # # =>
496
+ # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
497
+ # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
498
+ # <string> <double> <double> ... <double>
499
+ # 0 Adelie 38.79 18.35 ... 2008.01
500
+ # 1 Chinstrap 48.83 18.42 ... 2007.97
501
+ # 2 Gentoo 47.5 14.98 ... 2008.08
220
502
  #
221
- # # =>
222
- # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
223
- # species mean(bill_length_mm)
224
- # <string> <double>
225
- # 0 Adelie 38.79
226
- # 1 Chinstrap 48.83
227
- # 2 Gentoo 47.5
503
+ # @overload summarize
504
+ # Summarize by multiple functions.
228
505
  #
229
- # @example Single function only
230
- # group.summarize { mean }
506
+ # @yieldparam group [Group]
507
+ # passes group object self.
508
+ # @yieldreturn [Array<DataFrame>]
509
+ # an array of aggregated DataFrames.
510
+ # @return [DataFrame]
511
+ # summarized DataFrame.
512
+ # @example Multiple functions
513
+ # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
231
514
  #
232
- # # =>
233
- # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
234
- # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
235
- # <string> <double> <double> ... <double>
236
- # 0 Adelie 38.79 18.35 ... 2008.01
237
- # 1 Chinstrap 48.83 18.42 ... 2007.97
238
- # 2 Gentoo 47.5 14.98 ... 2008.08
515
+ # # =>
516
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
517
+ # species min(bill_length_mm) max(bill_length_mm)
518
+ # <string> <double> <double>
519
+ # 0 Adelie 32.1 46.0
520
+ # 1 Chinstrap 40.9 58.0
521
+ # 2 Gentoo 40.9 59.6
239
522
  #
240
- # @example Multiple functions
241
- # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
523
+ # @overload summarize
524
+ # Summarize by functions, renaming the aggregated columns by a Hash.
242
525
  #
243
- # # =>
244
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
245
- # species min(bill_length_mm) max(bill_length_mm)
246
- # <string> <double> <double>
247
- # 0 Adelie 32.1 46.0
248
- # 1 Chinstrap 40.9 58.0
249
- # 2 Gentoo 40.9 59.6
250
- #
251
- def summarize(&block)
252
- agg = instance_eval(&block)
526
+ # @yieldparam group [Group]
527
+ # passes group object self.
528
+ # @yieldreturn [Hash{Symbol, String => DataFrame}]
529
+ # a Hash of new column names to aggregated DataFrames.
530
+ # Each DataFrame in the Hash must contain only one aggregated column.
531
+ # @return [DataFrame]
532
+ # summarized DataFrame.
533
+ # @example Rename column name by Hash
534
+ # group.summarize {
535
+ # {
536
+ # min_bill_length_mm: min(:bill_length_mm),
537
+ # max_bill_length_mm: max(:bill_length_mm),
538
+ # }
539
+ # }
540
+ #
541
+ # # =>
542
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
543
+ # species min_bill_length_mm max_bill_length_mm
544
+ # <string> <double> <double>
545
+ # 0 Adelie 32.1 46.0
546
+ # 1 Chinstrap 40.9 58.0
547
+ # 2 Gentoo 40.9 59.6
548
+ #
549
+ def summarize(*args, &block)
550
+ if block
551
+ agg = instance_eval(&block)
552
+ unless args.empty?
553
+ agg = [agg] if agg.is_a?(DataFrame)
554
+ agg = args.zip(agg).to_h
555
+ end
556
+ else
557
+ agg = args
558
+ end
559
+
253
560
  case agg
254
561
  when DataFrame
255
562
  agg
256
563
  when Array
257
- agg.reduce { |aggregated, df| aggregated.assign(df.to_h) }
564
+ aggregations =
565
+ agg.map do |df|
566
+ v = df.vectors[-1]
567
+ [v.key, v]
568
+ end
569
+ agg[0].assign(aggregations)
570
+ when Hash
571
+ aggregations =
572
+ agg.map do |key, df|
573
+ aggregated_keys = df.keys - @group_keys
574
+ if aggregated_keys.size > 1
575
+ message =
576
+ "accept only one column from the Hash: #{aggregated_keys.join(', ')}"
577
+ raise GroupArgumentError, message
578
+ end
579
+
580
+ v = df.vectors[-1]
581
+ [key, v]
582
+ end
583
+ agg.values[-1].drop(-1).assign(aggregations)
258
584
  else
259
585
  raise GroupArgumentError, "Unknown argument: #{agg}"
260
586
  end
261
587
  end
262
588
 
589
+ # Return grouped DataFrame only for group keys.
590
+ #
591
+ # @return [DataFrame]
592
+ # grouped DataFrame projected only for group_keys.
593
+ # @since 0.5.0
594
+ #
595
+ def grouped_frame
596
+ DataFrame.create(group_table[group_keys])
597
+ end
598
+ alias_method :none, :grouped_frame
599
+
263
600
  # Aggregating summary.
264
601
  #
265
602
  # @api private
@@ -270,37 +607,49 @@ module RedAmber
270
607
 
271
608
  private
272
609
 
273
- def build_aggregation_keys(function_name, summary_keys)
274
- if summary_keys.empty?
275
- [function_name]
276
- else
277
- summary_keys.map { |key| "#{function_name}(#{key})" }
278
- end
279
- end
280
-
281
- # @note `@group_counts.sum == @dataframe.size``
282
- def group_counts
283
- @group_counts ||= filters.map(&:sum)
610
+ def group_table
611
+ @group_table ||= build_aggregated_table
284
612
  end
285
613
 
286
- def base_table
287
- @base_table ||= begin
288
- indexes = filters.map { |filter| filter.index(true) }
289
- @dataframe.table[@group_keys].take(indexes)
614
+ def build_aggregated_table
615
+ keys = @group_keys
616
+ key = keys[0]
617
+ table = @dataframe.table
618
+
619
+ plan = Arrow::ExecutePlan.new
620
+ source_node = plan.build_source_node(table)
621
+
622
+ aggregate_node =
623
+ plan.build_aggregate_node(source_node, {
624
+ aggregations: [{ function: 'hash_count',
625
+ input: key }], keys: keys
626
+ })
627
+ expressions = keys.map { |k| Arrow::FieldExpression.new(k) }
628
+ null_count = Arrow::Function.find('is_null').execute([table[key]]).value.sum
629
+ count_field = Arrow::FieldExpression.new("count(#{key})")
630
+ if null_count.zero?
631
+ expressions << count_field
632
+ else
633
+ is_zero =
634
+ Arrow::CallExpression.new('equal', [count_field, Arrow::Int64Scalar.new(0)])
635
+ null_count_scalar = Arrow::Int64Scalar.new(null_count)
636
+ expressions <<
637
+ Arrow::CallExpression.new('if_else', [
638
+ is_zero, null_count_scalar, count_field
639
+ ])
290
640
  end
291
- end
641
+ options = Arrow::ProjectNodeOptions.new(expressions, keys + [:group_count])
642
+ project_node = plan.build_project_node(aggregate_node, options)
292
643
 
293
- def add_columns_to_table(table, keys, data_arrays)
294
- fields = table.schema.fields
295
- arrays = table.columns.map(&:data)
644
+ sink_and_start_plan(plan, project_node)
645
+ end
296
646
 
297
- keys.zip(data_arrays).each do |key, array|
298
- data = Arrow::ChunkedArray.new([array])
299
- fields << Arrow::Field.new(key, data.value_data_type)
300
- arrays << data
647
+ def build_aggregation_keys(function_name, summary_keys)
648
+ if summary_keys.empty?
649
+ [function_name]
650
+ else
651
+ summary_keys.map { |key| "#{function_name}(#{key})" }
301
652
  end
302
-
303
- Arrow::Table.new(Arrow::Schema.new(fields), arrays)
304
653
  end
305
654
 
306
655
  # Call Vector aggregating function and return an array of arrays: