red_amber 0.4.2 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.devcontainer/Dockerfile +75 -0
- data/.devcontainer/devcontainer.json +38 -0
- data/.devcontainer/onCreateCommand.sh +22 -0
- data/.rubocop.yml +11 -5
- data/CHANGELOG.md +141 -17
- data/Gemfile +5 -6
- data/README.ja.md +271 -0
- data/README.md +52 -31
- data/Rakefile +55 -0
- data/benchmark/group.yml +12 -5
- data/doc/Dev_Containers.ja.md +290 -0
- data/doc/Dev_Containers.md +292 -0
- data/doc/qmd/examples_of_red_amber.qmd +4596 -0
- data/doc/qmd/red-amber.qmd +90 -0
- data/docker/Dockerfile +2 -2
- data/docker/Gemfile +8 -3
- data/docker/docker-compose.yml +1 -1
- data/docker/readme.md +5 -5
- data/lib/red_amber/data_frame.rb +78 -4
- data/lib/red_amber/data_frame_combinable.rb +147 -119
- data/lib/red_amber/data_frame_displayable.rb +7 -6
- data/lib/red_amber/data_frame_loadsave.rb +1 -1
- data/lib/red_amber/data_frame_selectable.rb +51 -2
- data/lib/red_amber/data_frame_variable_operation.rb +6 -6
- data/lib/red_amber/group.rb +476 -127
- data/lib/red_amber/helper.rb +26 -0
- data/lib/red_amber/subframes.rb +18 -11
- data/lib/red_amber/vector.rb +45 -25
- data/lib/red_amber/vector_aggregation.rb +26 -0
- data/lib/red_amber/vector_selectable.rb +124 -40
- data/lib/red_amber/vector_string_function.rb +279 -0
- data/lib/red_amber/vector_unary_element_wise.rb +4 -0
- data/lib/red_amber/vector_updatable.rb +28 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -1
- data/red_amber.gemspec +3 -3
- metadata +19 -14
- data/docker/Gemfile.lock +0 -80
- data/docker/example +0 -74
- data/docker/notebook/examples_of_red_amber.ipynb +0 -8562
- data/docker/notebook/red-amber.ipynb +0 -188
data/lib/red_amber/group.rb
CHANGED
@@ -4,6 +4,7 @@ module RedAmber
|
|
4
4
|
# Group class
|
5
5
|
class Group
|
6
6
|
include Enumerable # This feature is experimental
|
7
|
+
include Helper
|
7
8
|
|
8
9
|
using RefineArrowTable
|
9
10
|
|
@@ -25,12 +26,7 @@ module RedAmber
|
|
25
26
|
private
|
26
27
|
|
27
28
|
# @!macro [attach] define_group_aggregation
|
28
|
-
#
|
29
|
-
# Group aggregation function `$1`.
|
30
|
-
# @param summary_keys [Array<Symbol, String>]
|
31
|
-
# summary keys.
|
32
|
-
# @return [DataFrame]
|
33
|
-
# aggregated DataFrame
|
29
|
+
# Returns aggregated DataFrame.
|
34
30
|
#
|
35
31
|
def define_group_aggregation(function)
|
36
32
|
define_method(function) do |*summary_keys|
|
@@ -54,7 +50,7 @@ module RedAmber
|
|
54
50
|
# @param group_keys [Array<Symbol, String>]
|
55
51
|
# keys for grouping.
|
56
52
|
# @return [Group]
|
57
|
-
# Group object.
|
53
|
+
# Group object. It inspects grouped columns and its count.
|
58
54
|
# @example
|
59
55
|
# Group.new(penguins, :species)
|
60
56
|
#
|
@@ -78,13 +74,93 @@ module RedAmber
|
|
78
74
|
@group = @dataframe.table.group(*@group_keys)
|
79
75
|
end
|
80
76
|
|
81
|
-
|
77
|
+
# @!macro group_aggregation
|
78
|
+
# @param group_keys [Array<Symbol, String>]
|
79
|
+
# keys for grouping.
|
80
|
+
# @return [DataFrame]
|
81
|
+
# aggregated DataFrame
|
82
|
+
|
83
|
+
# Whether all elements in each group evaluate to true.
|
84
|
+
#
|
85
|
+
# @!method all(*group_keys)
|
86
|
+
# @macro group_aggregation
|
87
|
+
# @example For boolean columns by default.
|
88
|
+
# dataframe
|
89
|
+
#
|
90
|
+
# # =>
|
91
|
+
# #<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000230dc>
|
92
|
+
# x y z
|
93
|
+
# <uint8> <string> <boolean>
|
94
|
+
# 0 1 A false
|
95
|
+
# 1 2 A true
|
96
|
+
# 2 3 B false
|
97
|
+
# 3 4 B (nil)
|
98
|
+
# 4 5 B true
|
99
|
+
# 5 6 C false
|
100
|
+
#
|
101
|
+
# dataframe.group(:y).all
|
102
|
+
#
|
103
|
+
# # =>
|
104
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000fc08>
|
105
|
+
# y all(z)
|
106
|
+
# <string> <boolean>
|
107
|
+
# 0 A false
|
108
|
+
# 1 B false
|
109
|
+
# 2 C false
|
110
|
+
#
|
111
|
+
define_group_aggregation :all
|
112
|
+
|
113
|
+
# Whether any elements in each group evaluate to true.
|
114
|
+
#
|
115
|
+
# @!method any(*group_keys)
|
116
|
+
# @macro group_aggregation
|
117
|
+
# @example For boolean columns by default.
|
118
|
+
# dataframe.group(:y).any
|
119
|
+
#
|
120
|
+
# # =>
|
121
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000000117ec>
|
122
|
+
# y any(z)
|
123
|
+
# <string> <boolean>
|
124
|
+
# 0 A true
|
125
|
+
# 1 B true
|
126
|
+
# 2 C false
|
127
|
+
#
|
128
|
+
define_group_aggregation :any
|
129
|
+
|
130
|
+
# Count the number of non-nil values in each group.
|
131
|
+
# If counts are the same (and do not include NaN or nil),
|
132
|
+
# columns for counts are unified.
|
133
|
+
#
|
134
|
+
# @!method count(*group_keys)
|
135
|
+
# @macro group_aggregation
|
136
|
+
# @example Show counts for each group.
|
137
|
+
# dataframe.group(:y).count
|
138
|
+
#
|
139
|
+
# # =>
|
140
|
+
# #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000011ea04>
|
141
|
+
# y count(x) count(z)
|
142
|
+
# <string> <int64> <int64>
|
143
|
+
# 0 A 2 2
|
144
|
+
# 1 B 3 2
|
145
|
+
# 2 C 1 1
|
146
|
+
#
|
147
|
+
# dataframe.group(:z).count
|
148
|
+
# # same as dataframe.group(:z).count(:x, :y)
|
149
|
+
#
|
150
|
+
# # =>
|
151
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000122834>
|
152
|
+
# z count
|
153
|
+
# <boolean> <int64>
|
154
|
+
# 0 false 3
|
155
|
+
# 1 true 2
|
156
|
+
# 2 (nil) 1
|
157
|
+
#
|
158
|
+
define_group_aggregation :count
|
82
159
|
alias_method :__count, :count
|
83
160
|
private :__count
|
84
161
|
|
85
|
-
def count(*
|
86
|
-
df = __count(
|
87
|
-
# if counts are the same (and do not include NaN or nil), aggregate count columns.
|
162
|
+
def count(*group_keys)
|
163
|
+
df = __count(group_keys)
|
88
164
|
if df.pick(@group_keys.size..).to_h.values.uniq.size == 1
|
89
165
|
df.pick(0..@group_keys.size).rename { [keys[-1], :count] }
|
90
166
|
else
|
@@ -92,19 +168,213 @@ module RedAmber
|
|
92
168
|
end
|
93
169
|
end
|
94
170
|
|
95
|
-
|
171
|
+
# Returns each record group size as a DataFrame.
|
172
|
+
#
|
173
|
+
# @return [DataFrame]
|
174
|
+
# DataFrame consists of:
|
175
|
+
# - Group key columns.
|
176
|
+
# - Result columns by group aggregation.
|
177
|
+
# @example
|
178
|
+
# penguins.group(:species).group_count
|
179
|
+
#
|
180
|
+
# # =>
|
181
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
|
182
|
+
# species group_count
|
183
|
+
# <string> <uint8>
|
184
|
+
# 0 Adelie 152
|
185
|
+
# 1 Chinstrap 68
|
186
|
+
# 2 Gentoo 124
|
187
|
+
#
|
188
|
+
def group_count
|
189
|
+
DataFrame.create(group_table)
|
190
|
+
end
|
191
|
+
alias_method :count_all, :group_count
|
96
192
|
|
97
|
-
|
193
|
+
# Count the unique values in each group.
|
194
|
+
#
|
195
|
+
# @!method count_uniq(*group_keys)
|
196
|
+
# @macro group_aggregation
|
197
|
+
# @example Show counts for each group.
|
198
|
+
# dataframe.group(:y).count_uniq
|
199
|
+
#
|
200
|
+
# # =>
|
201
|
+
# #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000011ea04>
|
202
|
+
# y count_uniq(x)
|
203
|
+
# <string> <int64>
|
204
|
+
# 0 A 2
|
205
|
+
# 1 B 3
|
206
|
+
# 2 C 1
|
207
|
+
#
|
208
|
+
define_group_aggregation :count_distinct
|
209
|
+
def count_uniq(*group_keys)
|
210
|
+
df = count_distinct(*group_keys)
|
211
|
+
df.rename do
|
212
|
+
keys_org = keys.select { _1.start_with?('count_distinct') }
|
213
|
+
keys_renamed = keys_org.map { _1.to_s.gsub('distinct', 'uniq') }
|
214
|
+
keys_org.zip keys_renamed
|
215
|
+
end
|
216
|
+
end
|
98
217
|
|
99
|
-
|
218
|
+
# Compute maximum of values in each group for numeric columns.
|
219
|
+
#
|
220
|
+
# @!method max(*group_keys)
|
221
|
+
# @macro group_aggregation
|
222
|
+
# @example
|
223
|
+
# dataframe.group(:y).max
|
224
|
+
#
|
225
|
+
# # =>
|
226
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000014ae74>
|
227
|
+
# y max(x)
|
228
|
+
# <string> <uint8>
|
229
|
+
# 0 A 2
|
230
|
+
# 1 B 5
|
231
|
+
# 2 C 6
|
232
|
+
#
|
233
|
+
define_group_aggregation :max
|
100
234
|
|
101
|
-
|
235
|
+
# Compute mean of values in each group for numeric columns.
|
236
|
+
#
|
237
|
+
# @!method mean(*group_keys)
|
238
|
+
# @macro group_aggregation
|
239
|
+
# @example
|
240
|
+
# dataframe.group(:y).mean
|
241
|
+
#
|
242
|
+
# # =>
|
243
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000138a8>
|
244
|
+
# y mean(x)
|
245
|
+
# <string> <double>
|
246
|
+
# 0 A 1.5
|
247
|
+
# 1 B 4.0
|
248
|
+
# 2 C 6.0
|
249
|
+
#
|
250
|
+
define_group_aggregation :mean
|
102
251
|
|
103
|
-
|
252
|
+
# Compute median of values in each group for numeric columns.
|
253
|
+
#
|
254
|
+
# @!method median(*group_keys)
|
255
|
+
# @macro group_aggregation
|
256
|
+
# @example
|
257
|
+
# dataframe.group(:y).median
|
258
|
+
#
|
259
|
+
# # =>
|
260
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000138a8>
|
261
|
+
# y median(x)
|
262
|
+
# <string> <double>
|
263
|
+
# 0 A 1.5
|
264
|
+
# 1 B 4.0
|
265
|
+
# 2 C 6.0
|
266
|
+
#
|
267
|
+
define_group_aggregation :approximate_median
|
268
|
+
def median(*group_keys)
|
269
|
+
df = approximate_median(*group_keys)
|
270
|
+
df.rename do
|
271
|
+
keys_org = keys.select { _1.start_with?('approximate_') }
|
272
|
+
keys_renamed = keys_org.map { _1.to_s.delete_prefix('approximate_') }
|
273
|
+
keys_org.zip keys_renamed
|
274
|
+
end
|
275
|
+
end
|
104
276
|
|
105
|
-
|
277
|
+
# Compute minimum of values in each group for numeric columns.
|
278
|
+
#
|
279
|
+
# @!method min(*group_keys)
|
280
|
+
# @macro group_aggregation
|
281
|
+
# @example
|
282
|
+
# dataframe.group(:y).min
|
283
|
+
#
|
284
|
+
# # =>
|
285
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000018f38>
|
286
|
+
# y min(x)
|
287
|
+
# <string> <uint8>
|
288
|
+
# 0 A 1
|
289
|
+
# 1 B 3
|
290
|
+
# 2 C 6
|
291
|
+
#
|
292
|
+
define_group_aggregation :min
|
106
293
|
|
107
|
-
|
294
|
+
# Get one value from each group.
|
295
|
+
#
|
296
|
+
# @!method one(*group_keys)
|
297
|
+
# @macro group_aggregation
|
298
|
+
# @example
|
299
|
+
# dataframe.group(:y).one
|
300
|
+
#
|
301
|
+
# # =>
|
302
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000002885c>
|
303
|
+
# y one(x)
|
304
|
+
# <string> <uint8>
|
305
|
+
# 0 A 1
|
306
|
+
# 1 B 3
|
307
|
+
# 2 C 6
|
308
|
+
#
|
309
|
+
define_group_aggregation :one
|
310
|
+
|
311
|
+
# Compute product of values in each group for numeric columns.
|
312
|
+
#
|
313
|
+
# @!method product(*group_keys)
|
314
|
+
# @macro group_aggregation
|
315
|
+
# @example
|
316
|
+
# dataframe.group(:y).product
|
317
|
+
#
|
318
|
+
# # =>
|
319
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000021a84>
|
320
|
+
# y product(x)
|
321
|
+
# <string> <uint64>
|
322
|
+
# 0 A 2
|
323
|
+
# 1 B 60
|
324
|
+
# 2 C 6
|
325
|
+
#
|
326
|
+
define_group_aggregation :product
|
327
|
+
|
328
|
+
# Compute standard deviation of values in each group for numeric columns.
|
329
|
+
#
|
330
|
+
# @!method stddev(*group_keys)
|
331
|
+
# @macro group_aggregation
|
332
|
+
# @example
|
333
|
+
# dataframe.group(:y).stddev
|
334
|
+
#
|
335
|
+
# # =>
|
336
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000002be6c>
|
337
|
+
# y stddev(x)
|
338
|
+
# <string> <double>
|
339
|
+
# 0 A 0.5
|
340
|
+
# 1 B 0.816
|
341
|
+
# 2 C 0.0
|
342
|
+
#
|
343
|
+
define_group_aggregation :stddev
|
344
|
+
|
345
|
+
# Compute sum of values in each group for numeric columns.
|
346
|
+
#
|
347
|
+
# @!method sum(*group_keys)
|
348
|
+
# @macro group_aggregation
|
349
|
+
# @example
|
350
|
+
# dataframe.group(:y).sum
|
351
|
+
#
|
352
|
+
# # =>
|
353
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000032a14>
|
354
|
+
# y sum(x)
|
355
|
+
# <string> <uint64>
|
356
|
+
# 0 A 3
|
357
|
+
# 1 B 12
|
358
|
+
# 2 C 6
|
359
|
+
#
|
360
|
+
define_group_aggregation :sum
|
361
|
+
|
362
|
+
# Compute variance of values in each group for numeric columns.
|
363
|
+
#
|
364
|
+
# @!method variance(*group_keys)
|
365
|
+
# @macro group_aggregation
|
366
|
+
# @example
|
367
|
+
# dataframe.group(:y).variance
|
368
|
+
#
|
369
|
+
# # =>
|
370
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000003b1dc>
|
371
|
+
# y variance(x)
|
372
|
+
# <string> <double>
|
373
|
+
# 0 A 0.25
|
374
|
+
# 1 B 0.667
|
375
|
+
# 2 C 0.0
|
376
|
+
#
|
377
|
+
define_group_aggregation :variance
|
108
378
|
|
109
379
|
# Returns Array of boolean filters to select each record in the Group.
|
110
380
|
#
|
@@ -114,15 +384,27 @@ module RedAmber
|
|
114
384
|
#
|
115
385
|
def filters
|
116
386
|
@filters ||= begin
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
387
|
+
group_values = group_table[group_keys].each_record.map(&:to_a)
|
388
|
+
|
389
|
+
Enumerator.new(group_table.n_rows) do |yielder|
|
390
|
+
group_values.each do |values|
|
391
|
+
booleans =
|
392
|
+
values.map.with_index do |value, i|
|
393
|
+
column = @dataframe[group_keys[i]].data
|
394
|
+
if value.nil?
|
395
|
+
Arrow::Function.find('is_null').execute([column])
|
396
|
+
elsif value.is_a?(Float) && value.nan?
|
397
|
+
Arrow::Function.find('is_nan').execute([column])
|
398
|
+
else
|
399
|
+
Arrow::Function.find('equal').execute([column, value])
|
400
|
+
end
|
401
|
+
end
|
402
|
+
filter =
|
403
|
+
booleans.reduce do |result, datum|
|
404
|
+
Arrow::Function.find('and_kleene').execute([result, datum])
|
405
|
+
end
|
406
|
+
yielder << Vector.create(filter.value)
|
407
|
+
end
|
126
408
|
end
|
127
409
|
end
|
128
410
|
end
|
@@ -147,119 +429,174 @@ module RedAmber
|
|
147
429
|
# group size.
|
148
430
|
#
|
149
431
|
def each
|
150
|
-
filters
|
151
432
|
return enum_for(:each) unless block_given?
|
152
433
|
|
153
|
-
|
154
|
-
yield @dataframe
|
434
|
+
filters.each do |filter|
|
435
|
+
yield @dataframe.filter(filter)
|
155
436
|
end
|
156
437
|
@filters.size
|
157
438
|
end
|
158
439
|
|
159
|
-
#
|
440
|
+
# String representation of self.
|
160
441
|
#
|
161
|
-
# @return [
|
162
|
-
#
|
163
|
-
# - Group key columns.
|
164
|
-
# - Result columns by group aggregation.
|
442
|
+
# @return [String]
|
443
|
+
# show information of self as a String.
|
165
444
|
# @example
|
166
|
-
# penguins.group(:species).
|
445
|
+
# puts penguins.group(:species).inspect
|
167
446
|
#
|
168
447
|
# # =>
|
169
|
-
# #<RedAmber::
|
448
|
+
# #<RedAmber::Group : 0x0000000000003a98>
|
170
449
|
# species group_count
|
171
450
|
# <string> <uint8>
|
172
451
|
# 0 Adelie 152
|
173
452
|
# 1 Chinstrap 68
|
174
453
|
# 2 Gentoo 124
|
175
454
|
#
|
176
|
-
def
|
177
|
-
|
455
|
+
def inspect
|
456
|
+
"#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
|
178
457
|
end
|
179
458
|
|
180
|
-
#
|
459
|
+
# Summarize Group by aggregation functions from the block.
|
181
460
|
#
|
182
|
-
# @
|
183
|
-
#
|
184
|
-
#
|
185
|
-
#
|
461
|
+
# @overload summarize
|
462
|
+
# Summarize by a function.
|
463
|
+
# @yieldparam group [Group]
|
464
|
+
# passes group object self.
|
465
|
+
# @yieldreturn [DataFrame]
|
466
|
+
# @yieldreturn [DataFrame, Array<DataFrame>, Hash{Symbol, String => DataFrame}]
|
467
|
+
# an aggregated DataFrame or an array of aggregated DataFrames.
|
468
|
+
# @return [DataFrame]
|
469
|
+
# summarized DataFrame.
|
470
|
+
# @example Single function and single variable
|
471
|
+
# group = penguins.group(:species)
|
472
|
+
# group
|
186
473
|
#
|
187
|
-
#
|
188
|
-
#
|
189
|
-
#
|
190
|
-
#
|
191
|
-
#
|
192
|
-
#
|
193
|
-
#
|
474
|
+
# # =>
|
475
|
+
# #<RedAmber::Group : 0x000000000000c314>
|
476
|
+
# species group_count
|
477
|
+
# <string> <uint8>
|
478
|
+
# 0 Adelie 152
|
479
|
+
# 1 Chinstrap 68
|
480
|
+
# 2 Gentoo 124
|
194
481
|
#
|
195
|
-
|
196
|
-
"#<#{self.class} : #{format('0x%016x', object_id)}>\n#{count(@group_keys)}"
|
197
|
-
end
|
198
|
-
|
199
|
-
# Summarize Group by aggregation functions from the block.
|
482
|
+
# group.summarize { mean(:bill_length_mm) }
|
200
483
|
#
|
201
|
-
#
|
202
|
-
#
|
203
|
-
#
|
204
|
-
#
|
205
|
-
#
|
206
|
-
#
|
207
|
-
#
|
208
|
-
# group = penguins.group(:species)
|
209
|
-
# group
|
484
|
+
# # =>
|
485
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
|
486
|
+
# species mean(bill_length_mm)
|
487
|
+
# <string> <double>
|
488
|
+
# 0 Adelie 38.79
|
489
|
+
# 1 Chinstrap 48.83
|
490
|
+
# 2 Gentoo 47.5
|
210
491
|
#
|
211
|
-
#
|
212
|
-
#
|
213
|
-
# species count
|
214
|
-
# <string> <uint8>
|
215
|
-
# 0 Adelie 152
|
216
|
-
# 1 Chinstrap 68
|
217
|
-
# 2 Gentoo 124
|
492
|
+
# @example Single function only
|
493
|
+
# group.summarize { mean }
|
218
494
|
#
|
219
|
-
#
|
495
|
+
# # =>
|
496
|
+
# #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
|
497
|
+
# species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
|
498
|
+
# <string> <double> <double> ... <double>
|
499
|
+
# 0 Adelie 38.79 18.35 ... 2008.01
|
500
|
+
# 1 Chinstrap 48.83 18.42 ... 2007.97
|
501
|
+
# 2 Gentoo 47.5 14.98 ... 2008.08
|
220
502
|
#
|
221
|
-
#
|
222
|
-
#
|
223
|
-
# species mean(bill_length_mm)
|
224
|
-
# <string> <double>
|
225
|
-
# 0 Adelie 38.79
|
226
|
-
# 1 Chinstrap 48.83
|
227
|
-
# 2 Gentoo 47.5
|
503
|
+
# @overload summarize
|
504
|
+
# Summarize by multiple functions.
|
228
505
|
#
|
229
|
-
#
|
230
|
-
#
|
506
|
+
# @yieldparam group [Group]
|
507
|
+
# passes group object self.
|
508
|
+
# @yieldreturn [Array<DataFrame>]
|
509
|
+
# an array of aggregated DataFrames.
|
510
|
+
# @return [DataFrame]
|
511
|
+
# summarized DataFrame.
|
512
|
+
# @example Multiple functions
|
513
|
+
# group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
|
231
514
|
#
|
232
|
-
#
|
233
|
-
#
|
234
|
-
#
|
235
|
-
#
|
236
|
-
#
|
237
|
-
#
|
238
|
-
#
|
515
|
+
# # =>
|
516
|
+
# #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
|
517
|
+
# species min(bill_length_mm) max(bill_length_mm)
|
518
|
+
# <string> <double> <double>
|
519
|
+
# 0 Adelie 32.1 46.0
|
520
|
+
# 1 Chinstrap 40.9 58.0
|
521
|
+
# 2 Gentoo 40.9 59.6
|
239
522
|
#
|
240
|
-
# @
|
241
|
-
#
|
523
|
+
# @overload summarize
|
524
|
+
# Summarize by functions, naming each result column by its Hash key.
|
242
525
|
#
|
243
|
-
#
|
244
|
-
#
|
245
|
-
#
|
246
|
-
#
|
247
|
-
#
|
248
|
-
#
|
249
|
-
#
|
250
|
-
#
|
251
|
-
|
252
|
-
|
526
|
+
# @yieldparam group [Group]
|
527
|
+
# passes group object self.
|
528
|
+
# @yieldreturn [Hash{Symbol, String => DataFrame}]
|
529
|
+
# a Hash of new column names to aggregated DataFrames.
|
530
|
+
# The DataFrame must return only one aggregated column.
|
531
|
+
# @return [DataFrame]
|
532
|
+
# summarized DataFrame.
|
533
|
+
# @example Rename column name by Hash
|
534
|
+
# group.summarize {
|
535
|
+
# {
|
536
|
+
# min_bill_length_mm: min(:bill_length_mm),
|
537
|
+
# max_bill_length_mm: max(:bill_length_mm),
|
538
|
+
# }
|
539
|
+
# }
|
540
|
+
#
|
541
|
+
# # =>
|
542
|
+
# #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
|
543
|
+
# species min_bill_length_mm max_bill_length_mm
|
544
|
+
# <string> <double> <double>
|
545
|
+
# 0 Adelie 32.1 46.0
|
546
|
+
# 1 Chinstrap 40.9 58.0
|
547
|
+
# 2 Gentoo 40.9 59.6
|
548
|
+
#
|
549
|
+
def summarize(*args, &block)
|
550
|
+
if block
|
551
|
+
agg = instance_eval(&block)
|
552
|
+
unless args.empty?
|
553
|
+
agg = [agg] if agg.is_a?(DataFrame)
|
554
|
+
agg = args.zip(agg).to_h
|
555
|
+
end
|
556
|
+
else
|
557
|
+
agg = args
|
558
|
+
end
|
559
|
+
|
253
560
|
case agg
|
254
561
|
when DataFrame
|
255
562
|
agg
|
256
563
|
when Array
|
257
|
-
|
564
|
+
aggregations =
|
565
|
+
agg.map do |df|
|
566
|
+
v = df.vectors[-1]
|
567
|
+
[v.key, v]
|
568
|
+
end
|
569
|
+
agg[0].assign(aggregations)
|
570
|
+
when Hash
|
571
|
+
aggregations =
|
572
|
+
agg.map do |key, df|
|
573
|
+
aggregated_keys = df.keys - @group_keys
|
574
|
+
if aggregated_keys.size > 1
|
575
|
+
message =
|
576
|
+
"accept only one column from the Hash: #{aggregated_keys.join(', ')}"
|
577
|
+
raise GroupArgumentError, message
|
578
|
+
end
|
579
|
+
|
580
|
+
v = df.vectors[-1]
|
581
|
+
[key, v]
|
582
|
+
end
|
583
|
+
agg.values[-1].drop(-1).assign(aggregations)
|
258
584
|
else
|
259
585
|
raise GroupArgumentError, "Unknown argument: #{agg}"
|
260
586
|
end
|
261
587
|
end
|
262
588
|
|
589
|
+
# Return grouped DataFrame only for group keys.
|
590
|
+
#
|
591
|
+
# @return [DataFrame]
|
592
|
+
# grouped DataFrame projected only for group_keys.
|
593
|
+
# @since 0.5.0
|
594
|
+
#
|
595
|
+
def grouped_frame
|
596
|
+
DataFrame.create(group_table[group_keys])
|
597
|
+
end
|
598
|
+
alias_method :none, :grouped_frame
|
599
|
+
|
263
600
|
# Aggregating summary.
|
264
601
|
#
|
265
602
|
# @api private
|
@@ -270,37 +607,49 @@ module RedAmber
|
|
270
607
|
|
271
608
|
private
|
272
609
|
|
273
|
-
def
|
274
|
-
|
275
|
-
[function_name]
|
276
|
-
else
|
277
|
-
summary_keys.map { |key| "#{function_name}(#{key})" }
|
278
|
-
end
|
279
|
-
end
|
280
|
-
|
281
|
-
# @note `@group_counts.sum == @dataframe.size``
|
282
|
-
def group_counts
|
283
|
-
@group_counts ||= filters.map(&:sum)
|
610
|
+
def group_table
|
611
|
+
@group_table ||= build_aggregated_table
|
284
612
|
end
|
285
613
|
|
286
|
-
def
|
287
|
-
|
288
|
-
|
289
|
-
|
614
|
+
def build_aggregated_table
|
615
|
+
keys = @group_keys
|
616
|
+
key = keys[0]
|
617
|
+
table = @dataframe.table
|
618
|
+
|
619
|
+
plan = Arrow::ExecutePlan.new
|
620
|
+
source_node = plan.build_source_node(table)
|
621
|
+
|
622
|
+
aggregate_node =
|
623
|
+
plan.build_aggregate_node(source_node, {
|
624
|
+
aggregations: [{ function: 'hash_count',
|
625
|
+
input: key }], keys: keys
|
626
|
+
})
|
627
|
+
expressions = keys.map { |k| Arrow::FieldExpression.new(k) }
|
628
|
+
null_count = Arrow::Function.find('is_null').execute([table[key]]).value.sum
|
629
|
+
count_field = Arrow::FieldExpression.new("count(#{key})")
|
630
|
+
if null_count.zero?
|
631
|
+
expressions << count_field
|
632
|
+
else
|
633
|
+
is_zero =
|
634
|
+
Arrow::CallExpression.new('equal', [count_field, Arrow::Int64Scalar.new(0)])
|
635
|
+
null_count_scalar = Arrow::Int64Scalar.new(null_count)
|
636
|
+
expressions <<
|
637
|
+
Arrow::CallExpression.new('if_else', [
|
638
|
+
is_zero, null_count_scalar, count_field
|
639
|
+
])
|
290
640
|
end
|
291
|
-
|
641
|
+
options = Arrow::ProjectNodeOptions.new(expressions, keys + [:group_count])
|
642
|
+
project_node = plan.build_project_node(aggregate_node, options)
|
292
643
|
|
293
|
-
|
294
|
-
|
295
|
-
arrays = table.columns.map(&:data)
|
644
|
+
sink_and_start_plan(plan, project_node)
|
645
|
+
end
|
296
646
|
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
647
|
+
def build_aggregation_keys(function_name, summary_keys)
|
648
|
+
if summary_keys.empty?
|
649
|
+
[function_name]
|
650
|
+
else
|
651
|
+
summary_keys.map { |key| "#{function_name}(#{key})" }
|
301
652
|
end
|
302
|
-
|
303
|
-
Arrow::Table.new(Arrow::Schema.new(fields), arrays)
|
304
653
|
end
|
305
654
|
|
306
655
|
# Call Vector aggregating function and return an array of arrays:
|