red_amber 0.4.2 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.devcontainer/Dockerfile +75 -0
- data/.devcontainer/devcontainer.json +38 -0
- data/.devcontainer/onCreateCommand.sh +22 -0
- data/.rubocop.yml +11 -5
- data/CHANGELOG.md +141 -17
- data/Gemfile +5 -6
- data/README.ja.md +271 -0
- data/README.md +52 -31
- data/Rakefile +55 -0
- data/benchmark/group.yml +12 -5
- data/doc/Dev_Containers.ja.md +290 -0
- data/doc/Dev_Containers.md +292 -0
- data/doc/qmd/examples_of_red_amber.qmd +4596 -0
- data/doc/qmd/red-amber.qmd +90 -0
- data/docker/Dockerfile +2 -2
- data/docker/Gemfile +8 -3
- data/docker/docker-compose.yml +1 -1
- data/docker/readme.md +5 -5
- data/lib/red_amber/data_frame.rb +78 -4
- data/lib/red_amber/data_frame_combinable.rb +147 -119
- data/lib/red_amber/data_frame_displayable.rb +7 -6
- data/lib/red_amber/data_frame_loadsave.rb +1 -1
- data/lib/red_amber/data_frame_selectable.rb +51 -2
- data/lib/red_amber/data_frame_variable_operation.rb +6 -6
- data/lib/red_amber/group.rb +476 -127
- data/lib/red_amber/helper.rb +26 -0
- data/lib/red_amber/subframes.rb +18 -11
- data/lib/red_amber/vector.rb +45 -25
- data/lib/red_amber/vector_aggregation.rb +26 -0
- data/lib/red_amber/vector_selectable.rb +124 -40
- data/lib/red_amber/vector_string_function.rb +279 -0
- data/lib/red_amber/vector_unary_element_wise.rb +4 -0
- data/lib/red_amber/vector_updatable.rb +28 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -1
- data/red_amber.gemspec +3 -3
- metadata +19 -14
- data/docker/Gemfile.lock +0 -80
- data/docker/example +0 -74
- data/docker/notebook/examples_of_red_amber.ipynb +0 -8562
- data/docker/notebook/red-amber.ipynb +0 -188
data/lib/red_amber/group.rb
CHANGED
@@ -4,6 +4,7 @@ module RedAmber
|
|
4
4
|
# Group class
|
5
5
|
class Group
|
6
6
|
include Enumerable # This feature is experimental
|
7
|
+
include Helper
|
7
8
|
|
8
9
|
using RefineArrowTable
|
9
10
|
|
@@ -25,12 +26,7 @@ module RedAmber
|
|
25
26
|
private
|
26
27
|
|
27
28
|
# @!macro [attach] define_group_aggregation
|
28
|
-
#
|
29
|
-
# Group aggregation function `$1`.
|
30
|
-
# @param summary_keys [Array<Symbol, String>]
|
31
|
-
# summary keys.
|
32
|
-
# @return [DataFrame]
|
33
|
-
# aggregated DataFrame
|
29
|
+
# Returns aggregated DataFrame.
|
34
30
|
#
|
35
31
|
def define_group_aggregation(function)
|
36
32
|
define_method(function) do |*summary_keys|
|
@@ -54,7 +50,7 @@ module RedAmber
|
|
54
50
|
# @param group_keys [Array<Symbol, String>]
|
55
51
|
# keys for grouping.
|
56
52
|
# @return [Group]
|
57
|
-
# Group object.
|
53
|
+
# Group object. Inspecting it shows the grouped columns and their counts.
|
58
54
|
# @example
|
59
55
|
# Group.new(penguins, :species)
|
60
56
|
#
|
@@ -78,13 +74,93 @@ module RedAmber
|
|
78
74
|
@group = @dataframe.table.group(*@group_keys)
|
79
75
|
end
|
80
76
|
|
81
|
-
|
77
|
+
# @!macro group_aggregation
|
78
|
+
# @param group_keys [Array<Symbol, String>]
|
79
|
+
# keys for grouping.
|
80
|
+
# @return [DataFrame]
|
81
|
+
# aggregated DataFrame
|
82
|
+
|
83
|
+
# Whether all elements in each group evaluate to true.
|
84
|
+
#
|
85
|
+
# @!method all(*group_keys)
|
86
|
+
# @macro group_aggregation
|
87
|
+
# @example For boolean columns by default.
|
88
|
+
# dataframe
|
89
|
+
#
|
90
|
+
# # =>
|
91
|
+
# #<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000230dc>
|
92
|
+
# x y z
|
93
|
+
# <uint8> <string> <boolean>
|
94
|
+
# 0 1 A false
|
95
|
+
# 1 2 A true
|
96
|
+
# 2 3 B false
|
97
|
+
# 3 4 B (nil)
|
98
|
+
# 4 5 B true
|
99
|
+
# 5 6 C false
|
100
|
+
#
|
101
|
+
# dataframe.group(:y).all
|
102
|
+
#
|
103
|
+
# # =>
|
104
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000fc08>
|
105
|
+
# y all(z)
|
106
|
+
# <string> <boolean>
|
107
|
+
# 0 A false
|
108
|
+
# 1 B false
|
109
|
+
# 2 C false
|
110
|
+
#
|
111
|
+
define_group_aggregation :all
|
112
|
+
|
113
|
+
# Whether any elements in each group evaluate to true.
|
114
|
+
#
|
115
|
+
# @!method any(*group_keys)
|
116
|
+
# @macro group_aggregation
|
117
|
+
# @example For boolean columns by default.
|
118
|
+
# dataframe.group(:y).any
|
119
|
+
#
|
120
|
+
# # =>
|
121
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000000117ec>
|
122
|
+
# y any(z)
|
123
|
+
# <string> <boolean>
|
124
|
+
# 0 A true
|
125
|
+
# 1 B true
|
126
|
+
# 2 C false
|
127
|
+
#
|
128
|
+
define_group_aggregation :any
|
129
|
+
|
130
|
+
# Count the number of non-nil values in each group.
|
131
|
+
# If counts are the same (and do not include NaN or nil),
|
132
|
+
# columns for counts are unified.
|
133
|
+
#
|
134
|
+
# @!method count(*group_keys)
|
135
|
+
# @macro group_aggregation
|
136
|
+
# @example Show counts for each group.
|
137
|
+
# dataframe.group(:y).count
|
138
|
+
#
|
139
|
+
# # =>
|
140
|
+
# #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000011ea04>
|
141
|
+
# y count(x) count(z)
|
142
|
+
# <string> <int64> <int64>
|
143
|
+
# 0 A 2 2
|
144
|
+
# 1 B 3 2
|
145
|
+
# 2 C 1 1
|
146
|
+
#
|
147
|
+
# dataframe.group(:z).count
|
148
|
+
# # same as dataframe.group(:z).count(:x, :y)
|
149
|
+
#
|
150
|
+
# # =>
|
151
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000122834>
|
152
|
+
# z count
|
153
|
+
# <boolean> <int64>
|
154
|
+
# 0 false 3
|
155
|
+
# 1 true 2
|
156
|
+
# 2 (nil) 1
|
157
|
+
#
|
158
|
+
define_group_aggregation :count
|
82
159
|
alias_method :__count, :count
|
83
160
|
private :__count
|
84
161
|
|
85
|
-
def count(*
|
86
|
-
df = __count(
|
87
|
-
# if counts are the same (and do not include NaN or nil), aggregate count columns.
|
162
|
+
def count(*group_keys)
|
163
|
+
df = __count(group_keys)
|
88
164
|
if df.pick(@group_keys.size..).to_h.values.uniq.size == 1
|
89
165
|
df.pick(0..@group_keys.size).rename { [keys[-1], :count] }
|
90
166
|
else
|
@@ -92,19 +168,213 @@ module RedAmber
|
|
92
168
|
end
|
93
169
|
end
|
94
170
|
|
95
|
-
|
171
|
+
# Returns each record group size as a DataFrame.
|
172
|
+
#
|
173
|
+
# @return [DataFrame]
|
174
|
+
# DataFrame consists of:
|
175
|
+
# - Group key columns.
|
176
|
+
# - Result columns by group aggregation.
|
177
|
+
# @example
|
178
|
+
# penguins.group(:species).group_count
|
179
|
+
#
|
180
|
+
# # =>
|
181
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003a70>
|
182
|
+
# species group_count
|
183
|
+
# <string> <uint8>
|
184
|
+
# 0 Adelie 152
|
185
|
+
# 1 Chinstrap 68
|
186
|
+
# 2 Gentoo 124
|
187
|
+
#
|
188
|
+
def group_count
|
189
|
+
DataFrame.create(group_table)
|
190
|
+
end
|
191
|
+
alias_method :count_all, :group_count
|
96
192
|
|
97
|
-
|
193
|
+
# Count the unique values in each group.
|
194
|
+
#
|
195
|
+
# @!method count_uniq(*group_keys)
|
196
|
+
# @macro group_aggregation
|
197
|
+
# @example Show counts for each group.
|
198
|
+
# dataframe.group(:y).count_uniq
|
199
|
+
#
|
200
|
+
# # =>
|
201
|
+
# #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000011ea04>
|
202
|
+
# y count_uniq(x)
|
203
|
+
# <string> <int64>
|
204
|
+
# 0 A 2
|
205
|
+
# 1 B 3
|
206
|
+
# 2 C 1
|
207
|
+
#
|
208
|
+
define_group_aggregation :count_distinct
|
209
|
+
def count_uniq(*group_keys)
|
210
|
+
df = count_distinct(*group_keys)
|
211
|
+
df.rename do
|
212
|
+
keys_org = keys.select { _1.start_with?('count_distinct') }
|
213
|
+
keys_renamed = keys_org.map { _1.to_s.gsub('distinct', 'uniq') }
|
214
|
+
keys_org.zip keys_renamed
|
215
|
+
end
|
216
|
+
end
|
98
217
|
|
99
|
-
|
218
|
+
# Compute maximum of values in each group for numeric columns.
|
219
|
+
#
|
220
|
+
# @!method max(*group_keys)
|
221
|
+
# @macro group_aggregation
|
222
|
+
# @example
|
223
|
+
# dataframe.group(:y).max
|
224
|
+
#
|
225
|
+
# # =>
|
226
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000014ae74>
|
227
|
+
# y max(x)
|
228
|
+
# <string> <uint8>
|
229
|
+
# 0 A 2
|
230
|
+
# 1 B 5
|
231
|
+
# 2 C 6
|
232
|
+
#
|
233
|
+
define_group_aggregation :max
|
100
234
|
|
101
|
-
|
235
|
+
# Compute mean of values in each group for numeric columns.
|
236
|
+
#
|
237
|
+
# @!method mean(*group_keys)
|
238
|
+
# @macro group_aggregation
|
239
|
+
# @example
|
240
|
+
# dataframe.group(:y).mean
|
241
|
+
#
|
242
|
+
# # =>
|
243
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000138a8>
|
244
|
+
# y mean(x)
|
245
|
+
# <string> <double>
|
246
|
+
# 0 A 1.5
|
247
|
+
# 1 B 4.0
|
248
|
+
# 2 C 6.0
|
249
|
+
#
|
250
|
+
define_group_aggregation :mean
|
102
251
|
|
103
|
-
|
252
|
+
# Compute median of values in each group for numeric columns.
|
253
|
+
#
|
254
|
+
# @!method median(*group_keys)
|
255
|
+
# @macro group_aggregation
|
256
|
+
# @example
|
257
|
+
# dataframe.group(:y).median
|
258
|
+
#
|
259
|
+
# # =>
|
260
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000138a8>
|
261
|
+
# y median(x)
|
262
|
+
# <string> <double>
|
263
|
+
# 0 A 1.5
|
264
|
+
# 1 B 4.0
|
265
|
+
# 2 C 6.0
|
266
|
+
#
|
267
|
+
define_group_aggregation :approximate_median
|
268
|
+
def median(*group_keys)
|
269
|
+
df = approximate_median(*group_keys)
|
270
|
+
df.rename do
|
271
|
+
keys_org = keys.select { _1.start_with?('approximate_') }
|
272
|
+
keys_renamed = keys_org.map { _1.to_s.delete_prefix('approximate_') }
|
273
|
+
keys_org.zip keys_renamed
|
274
|
+
end
|
275
|
+
end
|
104
276
|
|
105
|
-
|
277
|
+
# Compute minimum of values in each group for numeric columns.
|
278
|
+
#
|
279
|
+
# @!method min(*group_keys)
|
280
|
+
# @macro group_aggregation
|
281
|
+
# @example
|
282
|
+
# dataframe.group(:y).min
|
283
|
+
#
|
284
|
+
# # =>
|
285
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000018f38>
|
286
|
+
# y min(x)
|
287
|
+
# <string> <uint8>
|
288
|
+
# 0 A 1
|
289
|
+
# 1 B 3
|
290
|
+
# 2 C 6
|
291
|
+
#
|
292
|
+
define_group_aggregation :min
|
106
293
|
|
107
|
-
|
294
|
+
# Get one value from each group.
|
295
|
+
#
|
296
|
+
# @!method one(*group_keys)
|
297
|
+
# @macro group_aggregation
|
298
|
+
# @example
|
299
|
+
# dataframe.group(:y).one
|
300
|
+
#
|
301
|
+
# # =>
|
302
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000002885c>
|
303
|
+
# y one(x)
|
304
|
+
# <string> <uint8>
|
305
|
+
# 0 A 1
|
306
|
+
# 1 B 3
|
307
|
+
# 2 C 6
|
308
|
+
#
|
309
|
+
define_group_aggregation :one
|
310
|
+
|
311
|
+
# Compute product of values in each group for numeric columns.
|
312
|
+
#
|
313
|
+
# @!method product(*group_keys)
|
314
|
+
# @macro group_aggregation
|
315
|
+
# @example
|
316
|
+
# dataframe.group(:y).product
|
317
|
+
#
|
318
|
+
# # =>
|
319
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000021a84>
|
320
|
+
# y product(x)
|
321
|
+
# <string> <uint64>
|
322
|
+
# 0 A 2
|
323
|
+
# 1 B 60
|
324
|
+
# 2 C 6
|
325
|
+
#
|
326
|
+
define_group_aggregation :product
|
327
|
+
|
328
|
+
# Compute standard deviation of values in each group for numeric columns.
|
329
|
+
#
|
330
|
+
# @!method stddev(*group_keys)
|
331
|
+
# @macro group_aggregation
|
332
|
+
# @example
|
333
|
+
# dataframe.group(:y).stddev
|
334
|
+
#
|
335
|
+
# # =>
|
336
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000002be6c>
|
337
|
+
# y stddev(x)
|
338
|
+
# <string> <double>
|
339
|
+
# 0 A 0.5
|
340
|
+
# 1 B 0.816
|
341
|
+
# 2 C 0.0
|
342
|
+
#
|
343
|
+
define_group_aggregation :stddev
|
344
|
+
|
345
|
+
# Compute sum of values in each group for numeric columns.
|
346
|
+
#
|
347
|
+
# @!method sum(*group_keys)
|
348
|
+
# @macro group_aggregation
|
349
|
+
# @example
|
350
|
+
# dataframe.group(:y).sum
|
351
|
+
#
|
352
|
+
# # =>
|
353
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000032a14>
|
354
|
+
# y sum(x)
|
355
|
+
# <string> <uint64>
|
356
|
+
# 0 A 3
|
357
|
+
# 1 B 12
|
358
|
+
# 2 C 6
|
359
|
+
#
|
360
|
+
define_group_aggregation :sum
|
361
|
+
|
362
|
+
# Compute variance of values in each group for numeric columns.
|
363
|
+
#
|
364
|
+
# @!method variance(*group_keys)
|
365
|
+
# @macro group_aggregation
|
366
|
+
# @example
|
367
|
+
# dataframe.group(:y).variance
|
368
|
+
#
|
369
|
+
# # =>
|
370
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x00000000003b1dc>
|
371
|
+
# y variance(x)
|
372
|
+
# <string> <double>
|
373
|
+
# 0 A 0.25
|
374
|
+
# 1 B 0.667
|
375
|
+
# 2 C 0.0
|
376
|
+
#
|
377
|
+
define_group_aggregation :variance
|
108
378
|
|
109
379
|
# Returns boolean filters to select the records of each group.
|
110
380
|
#
|
@@ -114,15 +384,27 @@ module RedAmber
|
|
114
384
|
#
|
115
385
|
def filters
|
116
386
|
@filters ||= begin
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
387
|
+
group_values = group_table[group_keys].each_record.map(&:to_a)
|
388
|
+
|
389
|
+
Enumerator.new(group_table.n_rows) do |yielder|
|
390
|
+
group_values.each do |values|
|
391
|
+
booleans =
|
392
|
+
values.map.with_index do |value, i|
|
393
|
+
column = @dataframe[group_keys[i]].data
|
394
|
+
if value.nil?
|
395
|
+
Arrow::Function.find('is_null').execute([column])
|
396
|
+
elsif value.is_a?(Float) && value.nan?
|
397
|
+
Arrow::Function.find('is_nan').execute([column])
|
398
|
+
else
|
399
|
+
Arrow::Function.find('equal').execute([column, value])
|
400
|
+
end
|
401
|
+
end
|
402
|
+
filter =
|
403
|
+
booleans.reduce do |result, datum|
|
404
|
+
Arrow::Function.find('and_kleene').execute([result, datum])
|
405
|
+
end
|
406
|
+
yielder << Vector.create(filter.value)
|
407
|
+
end
|
126
408
|
end
|
127
409
|
end
|
128
410
|
end
|
@@ -147,119 +429,174 @@ module RedAmber
|
|
147
429
|
# group size.
|
148
430
|
#
|
149
431
|
def each
|
150
|
-
filters
|
151
432
|
return enum_for(:each) unless block_given?
|
152
433
|
|
153
|
-
|
154
|
-
yield @dataframe
|
434
|
+
filters.each do |filter|
|
435
|
+
yield @dataframe.filter(filter)
|
155
436
|
end
|
156
437
|
@filters.size
|
157
438
|
end
|
158
439
|
|
159
|
-
#
|
440
|
+
# String representation of self.
|
160
441
|
#
|
161
|
-
# @return [
|
162
|
-
#
|
163
|
-
# - Group key columns.
|
164
|
-
# - Result columns by group aggregation.
|
442
|
+
# @return [String]
|
443
|
+
# show information of self as a String.
|
165
444
|
# @example
|
166
|
-
# penguins.group(:species).
|
445
|
+
# puts penguins.group(:species).inspect
|
167
446
|
#
|
168
447
|
# # =>
|
169
|
-
# #<RedAmber::
|
448
|
+
# #<RedAmber::Group : 0x0000000000003a98>
|
170
449
|
# species group_count
|
171
450
|
# <string> <uint8>
|
172
451
|
# 0 Adelie 152
|
173
452
|
# 1 Chinstrap 68
|
174
453
|
# 2 Gentoo 124
|
175
454
|
#
|
176
|
-
def
|
177
|
-
|
455
|
+
def inspect
|
456
|
+
"#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
|
178
457
|
end
|
179
458
|
|
180
|
-
#
|
459
|
+
# Summarize Group by aggregation functions from the block.
|
181
460
|
#
|
182
|
-
# @
|
183
|
-
#
|
184
|
-
#
|
185
|
-
#
|
461
|
+
# @overload summarize
|
462
|
+
# Summarize by a function.
|
463
|
+
# @yieldparam group [Group]
|
464
|
+
# passes group object self.
|
465
|
+
# @yieldreturn [DataFrame]
|
466
|
+
# @yieldreturn [DataFrame, Array<DataFrame>, Hash{Symbol, String => DataFrame}]
|
467
|
+
# an aggregated DataFrame or an array of aggregated DataFrames.
|
468
|
+
# @return [DataFrame]
|
469
|
+
# summarized DataFrame.
|
470
|
+
# @example Single function and single variable
|
471
|
+
# group = penguins.group(:species)
|
472
|
+
# group
|
186
473
|
#
|
187
|
-
#
|
188
|
-
#
|
189
|
-
#
|
190
|
-
#
|
191
|
-
#
|
192
|
-
#
|
193
|
-
#
|
474
|
+
# # =>
|
475
|
+
# #<RedAmber::Group : 0x000000000000c314>
|
476
|
+
# species group_count
|
477
|
+
# <string> <uint8>
|
478
|
+
# 0 Adelie 152
|
479
|
+
# 1 Chinstrap 68
|
480
|
+
# 2 Gentoo 124
|
194
481
|
#
|
195
|
-
|
196
|
-
"#<#{self.class} : #{format('0x%016x', object_id)}>\n#{count(@group_keys)}"
|
197
|
-
end
|
198
|
-
|
199
|
-
# Summarize Group by aggregation functions from the block.
|
482
|
+
# group.summarize { mean(:bill_length_mm) }
|
200
483
|
#
|
201
|
-
#
|
202
|
-
#
|
203
|
-
#
|
204
|
-
#
|
205
|
-
#
|
206
|
-
#
|
207
|
-
#
|
208
|
-
# group = penguins.group(:species)
|
209
|
-
# group
|
484
|
+
# # =>
|
485
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
|
486
|
+
# species mean(bill_length_mm)
|
487
|
+
# <string> <double>
|
488
|
+
# 0 Adelie 38.79
|
489
|
+
# 1 Chinstrap 48.83
|
490
|
+
# 2 Gentoo 47.5
|
210
491
|
#
|
211
|
-
#
|
212
|
-
#
|
213
|
-
# species count
|
214
|
-
# <string> <uint8>
|
215
|
-
# 0 Adelie 152
|
216
|
-
# 1 Chinstrap 68
|
217
|
-
# 2 Gentoo 124
|
492
|
+
# @example Single function only
|
493
|
+
# group.summarize { mean }
|
218
494
|
#
|
219
|
-
#
|
495
|
+
# # =>
|
496
|
+
# #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
|
497
|
+
# species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
|
498
|
+
# <string> <double> <double> ... <double>
|
499
|
+
# 0 Adelie 38.79 18.35 ... 2008.01
|
500
|
+
# 1 Chinstrap 48.83 18.42 ... 2007.97
|
501
|
+
# 2 Gentoo 47.5 14.98 ... 2008.08
|
220
502
|
#
|
221
|
-
#
|
222
|
-
#
|
223
|
-
# species mean(bill_length_mm)
|
224
|
-
# <string> <double>
|
225
|
-
# 0 Adelie 38.79
|
226
|
-
# 1 Chinstrap 48.83
|
227
|
-
# 2 Gentoo 47.5
|
503
|
+
# @overload summarize
|
504
|
+
# Summarize by multiple functions.
|
228
505
|
#
|
229
|
-
#
|
230
|
-
#
|
506
|
+
# @yieldparam group [Group]
|
507
|
+
# passes group object self.
|
508
|
+
# @yieldreturn [Array<DataFrame>]
|
509
|
+
# an array of aggregated DataFrames.
|
510
|
+
# @return [DataFrame]
|
511
|
+
# summarized DataFrame.
|
512
|
+
# @example Multiple functions
|
513
|
+
# group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
|
231
514
|
#
|
232
|
-
#
|
233
|
-
#
|
234
|
-
#
|
235
|
-
#
|
236
|
-
#
|
237
|
-
#
|
238
|
-
#
|
515
|
+
# # =>
|
516
|
+
# #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
|
517
|
+
# species min(bill_length_mm) max(bill_length_mm)
|
518
|
+
# <string> <double> <double>
|
519
|
+
# 0 Adelie 32.1 46.0
|
520
|
+
# 1 Chinstrap 40.9 58.0
|
521
|
+
# 2 Gentoo 40.9 59.6
|
239
522
|
#
|
240
|
-
# @
|
241
|
-
#
|
523
|
+
# @overload summarize
|
524
|
+
# Summarize by a Hash of functions, renaming the resulting columns.
|
242
525
|
#
|
243
|
-
#
|
244
|
-
#
|
245
|
-
#
|
246
|
-
#
|
247
|
-
#
|
248
|
-
#
|
249
|
-
#
|
250
|
-
#
|
251
|
-
|
252
|
-
|
526
|
+
# @yieldparam group [Group]
|
527
|
+
# passes group object self.
|
528
|
+
# @yieldreturn [Hash{Symbol, String => DataFrame}]
|
529
|
+
# a Hash mapping new column names to aggregated DataFrames.
|
530
|
+
# Each DataFrame must contain only one aggregated column.
|
531
|
+
# @return [DataFrame]
|
532
|
+
# summarized DataFrame.
|
533
|
+
# @example Rename column name by Hash
|
534
|
+
# group.summarize {
|
535
|
+
# {
|
536
|
+
# min_bill_length_mm: min(:bill_length_mm),
|
537
|
+
# max_bill_length_mm: max(:bill_length_mm),
|
538
|
+
# }
|
539
|
+
# }
|
540
|
+
#
|
541
|
+
# # =>
|
542
|
+
# #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
|
543
|
+
# species min_bill_length_mm max_bill_length_mm
|
544
|
+
# <string> <double> <double>
|
545
|
+
# 0 Adelie 32.1 46.0
|
546
|
+
# 1 Chinstrap 40.9 58.0
|
547
|
+
# 2 Gentoo 40.9 59.6
|
548
|
+
#
|
549
|
+
def summarize(*args, &block)
|
550
|
+
if block
|
551
|
+
agg = instance_eval(&block)
|
552
|
+
unless args.empty?
|
553
|
+
agg = [agg] if agg.is_a?(DataFrame)
|
554
|
+
agg = args.zip(agg).to_h
|
555
|
+
end
|
556
|
+
else
|
557
|
+
agg = args
|
558
|
+
end
|
559
|
+
|
253
560
|
case agg
|
254
561
|
when DataFrame
|
255
562
|
agg
|
256
563
|
when Array
|
257
|
-
|
564
|
+
aggregations =
|
565
|
+
agg.map do |df|
|
566
|
+
v = df.vectors[-1]
|
567
|
+
[v.key, v]
|
568
|
+
end
|
569
|
+
agg[0].assign(aggregations)
|
570
|
+
when Hash
|
571
|
+
aggregations =
|
572
|
+
agg.map do |key, df|
|
573
|
+
aggregated_keys = df.keys - @group_keys
|
574
|
+
if aggregated_keys.size > 1
|
575
|
+
message =
|
576
|
+
"accept only one column from the Hash: #{aggregated_keys.join(', ')}"
|
577
|
+
raise GroupArgumentError, message
|
578
|
+
end
|
579
|
+
|
580
|
+
v = df.vectors[-1]
|
581
|
+
[key, v]
|
582
|
+
end
|
583
|
+
agg.values[-1].drop(-1).assign(aggregations)
|
258
584
|
else
|
259
585
|
raise GroupArgumentError, "Unknown argument: #{agg}"
|
260
586
|
end
|
261
587
|
end
|
262
588
|
|
589
|
+
# Return grouped DataFrame only for group keys.
|
590
|
+
#
|
591
|
+
# @return [DataFrame]
|
592
|
+
# grouped DataFrame projected only for group_keys.
|
593
|
+
# @since 0.5.0
|
594
|
+
#
|
595
|
+
def grouped_frame
|
596
|
+
DataFrame.create(group_table[group_keys])
|
597
|
+
end
|
598
|
+
alias_method :none, :grouped_frame
|
599
|
+
|
263
600
|
# Aggregating summary.
|
264
601
|
#
|
265
602
|
# @api private
|
@@ -270,37 +607,49 @@ module RedAmber
|
|
270
607
|
|
271
608
|
private
|
272
609
|
|
273
|
-
def
|
274
|
-
|
275
|
-
[function_name]
|
276
|
-
else
|
277
|
-
summary_keys.map { |key| "#{function_name}(#{key})" }
|
278
|
-
end
|
279
|
-
end
|
280
|
-
|
281
|
-
# @note `@group_counts.sum == @dataframe.size``
|
282
|
-
def group_counts
|
283
|
-
@group_counts ||= filters.map(&:sum)
|
610
|
+
def group_table
|
611
|
+
@group_table ||= build_aggregated_table
|
284
612
|
end
|
285
613
|
|
286
|
-
def
|
287
|
-
|
288
|
-
|
289
|
-
|
614
|
+
def build_aggregated_table
|
615
|
+
keys = @group_keys
|
616
|
+
key = keys[0]
|
617
|
+
table = @dataframe.table
|
618
|
+
|
619
|
+
plan = Arrow::ExecutePlan.new
|
620
|
+
source_node = plan.build_source_node(table)
|
621
|
+
|
622
|
+
aggregate_node =
|
623
|
+
plan.build_aggregate_node(source_node, {
|
624
|
+
aggregations: [{ function: 'hash_count',
|
625
|
+
input: key }], keys: keys
|
626
|
+
})
|
627
|
+
expressions = keys.map { |k| Arrow::FieldExpression.new(k) }
|
628
|
+
null_count = Arrow::Function.find('is_null').execute([table[key]]).value.sum
|
629
|
+
count_field = Arrow::FieldExpression.new("count(#{key})")
|
630
|
+
if null_count.zero?
|
631
|
+
expressions << count_field
|
632
|
+
else
|
633
|
+
is_zero =
|
634
|
+
Arrow::CallExpression.new('equal', [count_field, Arrow::Int64Scalar.new(0)])
|
635
|
+
null_count_scalar = Arrow::Int64Scalar.new(null_count)
|
636
|
+
expressions <<
|
637
|
+
Arrow::CallExpression.new('if_else', [
|
638
|
+
is_zero, null_count_scalar, count_field
|
639
|
+
])
|
290
640
|
end
|
291
|
-
|
641
|
+
options = Arrow::ProjectNodeOptions.new(expressions, keys + [:group_count])
|
642
|
+
project_node = plan.build_project_node(aggregate_node, options)
|
292
643
|
|
293
|
-
|
294
|
-
|
295
|
-
arrays = table.columns.map(&:data)
|
644
|
+
sink_and_start_plan(plan, project_node)
|
645
|
+
end
|
296
646
|
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
647
|
+
def build_aggregation_keys(function_name, summary_keys)
|
648
|
+
if summary_keys.empty?
|
649
|
+
[function_name]
|
650
|
+
else
|
651
|
+
summary_keys.map { |key| "#{function_name}(#{key})" }
|
301
652
|
end
|
302
|
-
|
303
|
-
Arrow::Table.new(Arrow::Schema.new(fields), arrays)
|
304
653
|
end
|
305
654
|
|
306
655
|
# Call Vector aggregating function and return an array of arrays:
|