red_amber 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +11 -5
- data/CHANGELOG.md +93 -1
- data/Gemfile +5 -6
- data/README.ja.md +252 -0
- data/README.md +30 -23
- data/benchmark/basic.yml +1 -1
- data/benchmark/group.yml +12 -5
- data/doc/CODE_OF_CONDUCT.md +1 -1
- data/docker/.env +4 -0
- data/docker/Dockerfile +66 -0
- data/docker/Gemfile +26 -0
- data/docker/Gemfile.lock +118 -0
- data/docker/docker-compose.yml +21 -0
- data/docker/example +86 -0
- data/docker/notebook/examples_of_red_amber.ipynb +8562 -0
- data/docker/notebook/red-amber.ipynb +188 -0
- data/docker/readme.md +118 -0
- data/lib/red_amber/data_frame.rb +78 -4
- data/lib/red_amber/data_frame_combinable.rb +147 -119
- data/lib/red_amber/data_frame_displayable.rb +6 -5
- data/lib/red_amber/data_frame_selectable.rb +49 -0
- data/lib/red_amber/group.rb +190 -89
- data/lib/red_amber/helper.rb +26 -0
- data/lib/red_amber/subframes.rb +166 -66
- data/lib/red_amber/vector.rb +43 -24
- data/lib/red_amber/vector_aggregation.rb +26 -0
- data/lib/red_amber/vector_binary_element_wise.rb +54 -25
- data/lib/red_amber/vector_selectable.rb +74 -23
- data/lib/red_amber/vector_string_function.rb +211 -0
- data/lib/red_amber/vector_unary_element_wise.rb +4 -0
- data/lib/red_amber/vector_updatable.rb +28 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -1
- data/red_amber.gemspec +4 -4
- metadata +20 -9
data/lib/red_amber/group.rb
CHANGED
@@ -4,6 +4,7 @@ module RedAmber
|
|
4
4
|
# Group class
|
5
5
|
class Group
|
6
6
|
include Enumerable # This feature is experimental
|
7
|
+
include Helper
|
7
8
|
|
8
9
|
using RefineArrowTable
|
9
10
|
|
@@ -60,11 +61,11 @@ module RedAmber
|
|
60
61
|
#
|
61
62
|
# # =>
|
62
63
|
# #<RedAmber::Group : 0x000000000000f410>
|
63
|
-
# species
|
64
|
-
# <string>
|
65
|
-
# 0 Adelie
|
66
|
-
# 1 Chinstrap
|
67
|
-
# 2 Gentoo
|
64
|
+
# species group_count
|
65
|
+
# <string> <uint8>
|
66
|
+
# 0 Adelie 152
|
67
|
+
# 1 Chinstrap 68
|
68
|
+
# 2 Gentoo 124
|
68
69
|
#
|
69
70
|
def initialize(dataframe, *group_keys)
|
70
71
|
@dataframe = dataframe
|
@@ -114,15 +115,27 @@ module RedAmber
|
|
114
115
|
#
|
115
116
|
def filters
  # Memoized: an Enumerator (sized by the group table's row count) that
  # yields one boolean filter Vector per row of the group table.
  @filters ||= begin
    key_rows = group_table[group_keys].each_record.map(&:to_a)

    Enumerator.new(group_table.n_rows) do |yielder|
      key_rows.each do |row|
        # One boolean mask per group key, comparing the source column
        # against this group's key value.
        masks =
          row.each_with_index.map do |key_value, index|
            array = @dataframe[group_keys[index]].data
            # nil and NaN keys need dedicated predicates instead of 'equal'.
            function_name, arguments =
              if key_value.nil?
                ['is_null', [array]]
              elsif key_value.is_a?(Float) && key_value.nan?
                ['is_nan', [array]]
              else
                ['equal', [array, key_value]]
              end
            Arrow::Function.find(function_name).execute(arguments)
          end
        # A row belongs to the group when the masks of all keys agree.
        combined =
          masks.reduce do |accumulated, mask|
            Arrow::Function.find('and_kleene').execute([accumulated, mask])
          end
        yielder << Vector.create(combined.value)
      end
    end
  end
end
|
@@ -147,11 +160,10 @@ module RedAmber
|
|
147
160
|
# group size.
|
148
161
|
#
|
149
162
|
def each
  if block_given?
    # Yield the source DataFrame filtered down to each group in turn.
    filters.each { |group_filter| yield @dataframe.filter(group_filter) }
    # `filters` has populated @filters; its size is the return value.
    @filters.size
  else
    enum_for(:each)
  end
end
|
@@ -174,7 +186,7 @@ module RedAmber
|
|
174
186
|
# 2 Gentoo 124
|
175
187
|
#
|
176
188
|
def group_count
  # Wrap the aggregated group table in a DataFrame.
  counts_table = group_table
  DataFrame.create(counts_table)
end
|
179
191
|
|
180
192
|
# String representation of self.
|
@@ -198,68 +210,145 @@ module RedAmber
|
|
198
210
|
|
199
211
|
# Summarize Group by aggregation functions from the block.
|
200
212
|
#
|
201
|
-
# @
|
202
|
-
#
|
203
|
-
#
|
204
|
-
#
|
205
|
-
#
|
206
|
-
#
|
207
|
-
#
|
208
|
-
#
|
209
|
-
#
|
213
|
+
# @overload summarize
|
214
|
+
# Summarize by a function.
|
215
|
+
# @yieldparam group [Group]
|
216
|
+
# passes group object self.
|
217
|
+
# @yieldreturn [DataFrame]
|
218
|
+
# @yieldreturn [DataFrame, Array<DataFrame>, Hash{Symbol, String => DataFrame}]
|
219
|
+
# an aggregated DataFrame or an array of aggregated DataFrames.
|
220
|
+
# @return [DataFrame]
|
221
|
+
# summarized DataFrame.
|
222
|
+
# @example Single function and single variable
|
223
|
+
# group = penguins.group(:species)
|
224
|
+
# group
|
210
225
|
#
|
211
|
-
#
|
212
|
-
#
|
213
|
-
#
|
214
|
-
#
|
215
|
-
#
|
216
|
-
#
|
217
|
-
#
|
226
|
+
# # =>
|
227
|
+
# #<RedAmber::Group : 0x000000000000c314>
|
228
|
+
# species group_count
|
229
|
+
# <string> <uint8>
|
230
|
+
# 0 Adelie 152
|
231
|
+
# 1 Chinstrap 68
|
232
|
+
# 2 Gentoo 124
|
218
233
|
#
|
219
|
-
#
|
234
|
+
# group.summarize { mean(:bill_length_mm) }
|
220
235
|
#
|
221
|
-
#
|
222
|
-
#
|
223
|
-
#
|
224
|
-
#
|
225
|
-
#
|
226
|
-
#
|
227
|
-
#
|
236
|
+
# # =>
|
237
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
|
238
|
+
# species mean(bill_length_mm)
|
239
|
+
# <string> <double>
|
240
|
+
# 0 Adelie 38.79
|
241
|
+
# 1 Chinstrap 48.83
|
242
|
+
# 2 Gentoo 47.5
|
228
243
|
#
|
229
|
-
#
|
230
|
-
#
|
244
|
+
# @example Single function only
|
245
|
+
# group.summarize { mean }
|
231
246
|
#
|
232
|
-
#
|
233
|
-
#
|
234
|
-
#
|
235
|
-
#
|
236
|
-
#
|
237
|
-
#
|
238
|
-
#
|
247
|
+
# # =>
|
248
|
+
# #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
|
249
|
+
# species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
|
250
|
+
# <string> <double> <double> ... <double>
|
251
|
+
# 0 Adelie 38.79 18.35 ... 2008.01
|
252
|
+
# 1 Chinstrap 48.83 18.42 ... 2007.97
|
253
|
+
# 2 Gentoo 47.5 14.98 ... 2008.08
|
239
254
|
#
|
240
|
-
# @
|
241
|
-
#
|
255
|
+
# @overload summarize
|
256
|
+
# Summarize by multiple functions.
|
242
257
|
#
|
243
|
-
#
|
244
|
-
#
|
245
|
-
#
|
246
|
-
#
|
247
|
-
#
|
248
|
-
#
|
249
|
-
#
|
250
|
-
#
|
251
|
-
|
252
|
-
|
258
|
+
# @yieldparam group [Group]
|
259
|
+
# passes group object self.
|
260
|
+
# @yieldreturn [Array<DataFrame>]
|
261
|
+
# an array of aggregated DataFrames.
|
262
|
+
# @return [DataFrame]
|
263
|
+
# summarized DataFrame.
|
264
|
+
# @example Multiple functions
|
265
|
+
# group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
|
266
|
+
#
|
267
|
+
# # =>
|
268
|
+
# #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
|
269
|
+
# species min(bill_length_mm) max(bill_length_mm)
|
270
|
+
# <string> <double> <double>
|
271
|
+
# 0 Adelie 32.1 46.0
|
272
|
+
# 1 Chinstrap 40.9 58.0
|
273
|
+
# 2 Gentoo 40.9 59.6
|
274
|
+
#
|
275
|
+
# @overload summarize
|
276
|
+
# Summarize by functions and rename the aggregated columns by Hash keys.
|
277
|
+
#
|
278
|
+
# @yieldparam group [Group]
|
279
|
+
# passes group object self.
|
280
|
+
# @yieldreturn [Hash{Symbol, String => DataFrame}]
|
281
|
+
# a Hash of new column names and aggregated DataFrames.
|
282
|
+
# Each DataFrame must contain only one aggregated column.
|
283
|
+
# @return [DataFrame]
|
284
|
+
# summarized DataFrame.
|
285
|
+
# @example Rename column name by Hash
|
286
|
+
# group.summarize {
|
287
|
+
# {
|
288
|
+
# min_bill_length_mm: min(:bill_length_mm),
|
289
|
+
# max_bill_length_mm: max(:bill_length_mm),
|
290
|
+
# }
|
291
|
+
# }
|
292
|
+
#
|
293
|
+
# # =>
|
294
|
+
# #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
|
295
|
+
# species min_bill_length_mm max_bill_length_mm
|
296
|
+
# <string> <double> <double>
|
297
|
+
# 0 Adelie 32.1 46.0
|
298
|
+
# 1 Chinstrap 40.9 58.0
|
299
|
+
# 2 Gentoo 40.9 59.6
|
300
|
+
#
|
301
|
+
def summarize(*args, &block)
  if block
    # The block is evaluated with the Group as self, so aggregation
    # methods can be called without a receiver.
    agg = instance_eval(&block)
    unless args.empty?
      # Positional args act as new column names for the block's results.
      agg = [agg] if agg.is_a?(DataFrame)
      agg = args.zip(agg).to_h
    end
  else
    agg = args
  end

  case agg
  when DataFrame
    agg
  when Array
    # Without this guard agg[0] is nil and the call would fail with a
    # NoMethodError on nil instead of a GroupArgumentError.
    raise GroupArgumentError, 'No aggregation is given' if agg.empty?

    # Take the last vector of each DataFrame as its aggregated column.
    aggregations =
      agg.map do |df|
        v = df.vectors[-1]
        [v.key, v]
      end
    agg[0].assign(aggregations)
  when Hash
    # Same guard as above: agg.values[-1] is nil for an empty Hash.
    raise GroupArgumentError, 'No aggregation is given' if agg.empty?

    aggregations =
      agg.map do |key, df|
        # A Hash value may carry only one column besides the group keys.
        aggregated_keys = df.keys - @group_keys
        if aggregated_keys.size > 1
          message =
            "accept only one column from the Hash: #{aggregated_keys.join(', ')}"
          raise GroupArgumentError, message
        end

        v = df.vectors[-1]
        [key, v]
      end
    # Drop the last column of the last DataFrame, then assign the
    # aggregated vectors under the Hash keys.
    agg.values[-1].drop(-1).assign(aggregations)
  else
    raise GroupArgumentError, "Unknown argument: #{agg}"
  end
end
|
262
340
|
|
341
|
+
# Return grouped DataFrame only for group keys.
|
342
|
+
#
|
343
|
+
# @return [DataFrame]
|
344
|
+
# grouped DataFrame projected only for group_keys.
|
345
|
+
# @since 0.5.0
|
346
|
+
#
|
347
|
+
def grouped_frame
  # Project the group table onto the group key columns only.
  key_columns = group_table[group_keys]
  DataFrame.create(key_columns)
end
|
350
|
+
alias_method :none, :grouped_frame
|
351
|
+
|
263
352
|
# Aggregating summary.
|
264
353
|
#
|
265
354
|
# @api private
|
@@ -270,37 +359,49 @@ module RedAmber
|
|
270
359
|
|
271
360
|
private
|
272
361
|
|
273
|
-
def
|
274
|
-
|
275
|
-
[function_name]
|
276
|
-
else
|
277
|
-
summary_keys.map { |key| "#{function_name}(#{key})" }
|
278
|
-
end
|
279
|
-
end
|
280
|
-
|
281
|
-
# @note `@group_counts.sum == @dataframe.size``
|
282
|
-
def group_counts
|
283
|
-
@group_counts ||= filters.map(&:sum)
|
362
|
+
def group_table
  # Build the aggregated table once and reuse it afterwards.
  return @group_table if @group_table

  @group_table = build_aggregated_table
end
|
285
365
|
|
286
|
-
def
|
287
|
-
|
288
|
-
|
289
|
-
|
366
|
+
# Run an execute plan over the source table that groups by @group_keys
# and projects the key columns plus a :group_count column.
def build_aggregated_table
  group_by = @group_keys
  first_key = group_by[0]
  source_table = @dataframe.table

  plan = Arrow::ExecutePlan.new
  source_node = plan.build_source_node(source_table)

  # Count entries of the first key column within each group.
  count_aggregation = { function: 'hash_count', input: first_key }
  aggregate_node =
    plan.build_aggregate_node(
      source_node,
      { aggregations: [count_aggregation], keys: group_by }
    )

  projections = group_by.map { |name| Arrow::FieldExpression.new(name) }
  null_mask = Arrow::Function.find('is_null').execute([source_table[first_key]])
  nulls_in_first_key = null_mask.value.sum
  counted = Arrow::FieldExpression.new("count(#{first_key})")

  # When the first key column contains nulls, a count of 0 is replaced
  # by the number of nulls in that column.
  # NOTE(review): presumably 'hash_count' skips null entries, which is why
  # the null-key group reports 0 - confirm against the Arrow documentation.
  # NOTE(review): with several group keys every zero-count group receives
  # the same replacement value - verify that this is intended.
  count_expression =
    if nulls_in_first_key.zero?
      counted
    else
      count_is_zero =
        Arrow::CallExpression.new('equal', [counted, Arrow::Int64Scalar.new(0)])
      replacement = Arrow::Int64Scalar.new(nulls_in_first_key)
      Arrow::CallExpression.new('if_else', [count_is_zero, replacement, counted])
    end
  projections << count_expression

  project_options =
    Arrow::ProjectNodeOptions.new(projections, group_by + [:group_count])
  project_node = plan.build_project_node(aggregate_node, project_options)

  sink_and_start_plan(plan, project_node)
end
|
296
398
|
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
399
|
+
# Build the names of aggregated columns.
#
# Returns the bare function name when no summary keys are given,
# otherwise one "function(key)" name per summary key.
def build_aggregation_keys(function_name, summary_keys)
  return [function_name] if summary_keys.empty?

  summary_keys.map do |summary_key|
    "#{function_name}(#{summary_key})"
  end
end
|
305
406
|
|
306
407
|
# Call Vector aggregating function and return an array of arrays:
|
data/lib/red_amber/helper.rb
CHANGED
@@ -78,6 +78,32 @@ module RedAmber
|
|
78
78
|
Array(range)
|
79
79
|
end
|
80
80
|
end
|
81
|
+
|
82
|
+
# Create sink node and execute plan
|
83
|
+
#
|
84
|
+
# @param plan [Arrow::ExecutePlan]
|
85
|
+
# Execute plan of Acero.
|
86
|
+
# @param node [Arrow::ExecuteNode]
|
87
|
+
# Execute node of Acero.
|
88
|
+
# @param output_schema [Arrow::Schema, nil]
|
89
|
+
# Schema of table to output. If it is nil, output_schema of
|
90
|
+
# the given node is used.
|
91
|
+
# @return [Arrow::Table]
|
92
|
+
# Result of plan.
|
93
|
+
# @since 0.5.0
|
94
|
+
#
|
95
|
+
def sink_and_start_plan(plan, node, output_schema: nil)
  sink_options = Arrow::SinkNodeOptions.new
  plan.build_sink_node(node, sink_options)
  plan.validate
  plan.start
  plan.wait
  # Fall back to the schema of the given node when none is supplied.
  schema = output_schema.nil? ? node.output_schema : output_schema
  result = sink_options.get_reader(schema).read_all
  # The plan is stopped only after the whole result has been read.
  plan.stop
  result
end
|
81
107
|
end
|
82
108
|
|
83
109
|
# rubocop:disable Layout/LineLength
|