red_amber 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@ module RedAmber
4
4
  # Group class
5
5
  class Group
6
6
  include Enumerable # This feature is experimental
7
+ include Helper
7
8
 
8
9
  using RefineArrowTable
9
10
 
@@ -60,11 +61,11 @@ module RedAmber
60
61
  #
61
62
  # # =>
62
63
  # #<RedAmber::Group : 0x000000000000f410>
63
- # species group_count
64
- # <string> <uint8>
65
- # 0 Adelie 152
66
- # 1 Chinstrap 68
67
- # 2 Gentoo 124
64
+ # species count
65
+ # <string> <uint8>
66
+ # 0 Adelie 152
67
+ # 1 Chinstrap 68
68
+ # 2 Gentoo 124
68
69
  #
69
70
  def initialize(dataframe, *group_keys)
70
71
  @dataframe = dataframe
@@ -114,15 +115,27 @@ module RedAmber
114
115
  #
115
116
  def filters
116
117
  @filters ||= begin
117
- first, *others = @group_keys.map do |key|
118
- vector = @dataframe[key]
119
- vector.uniq.each.map { |u| u.nil? ? vector.is_nil : vector == u }
120
- end
121
-
122
- if others.empty?
123
- first.select(&:any?)
124
- else
125
- first.product(*others).map { |a| a.reduce(&:&) }.select(&:any?)
118
+ group_values = group_table[group_keys].each_record.map(&:to_a)
119
+
120
+ Enumerator.new(group_table.n_rows) do |yielder|
121
+ group_values.each do |values|
122
+ booleans =
123
+ values.map.with_index do |value, i|
124
+ column = @dataframe[group_keys[i]].data
125
+ if value.nil?
126
+ Arrow::Function.find('is_null').execute([column])
127
+ elsif value.is_a?(Float) && value.nan?
128
+ Arrow::Function.find('is_nan').execute([column])
129
+ else
130
+ Arrow::Function.find('equal').execute([column, value])
131
+ end
132
+ end
133
+ filter =
134
+ booleans.reduce do |result, datum|
135
+ Arrow::Function.find('and_kleene').execute([result, datum])
136
+ end
137
+ yielder << Vector.create(filter.value)
138
+ end
126
139
  end
127
140
  end
128
141
  end
@@ -147,11 +160,10 @@ module RedAmber
147
160
  # group size.
148
161
  #
149
162
  def each
150
- filters
151
163
  return enum_for(:each) unless block_given?
152
164
 
153
- @filters.each do |filter|
154
- yield @dataframe[filter]
165
+ filters.each do |filter|
166
+ yield @dataframe.filter(filter)
155
167
  end
156
168
  @filters.size
157
169
  end
@@ -174,7 +186,7 @@ module RedAmber
174
186
  # 2 Gentoo 124
175
187
  #
176
188
  def group_count
177
- DataFrame.create(add_columns_to_table(base_table, [:group_count], [group_counts]))
189
+ DataFrame.create(group_table)
178
190
  end
179
191
 
180
192
  # String representation of self.
@@ -198,68 +210,145 @@ module RedAmber
198
210
 
199
211
  # Summarize Group by aggregation functions from the block.
200
212
  #
201
- # @yieldparam group [Group]
202
- # passes group object self.
203
- # @yieldreturn [DataFrame, Array<DataFrame>]
204
- # an aggregated DataFrame or an array of aggregated DataFrames.
205
- # @return [DataFrame]
206
- # summarized DataFrame.
207
- # @example Single function and single variable
208
- # group = penguins.group(:species)
209
- # group
213
+ # @overload summarize
214
+ # Summarize by a function.
215
+ # @yieldparam group [Group]
216
+ # passes group object self.
217
+ # @yieldreturn [DataFrame]
218
+ # @yieldreturn [DataFrame, Array<DataFrame>, Hash{Symbol, String => DataFrame}]
219
+ # an aggregated DataFrame or an array of aggregated DataFrames.
220
+ # @return [DataFrame]
221
+ # summarized DataFrame.
222
+ # @example Single function and single variable
223
+ # group = penguins.group(:species)
224
+ # group
210
225
  #
211
- # # =>
212
- # #<RedAmber::Group : 0x000000000000c314>
213
- # species group_count
214
- # <string> <uint8>
215
- # 0 Adelie 152
216
- # 1 Chinstrap 68
217
- # 2 Gentoo 124
226
+ # # =>
227
+ # #<RedAmber::Group : 0x000000000000c314>
228
+ # species group_count
229
+ # <string> <uint8>
230
+ # 0 Adelie 152
231
+ # 1 Chinstrap 68
232
+ # 2 Gentoo 124
218
233
  #
219
- # group.summarize { mean(:bill_length_mm) }
234
+ # group.summarize { mean(:bill_length_mm) }
220
235
  #
221
- # # =>
222
- # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
223
- # species mean(bill_length_mm)
224
- # <string> <double>
225
- # 0 Adelie 38.79
226
- # 1 Chinstrap 48.83
227
- # 2 Gentoo 47.5
236
+ # # =>
237
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
238
+ # species mean(bill_length_mm)
239
+ # <string> <double>
240
+ # 0 Adelie 38.79
241
+ # 1 Chinstrap 48.83
242
+ # 2 Gentoo 47.5
228
243
  #
229
- # @example Single function only
230
- # group.summarize { mean }
244
+ # @example Single function only
245
+ # group.summarize { mean }
231
246
  #
232
- # # =>
233
- # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
234
- # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
235
- # <string> <double> <double> ... <double>
236
- # 0 Adelie 38.79 18.35 ... 2008.01
237
- # 1 Chinstrap 48.83 18.42 ... 2007.97
238
- # 2 Gentoo 47.5 14.98 ... 2008.08
247
+ # # =>
248
+ # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
249
+ # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
250
+ # <string> <double> <double> ... <double>
251
+ # 0 Adelie 38.79 18.35 ... 2008.01
252
+ # 1 Chinstrap 48.83 18.42 ... 2007.97
253
+ # 2 Gentoo 47.5 14.98 ... 2008.08
239
254
  #
240
- # @example Multiple functions
241
- # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
255
+ # @overload summarize
256
+ # Summarize by multiple functions.
242
257
  #
243
- # # =>
244
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
245
- # species min(bill_length_mm) max(bill_length_mm)
246
- # <string> <double> <double>
247
- # 0 Adelie 32.1 46.0
248
- # 1 Chinstrap 40.9 58.0
249
- # 2 Gentoo 40.9 59.6
250
- #
251
- def summarize(&block)
252
- agg = instance_eval(&block)
258
+ # @yieldparam group [Group]
259
+ # passes group object self.
260
+ # @yieldreturn [Array<DataFrame>]
261
+ # an array of aggregated DataFrames.
262
+ # @return [DataFrame]
263
+ # summarized DataFrame.
264
+ # @example Multiple functions
265
+ # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
266
+ #
267
+ # # =>
268
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
269
+ # species min(bill_length_mm) max(bill_length_mm)
270
+ # <string> <double> <double>
271
+ # 0 Adelie 32.1 46.0
272
+ # 1 Chinstrap 40.9 58.0
273
+ # 2 Gentoo 40.9 59.6
274
+ #
275
+ # @overload summarize
276
+ # Summarize by a function.
277
+ #
278
+ # @yieldparam group [Group]
279
+ # passes group object self.
280
+ # @yieldreturn [Hash{Symbol, String => DataFrame}]
281
+ # a Hash mapping each new column name to an aggregated DataFrame.
282
+ # Each DataFrame must have only one aggregated column.
283
+ # @return [DataFrame]
284
+ # summarized DataFrame.
285
+ # @example Rename column name by Hash
286
+ # group.summarize {
287
+ # {
288
+ # min_bill_length_mm: min(:bill_length_mm),
289
+ # max_bill_length_mm: max(:bill_length_mm),
290
+ # }
291
+ # }
292
+ #
293
+ # # =>
294
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
295
+ # species min_bill_length_mm max_bill_length_mm
296
+ # <string> <double> <double>
297
+ # 0 Adelie 32.1 46.0
298
+ # 1 Chinstrap 40.9 58.0
299
+ # 2 Gentoo 40.9 59.6
300
+ #
301
+ def summarize(*args, &block)
302
+ if block
303
+ agg = instance_eval(&block)
304
+ unless args.empty?
305
+ agg = [agg] if agg.is_a?(DataFrame)
306
+ agg = args.zip(agg).to_h
307
+ end
308
+ else
309
+ agg = args
310
+ end
311
+
253
312
  case agg
254
313
  when DataFrame
255
314
  agg
256
315
  when Array
257
- agg.reduce { |aggregated, df| aggregated.assign(df.to_h) }
316
+ aggregations =
317
+ agg.map do |df|
318
+ v = df.vectors[-1]
319
+ [v.key, v]
320
+ end
321
+ agg[0].assign(aggregations)
322
+ when Hash
323
+ aggregations =
324
+ agg.map do |key, df|
325
+ aggregated_keys = df.keys - @group_keys
326
+ if aggregated_keys.size > 1
327
+ message =
328
+ "accept only one column from the Hash: #{aggregated_keys.join(', ')}"
329
+ raise GroupArgumentError, message
330
+ end
331
+
332
+ v = df.vectors[-1]
333
+ [key, v]
334
+ end
335
+ agg.values[-1].drop(-1).assign(aggregations)
258
336
  else
259
337
  raise GroupArgumentError, "Unknown argument: #{agg}"
260
338
  end
261
339
  end
262
340
 
341
+ # Return grouped DataFrame only for group keys.
342
+ #
343
+ # @return [DataFrame]
344
+ # grouped DataFrame projected only for group_keys.
345
+ # @since 0.5.0
346
+ #
347
+ def grouped_frame
348
+ DataFrame.create(group_table[group_keys])
349
+ end
350
+ alias_method :none, :grouped_frame
351
+
263
352
  # Aggregating summary.
264
353
  #
265
354
  # @api private
@@ -270,37 +359,49 @@ module RedAmber
270
359
 
271
360
  private
272
361
 
273
- def build_aggregation_keys(function_name, summary_keys)
274
- if summary_keys.empty?
275
- [function_name]
276
- else
277
- summary_keys.map { |key| "#{function_name}(#{key})" }
278
- end
279
- end
280
-
281
- # @note `@group_counts.sum == @dataframe.size``
282
- def group_counts
283
- @group_counts ||= filters.map(&:sum)
362
+ def group_table
363
+ @group_table ||= build_aggregated_table
284
364
  end
285
365
 
286
- def base_table
287
- @base_table ||= begin
288
- indexes = filters.map { |filter| filter.index(true) }
289
- @dataframe.table[@group_keys].take(indexes)
366
+ def build_aggregated_table
367
+ keys = @group_keys
368
+ key = keys[0]
369
+ table = @dataframe.table
370
+
371
+ plan = Arrow::ExecutePlan.new
372
+ source_node = plan.build_source_node(table)
373
+
374
+ aggregate_node =
375
+ plan.build_aggregate_node(source_node, {
376
+ aggregations: [{ function: 'hash_count',
377
+ input: key }], keys: keys
378
+ })
379
+ expressions = keys.map { |k| Arrow::FieldExpression.new(k) }
380
+ null_count = Arrow::Function.find('is_null').execute([table[key]]).value.sum
381
+ count_field = Arrow::FieldExpression.new("count(#{key})")
382
+ if null_count.zero?
383
+ expressions << count_field
384
+ else
385
+ is_zero =
386
+ Arrow::CallExpression.new('equal', [count_field, Arrow::Int64Scalar.new(0)])
387
+ null_count_scalar = Arrow::Int64Scalar.new(null_count)
388
+ expressions <<
389
+ Arrow::CallExpression.new('if_else', [
390
+ is_zero, null_count_scalar, count_field
391
+ ])
290
392
  end
291
- end
393
+ options = Arrow::ProjectNodeOptions.new(expressions, keys + [:group_count])
394
+ project_node = plan.build_project_node(aggregate_node, options)
292
395
 
293
- def add_columns_to_table(table, keys, data_arrays)
294
- fields = table.schema.fields
295
- arrays = table.columns.map(&:data)
396
+ sink_and_start_plan(plan, project_node)
397
+ end
296
398
 
297
- keys.zip(data_arrays).each do |key, array|
298
- data = Arrow::ChunkedArray.new([array])
299
- fields << Arrow::Field.new(key, data.value_data_type)
300
- arrays << data
399
+ def build_aggregation_keys(function_name, summary_keys)
400
+ if summary_keys.empty?
401
+ [function_name]
402
+ else
403
+ summary_keys.map { |key| "#{function_name}(#{key})" }
301
404
  end
302
-
303
- Arrow::Table.new(Arrow::Schema.new(fields), arrays)
304
405
  end
305
406
 
306
407
  # Call Vector aggregating function and return an array of arrays:
@@ -78,6 +78,32 @@ module RedAmber
78
78
  Array(range)
79
79
  end
80
80
  end
81
+
82
+ # Create sink node and execute plan
83
+ #
84
+ # @param plan [Arrow::ExecutePlan]
85
+ # Execute plan of Acero.
86
+ # @param node [Arrow::ExecuteNode]
87
+ # Execute node of Acero.
88
+ # @param output_schema [Arrow::Schema, nil]
89
+ # Schema of table to output. If it is nil, output_schema of
90
+ # sink node is used.
91
+ # @return [Arrow::Table]
92
+ # Result of plan.
93
+ # @since 0.5.0
94
+ #
95
+ def sink_and_start_plan(plan, node, output_schema: nil)
96
+ sink_node_options = Arrow::SinkNodeOptions.new
97
+ plan.build_sink_node(node, sink_node_options)
98
+ plan.validate
99
+ plan.start
100
+ plan.wait
101
+ output_schema = node.output_schema if output_schema.nil?
102
+ reader = sink_node_options.get_reader(output_schema)
103
+ table = reader.read_all
104
+ plan.stop
105
+ table
106
+ end
81
107
  end
82
108
 
83
109
  # rubocop:disable Layout/LineLength