red_amber 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -836,6 +836,55 @@ module RedAmber
836
836
  tail(n_obs)
837
837
  end
838
838
 
839
+ # Select records randomly to create a DataFrame.
840
+ # This method calls `indices.sample`.
841
+ # It accepts the same arguments as `Vector#sample`.
842
+ # @note This method requires 'arrow-numo-narray' gem.
843
+ #
844
+ # @overload sample()
845
+ # Return a DataFrame with a randomly selected record.
846
+ #
847
+ # @return [DataFrame]
848
+ # a DataFrame with a single record.
849
+ #
850
+ # @overload sample(n)
851
+ # Return a DataFrame with n records selected at random.
852
+ #
853
+ # @param n [Integer]
854
+ # positive number of records to select.
855
+ # If n is smaller than or equal to `size`, records are selected without repetition.
856
+ # If n is greater than `size`, records are selected repeatedly.
857
+ # @return [DataFrame]
858
+ # a DataFrame with sampled records.
859
+ #
860
+ # @overload sample(prop)
861
+ # Return a DataFrame with records by proportion `prop` at random.
862
+ #
863
+ # @param prop [Float]
864
+ # positive proportion of records to select.
865
+ # The absolute number of records to select, `prop * size`, is rounded (by `half: :up`).
866
+ # If prop is smaller than or equal to 1.0, records are selected without repetition.
867
+ # If prop is greater than 1.0, some records are selected repeatedly.
868
+ # @return [DataFrame]
869
+ # a DataFrame with sampled records.
870
+ #
871
+ # @since 0.5.0
872
+ #
873
+ def sample(n_or_prop = nil)
874
+ slice { indices.sample(n_or_prop) }
875
+ end
876
+
877
+ # Returns a DataFrame with shuffled rows.
878
+ #
879
+ # @note This method requires 'arrow-numo-narray' gem.
880
+ # @note Same behavior as `DataFrame#sample(1.0)`
881
+ # @return (see #sample)
882
+ # @since 0.5.0
883
+ #
884
+ def shuffle
885
+ sample(1.0)
886
+ end
887
+
839
888
  # Select records by index Array to create a DataFrame.
840
889
  #
841
890
  # - TODO: support for option `boundscheck: true`
@@ -4,6 +4,7 @@ module RedAmber
4
4
  # Group class
5
5
  class Group
6
6
  include Enumerable # This feature is experimental
7
+ include Helper
7
8
 
8
9
  using RefineArrowTable
9
10
 
@@ -114,15 +115,27 @@ module RedAmber
114
115
  #
115
116
  def filters
116
117
  @filters ||= begin
117
- first, *others = @group_keys.map do |key|
118
- vector = @dataframe[key]
119
- vector.uniq.each.map { |u| u.nil? ? vector.is_nil : vector == u }
120
- end
121
-
122
- if others.empty?
123
- first.select(&:any?)
124
- else
125
- first.product(*others).map { |a| a.reduce(&:&) }.select(&:any?)
118
+ group_values = group_table[group_keys].each_record.map(&:to_a)
119
+
120
+ Enumerator.new(group_table.n_rows) do |yielder|
121
+ group_values.each do |values|
122
+ booleans =
123
+ values.map.with_index do |value, i|
124
+ column = @dataframe[group_keys[i]].data
125
+ if value.nil?
126
+ Arrow::Function.find('is_null').execute([column])
127
+ elsif value.is_a?(Float) && value.nan?
128
+ Arrow::Function.find('is_nan').execute([column])
129
+ else
130
+ Arrow::Function.find('equal').execute([column, value])
131
+ end
132
+ end
133
+ filter =
134
+ booleans.reduce do |result, datum|
135
+ Arrow::Function.find('and_kleene').execute([result, datum])
136
+ end
137
+ yielder << Vector.create(filter.value)
138
+ end
126
139
  end
127
140
  end
128
141
  end
@@ -147,11 +160,10 @@ module RedAmber
147
160
  # group size.
148
161
  #
149
162
  def each
150
- filters
151
163
  return enum_for(:each) unless block_given?
152
164
 
153
- @filters.each do |filter|
154
- yield @dataframe[filter]
165
+ filters.each do |filter|
166
+ yield @dataframe.filter(filter)
155
167
  end
156
168
  @filters.size
157
169
  end
@@ -174,7 +186,7 @@ module RedAmber
174
186
  # 2 Gentoo 124
175
187
  #
176
188
  def group_count
177
- DataFrame.create(add_columns_to_table(base_table, [:group_count], [group_counts]))
189
+ DataFrame.create(group_table)
178
190
  end
179
191
 
180
192
  # String representation of self.
@@ -186,80 +198,157 @@ module RedAmber
186
198
  #
187
199
  # # =>
188
200
  # #<RedAmber::Group : 0x0000000000003a98>
189
- # species count
190
- # <string> <uint8>
191
- # 0 Adelie 152
192
- # 1 Chinstrap 68
193
- # 2 Gentoo 124
201
+ # species group_count
202
+ # <string> <uint8>
203
+ # 0 Adelie 152
204
+ # 1 Chinstrap 68
205
+ # 2 Gentoo 124
194
206
  #
195
207
  def inspect
196
- "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{count(@group_keys)}"
208
+ "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
197
209
  end
198
210
 
199
211
  # Summarize Group by aggregation functions from the block.
200
212
  #
201
- # @yieldparam group [Group]
202
- # passes group object self.
203
- # @yieldreturn [DataFrame, Array<DataFrame>]
204
- # an aggregated DataFrame or an array of aggregated DataFrames.
205
- # @return [DataFrame]
206
- # summarized DataFrame.
207
- # @example Single function and single variable
208
- # group = penguins.group(:species)
209
- # group
213
+ # @overload summarize
214
+ # Summarize by a function.
215
+ # @yieldparam group [Group]
216
+ # passes group object self.
217
+ # @yieldreturn [DataFrame]
218
+ # @yieldreturn [DataFrame, Array<DataFrame>, Hash{Symbol, String => DataFrame}]
219
+ # an aggregated DataFrame or an array of aggregated DataFrames.
220
+ # @return [DataFrame]
221
+ # summarized DataFrame.
222
+ # @example Single function and single variable
223
+ # group = penguins.group(:species)
224
+ # group
210
225
  #
211
- # # =>
212
- # #<RedAmber::Group : 0x000000000000c314>
213
- # species count
214
- # <string> <uint8>
215
- # 0 Adelie 152
216
- # 1 Chinstrap 68
217
- # 2 Gentoo 124
226
+ # # =>
227
+ # #<RedAmber::Group : 0x000000000000c314>
228
+ # species group_count
229
+ # <string> <uint8>
230
+ # 0 Adelie 152
231
+ # 1 Chinstrap 68
232
+ # 2 Gentoo 124
218
233
  #
219
- # group.summarize { mean(:bill_length_mm) }
234
+ # group.summarize { mean(:bill_length_mm) }
220
235
  #
221
- # # =>
222
- # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
223
- # species mean(bill_length_mm)
224
- # <string> <double>
225
- # 0 Adelie 38.79
226
- # 1 Chinstrap 48.83
227
- # 2 Gentoo 47.5
236
+ # # =>
237
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
238
+ # species mean(bill_length_mm)
239
+ # <string> <double>
240
+ # 0 Adelie 38.79
241
+ # 1 Chinstrap 48.83
242
+ # 2 Gentoo 47.5
228
243
  #
229
- # @example Single function only
230
- # group.summarize { mean }
244
+ # @example Single function only
245
+ # group.summarize { mean }
231
246
  #
232
- # # =>
233
- # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
234
- # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
235
- # <string> <double> <double> ... <double>
236
- # 0 Adelie 38.79 18.35 ... 2008.01
237
- # 1 Chinstrap 48.83 18.42 ... 2007.97
238
- # 2 Gentoo 47.5 14.98 ... 2008.08
247
+ # # =>
248
+ # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
249
+ # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
250
+ # <string> <double> <double> ... <double>
251
+ # 0 Adelie 38.79 18.35 ... 2008.01
252
+ # 1 Chinstrap 48.83 18.42 ... 2007.97
253
+ # 2 Gentoo 47.5 14.98 ... 2008.08
239
254
  #
240
- # @example Multiple functions
241
- # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
255
+ # @overload summarize
256
+ # Summarize by a function.
242
257
  #
243
- # # =>
244
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
245
- # species min(bill_length_mm) max(bill_length_mm)
246
- # <string> <double> <double>
247
- # 0 Adelie 32.1 46.0
248
- # 1 Chinstrap 40.9 58.0
249
- # 2 Gentoo 40.9 59.6
250
- #
251
- def summarize(&block)
252
- agg = instance_eval(&block)
258
+ # @yieldparam group [Group]
259
+ # passes group object self.
260
+ # @yieldreturn [Array<DataFrame>]
261
+ # an array of aggregated DataFrames.
262
+ # @return [DataFrame]
263
+ # summarized DataFrame.
264
+ # @example Multiple functions
265
+ # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
266
+ #
267
+ # # =>
268
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
269
+ # species min(bill_length_mm) max(bill_length_mm)
270
+ # <string> <double> <double>
271
+ # 0 Adelie 32.1 46.0
272
+ # 1 Chinstrap 40.9 58.0
273
+ # 2 Gentoo 40.9 59.6
274
+ #
275
+ # @overload summarize
276
+ # Summarize by a function.
277
+ #
278
+ # @yieldparam group [Group]
279
+ # passes group object self.
280
+ # @yieldreturn [Hash{Symbol, String => DataFrame}]
281
+ # a Hash mapping new column names to aggregated DataFrames.
282
+ # The DataFrame must return only one aggregated column.
283
+ # @return [DataFrame]
284
+ # summarized DataFrame.
285
+ # @example Rename column name by Hash
286
+ # group.summarize {
287
+ # {
288
+ # min_bill_length_mm: min(:bill_length_mm),
289
+ # max_bill_length_mm: max(:bill_length_mm),
290
+ # }
291
+ # }
292
+ #
293
+ # # =>
294
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
295
+ # species min_bill_length_mm max_bill_length_mm
296
+ # <string> <double> <double>
297
+ # 0 Adelie 32.1 46.0
298
+ # 1 Chinstrap 40.9 58.0
299
+ # 2 Gentoo 40.9 59.6
300
+ #
301
+ def summarize(*args, &block)
302
+ if block
303
+ agg = instance_eval(&block)
304
+ unless args.empty?
305
+ agg = [agg] if agg.is_a?(DataFrame)
306
+ agg = args.zip(agg).to_h
307
+ end
308
+ else
309
+ agg = args
310
+ end
311
+
253
312
  case agg
254
313
  when DataFrame
255
314
  agg
256
315
  when Array
257
- agg.reduce { |aggregated, df| aggregated.assign(df.to_h) }
316
+ aggregations =
317
+ agg.map do |df|
318
+ v = df.vectors[-1]
319
+ [v.key, v]
320
+ end
321
+ agg[0].assign(aggregations)
322
+ when Hash
323
+ aggregations =
324
+ agg.map do |key, df|
325
+ aggregated_keys = df.keys - @group_keys
326
+ if aggregated_keys.size > 1
327
+ message =
328
+ "accept only one column from the Hash: #{aggregated_keys.join(', ')}"
329
+ raise GroupArgumentError, message
330
+ end
331
+
332
+ v = df.vectors[-1]
333
+ [key, v]
334
+ end
335
+ agg.values[-1].drop(-1).assign(aggregations)
258
336
  else
259
337
  raise GroupArgumentError, "Unknown argument: #{agg}"
260
338
  end
261
339
  end
262
340
 
341
+ # Return grouped DataFrame only for group keys.
342
+ #
343
+ # @return [DataFrame]
344
+ # grouped DataFrame projected only for group_keys.
345
+ # @since 0.5.0
346
+ #
347
+ def grouped_frame
348
+ DataFrame.create(group_table[group_keys])
349
+ end
350
+ alias_method :none, :grouped_frame
351
+
263
352
  # Aggregating summary.
264
353
  #
265
354
  # @api private
@@ -270,37 +359,49 @@ module RedAmber
270
359
 
271
360
  private
272
361
 
273
- def build_aggregation_keys(function_name, summary_keys)
274
- if summary_keys.empty?
275
- [function_name]
276
- else
277
- summary_keys.map { |key| "#{function_name}(#{key})" }
278
- end
279
- end
280
-
281
- # @note `@group_counts.sum == @dataframe.size``
282
- def group_counts
283
- @group_counts ||= filters.map(&:sum)
362
+ def group_table
363
+ @group_table ||= build_aggregated_table
284
364
  end
285
365
 
286
- def base_table
287
- @base_table ||= begin
288
- indexes = filters.map { |filter| filter.index(true) }
289
- @dataframe.table[@group_keys].take(indexes)
366
+ def build_aggregated_table
367
+ keys = @group_keys
368
+ key = keys[0]
369
+ table = @dataframe.table
370
+
371
+ plan = Arrow::ExecutePlan.new
372
+ source_node = plan.build_source_node(table)
373
+
374
+ aggregate_node =
375
+ plan.build_aggregate_node(source_node, {
376
+ aggregations: [{ function: 'hash_count',
377
+ input: key }], keys: keys
378
+ })
379
+ expressions = keys.map { |k| Arrow::FieldExpression.new(k) }
380
+ null_count = Arrow::Function.find('is_null').execute([table[key]]).value.sum
381
+ count_field = Arrow::FieldExpression.new("count(#{key})")
382
+ if null_count.zero?
383
+ expressions << count_field
384
+ else
385
+ is_zero =
386
+ Arrow::CallExpression.new('equal', [count_field, Arrow::Int64Scalar.new(0)])
387
+ null_count_scalar = Arrow::Int64Scalar.new(null_count)
388
+ expressions <<
389
+ Arrow::CallExpression.new('if_else', [
390
+ is_zero, null_count_scalar, count_field
391
+ ])
290
392
  end
291
- end
393
+ options = Arrow::ProjectNodeOptions.new(expressions, keys + [:group_count])
394
+ project_node = plan.build_project_node(aggregate_node, options)
292
395
 
293
- def add_columns_to_table(table, keys, data_arrays)
294
- fields = table.schema.fields
295
- arrays = table.columns.map(&:data)
396
+ sink_and_start_plan(plan, project_node)
397
+ end
296
398
 
297
- keys.zip(data_arrays).each do |key, array|
298
- data = Arrow::ChunkedArray.new([array])
299
- fields << Arrow::Field.new(key, data.value_data_type)
300
- arrays << data
399
+ def build_aggregation_keys(function_name, summary_keys)
400
+ if summary_keys.empty?
401
+ [function_name]
402
+ else
403
+ summary_keys.map { |key| "#{function_name}(#{key})" }
301
404
  end
302
-
303
- Arrow::Table.new(Arrow::Schema.new(fields), arrays)
304
405
  end
305
406
 
306
407
  # Call Vector aggregating function and return an array of arrays:
@@ -78,6 +78,32 @@ module RedAmber
78
78
  Array(range)
79
79
  end
80
80
  end
81
+
82
+ # Create sink node and execute plan
83
+ #
84
+ # @param plan [Arrow::ExecutePlan]
85
+ # Execute plan of Acero.
86
+ # @param node [Arrow::ExecuteNode]
87
+ # Execute node of Acero.
88
+ # @param output_schema [Arrow::Schema, nil]
89
+ # Schema of table to output. If it is nil, output_schema of
90
+ # sink node is used.
91
+ # @return [Arrow::Table]
92
+ # Result of plan.
93
+ # @since 0.5.0
94
+ #
95
+ def sink_and_start_plan(plan, node, output_schema: nil)
96
+ sink_node_options = Arrow::SinkNodeOptions.new
97
+ plan.build_sink_node(node, sink_node_options)
98
+ plan.validate
99
+ plan.start
100
+ plan.wait
101
+ output_schema = node.output_schema if output_schema.nil?
102
+ reader = sink_node_options.get_reader(output_schema)
103
+ table = reader.read_all
104
+ plan.stop
105
+ table
106
+ end
81
107
  end
82
108
 
83
109
  # rubocop:disable Layout/LineLength
@@ -20,6 +20,7 @@ module RedAmber
20
20
  @sizes = []
21
21
  end
22
22
 
23
+ # Generic iterator method
23
24
  def each
24
25
  @selectors.each
25
26
  end
@@ -27,14 +28,20 @@ module RedAmber
27
28
 
28
29
  # Boolean selectors of sub-dataframes
29
30
  class Filters < Selectors
31
+ # Return sizes of filter
32
+ # @return [Array<Integer>]
33
+ # sizes of each sub dataframes.
34
+ # Counts true for each filter.
30
35
  def sizes
31
- # count true
32
36
  @sizes = @selectors.map { |s| s.to_a.count { _1 } } # rubocop:disable Performance/Size
33
37
  end
34
38
  end
35
39
 
36
40
  # Index selectors of sub-dataframes
37
41
  class Indices < Selectors
42
+ # Return sizes of selector indices.
43
+ # @return [Array<Integer>]
44
+ # sizes of each sub dataframes.
38
45
  def sizes
39
46
  @sizes = @selectors.map(&:size)
40
47
  end
@@ -93,7 +100,7 @@ module RedAmber
93
100
  # @since 0.4.0
94
101
  #
95
102
  def by_group(group)
96
- SubFrames.new(group.dataframe, group.filters)
103
+ SubFrames.by_filters(group.dataframe, group.filters)
97
104
  end
98
105
 
99
106
  # Create a new SubFrames object from a DataFrame and an array of indices.
@@ -291,15 +298,15 @@ module RedAmber
291
298
  selectors = yield(dataframe)
292
299
  end
293
300
 
294
- if dataframe.empty? || selectors.nil? || selectors.empty?
301
+ if dataframe.empty? || selectors.nil? || selectors.size.zero? # rubocop:disable Style/ZeroLengthPredicate
295
302
  @baseframe = DataFrame.new
296
303
  @selectors = Selectors.new([])
297
304
  else
298
305
  @baseframe = dataframe
299
306
  @selectors =
300
- if selectors[0].boolean?
307
+ if selectors.first.boolean?
301
308
  Filters.new(selectors)
302
- elsif selectors[0].numeric?
309
+ elsif selectors.first.numeric?
303
310
  Indices.new(selectors)
304
311
  else
305
312
  raise SubFramesArgumentError, "illegal type: #{selectors}"
@@ -10,21 +10,54 @@ module RedAmber
10
10
  include ArrowFunction
11
11
  include VectorUpdatable
12
12
  include VectorSelectable
13
+ include VectorStringFunction
13
14
 
14
15
  using RefineArrayLike
15
16
 
16
- # Quicker constructor of Vector.
17
+ # Entity of Vector.
17
18
  #
18
- # @param arrow_array [Arrow::Array]
19
- # Arrow::Array object to have in the Vector.
20
- # @return [Vector]
21
- # created Vector.
22
- # @note This method doesn't check argment type.
19
+ # @return [Arrow::Array]
20
+ #
21
+ attr_reader :data
22
+ alias_method :to_arrow_array, :data
23
+
24
+ # Associated key name when self is in a DataFrame.
25
+ #
26
+ # Default Vector is 'head-less' (key-less).
27
+ # @return [Symbol]
23
28
  #
24
- def self.create(arrow_array)
25
- instance = allocate
26
- instance.instance_variable_set(:@data, arrow_array)
27
- instance
29
+ attr_accessor :key
30
+
31
+ class << self
32
+ # Create a Vector (calling `.new`).
33
+ #
34
+ # @param (see #initialize)
35
+ # @return (see #initialize)
36
+ # @example Create an empty Vector.
37
+ # Vector[]
38
+ # # =>
39
+ # #<RedAmber::Vector(:string, size=0):0x000000000000e2cc>
40
+ # []
41
+ #
42
+ # @since 0.5.0
43
+ #
44
+ def [](...)
45
+ new(...)
46
+ end
47
+
48
+ # Quicker constructor of Vector.
49
+ #
50
+ # @param arrow_array [Arrow::Array]
51
+ # Arrow::Array object to have in the Vector.
52
+ # @return [Vector]
53
+ # created Vector.
54
+ # @note This method doesn't check argument type.
55
+ #
56
+ def create(arrow_array)
57
+ instance = allocate
58
+ instance.instance_variable_set(:@data, arrow_array)
59
+ instance
60
+ end
28
61
  end
29
62
 
30
63
  # Create a Vector.
@@ -51,20 +84,6 @@ module RedAmber
51
84
  end
52
85
  end
53
86
 
54
- # Entity of Vector.
55
- #
56
- # @return [Arrow::Array]
57
- #
58
- attr_reader :data
59
- alias_method :to_arrow_array, :data
60
-
61
- # Associated key name when self is in a DataFrame.
62
- #
63
- # Default Vector is 'head-less' (key-less).
64
- # @return [Symbol]
65
- #
66
- attr_accessor :key
67
-
68
87
  # Return other as a Vector which is same data type as self.
69
88
  #
70
89
  # @param other [Vector, Array, Arrow::Array, Arrow::ChunkedArray]
@@ -161,6 +161,22 @@ module RedAmber
161
161
  #
162
162
  define_unary_aggregation :min_max
163
163
 
164
+ # Compute the 1 most common values and their respective
165
+ # occurrence counts.
166
+ #
167
+ # @note Self must be a numeric or a boolean Vector.
168
+ # @note ModeOptions are not supported in 0.5.0.
169
+ # Only one mode value is returned.
170
+ # @api private
171
+ # @return [Hash{'mode'=>mode, 'count'=>count}]
172
+ # the mode and its count for self.
173
+ # @since 0.5.0
174
+ #
175
+ def mode
176
+ datum = find(:mode).execute([data])
177
+ datum.value.to_a.first
178
+ end
179
+
164
180
  # Compute product value of self.
165
181
  #
166
182
  # @note Self must be a numeric Vector.
@@ -241,6 +257,16 @@ module RedAmber
241
257
  # - nearest: returns i or j, whichever is closer.
242
258
  # - midpoint: returns (i + j) / 2.
243
259
 
260
+ # Get a non-nil element in self.
261
+ #
262
+ # @return [Object, nil]
263
+ # first non-nil value detected. If all elements are nil, return nil.
264
+ # @since 0.5.0
265
+ #
266
+ def one
267
+ each.find { !_1.nil? }
268
+ end
269
+
244
270
  # Returns a quantile value.
245
271
  # - 0.5 quantile (median) is returned by default.
246
272
  # - Or return quantile for specified probability (prob).