red_amber 0.4.2 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -836,6 +836,55 @@ module RedAmber
836
836
  tail(n_obs)
837
837
  end
838
838
 
839
+ # Select records randomly to create a DataFrame.
840
+ # This method calls `indices.sample`.
841
+ # We can use the same arguments in `Vector#sample`.
842
+ # @note This method requires 'arrow-numo-narray' gem.
843
+ #
844
+ # @overload sample()
845
+ # Return a DataFrame with a randomly selected record.
846
+ #
847
+ # @return [DataFrame]
848
+ # a DataFrame with single record.
849
+ #
850
+ # @overload sample(n)
851
+ # Return a DataFrame with n records selected at random.
852
+ #
853
+ # @param n [Integer]
854
+ # positive number of records to select.
855
+ # If n is smaller than or equal to `size`, records are selected without repetition.
856
+ # If n is greater than `size`, records are selected repeatedly.
857
+ # @return [DataFrame]
858
+ # a DataFrame with sampled records.
859
+ #
860
+ # @overload sample(prop)
861
+ # Return a DataFrame with records by proportion `prop` at random.
862
+ #
863
+ # @param prop [Float]
864
+ # positive proportion of records to select.
865
+ # The absolute number of records to select, `prop * size`, is rounded (by `half: :up`).
866
+ # If prop is smaller than or equal to 1.0, records are selected without repetition.
867
+ # If prop is greater than 1.0, some records are selected repeatedly.
868
+ # @return [DataFrame]
869
+ # a DataFrame with sampled records.
870
+ #
871
+ # @since 0.5.0
872
+ #
873
+ def sample(n_or_prop = nil)
874
+ slice { indices.sample(n_or_prop) }
875
+ end
876
+
877
+ # Returns a DataFrame with shuffled rows.
878
+ #
879
+ # @note This method requires 'arrow-numo-narray' gem.
880
+ # @note Same behavior as `DataFrame#sample(1.0)`
881
+ # @return (see #sample)
882
+ # @since 0.5.0
883
+ #
884
+ def shuffle
885
+ sample(1.0)
886
+ end
887
+
839
888
  # Select records by index Array to create a DataFrame.
840
889
  #
841
890
  # - TODO: support for option `boundscheck: true`
@@ -4,6 +4,7 @@ module RedAmber
4
4
  # Group class
5
5
  class Group
6
6
  include Enumerable # This feature is experimental
7
+ include Helper
7
8
 
8
9
  using RefineArrowTable
9
10
 
@@ -114,15 +115,27 @@ module RedAmber
114
115
  #
115
116
  def filters
116
117
  @filters ||= begin
117
- first, *others = @group_keys.map do |key|
118
- vector = @dataframe[key]
119
- vector.uniq.each.map { |u| u.nil? ? vector.is_nil : vector == u }
120
- end
121
-
122
- if others.empty?
123
- first.select(&:any?)
124
- else
125
- first.product(*others).map { |a| a.reduce(&:&) }.select(&:any?)
118
+ group_values = group_table[group_keys].each_record.map(&:to_a)
119
+
120
+ Enumerator.new(group_table.n_rows) do |yielder|
121
+ group_values.each do |values|
122
+ booleans =
123
+ values.map.with_index do |value, i|
124
+ column = @dataframe[group_keys[i]].data
125
+ if value.nil?
126
+ Arrow::Function.find('is_null').execute([column])
127
+ elsif value.is_a?(Float) && value.nan?
128
+ Arrow::Function.find('is_nan').execute([column])
129
+ else
130
+ Arrow::Function.find('equal').execute([column, value])
131
+ end
132
+ end
133
+ filter =
134
+ booleans.reduce do |result, datum|
135
+ Arrow::Function.find('and_kleene').execute([result, datum])
136
+ end
137
+ yielder << Vector.create(filter.value)
138
+ end
126
139
  end
127
140
  end
128
141
  end
@@ -147,11 +160,10 @@ module RedAmber
147
160
  # group size.
148
161
  #
149
162
  def each
150
- filters
151
163
  return enum_for(:each) unless block_given?
152
164
 
153
- @filters.each do |filter|
154
- yield @dataframe[filter]
165
+ filters.each do |filter|
166
+ yield @dataframe.filter(filter)
155
167
  end
156
168
  @filters.size
157
169
  end
@@ -174,7 +186,7 @@ module RedAmber
174
186
  # 2 Gentoo 124
175
187
  #
176
188
  def group_count
177
- DataFrame.create(add_columns_to_table(base_table, [:group_count], [group_counts]))
189
+ DataFrame.create(group_table)
178
190
  end
179
191
 
180
192
  # String representation of self.
@@ -186,80 +198,157 @@ module RedAmber
186
198
  #
187
199
  # # =>
188
200
  # #<RedAmber::Group : 0x0000000000003a98>
189
- # species count
190
- # <string> <uint8>
191
- # 0 Adelie 152
192
- # 1 Chinstrap 68
193
- # 2 Gentoo 124
201
+ # species group_count
202
+ # <string> <uint8>
203
+ # 0 Adelie 152
204
+ # 1 Chinstrap 68
205
+ # 2 Gentoo 124
194
206
  #
195
207
  def inspect
196
- "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{count(@group_keys)}"
208
+ "#<#{self.class} : #{format('0x%016x', object_id)}>\n#{group_count}"
197
209
  end
198
210
 
199
211
  # Summarize Group by aggregation functions from the block.
200
212
  #
201
- # @yieldparam group [Group]
202
- # passes group object self.
203
- # @yieldreturn [DataFrame, Array<DataFrame>]
204
- # an aggregated DataFrame or an array of aggregated DataFrames.
205
- # @return [DataFrame]
206
- # summarized DataFrame.
207
- # @example Single function and single variable
208
- # group = penguins.group(:species)
209
- # group
213
+ # @overload summarize
214
+ # Summarize by a function.
215
+ # @yieldparam group [Group]
216
+ # passes group object self.
217
+ # @yieldreturn [DataFrame]
218
+ # @yieldreturn [DataFrame, Array<DataFrame>, Hash{Symbol, String => DataFrame}]
219
+ # an aggregated DataFrame or an array of aggregated DataFrames.
220
+ # @return [DataFrame]
221
+ # summarized DataFrame.
222
+ # @example Single function and single variable
223
+ # group = penguins.group(:species)
224
+ # group
210
225
  #
211
- # # =>
212
- # #<RedAmber::Group : 0x000000000000c314>
213
- # species count
214
- # <string> <uint8>
215
- # 0 Adelie 152
216
- # 1 Chinstrap 68
217
- # 2 Gentoo 124
226
+ # # =>
227
+ # #<RedAmber::Group : 0x000000000000c314>
228
+ # species group_count
229
+ # <string> <uint8>
230
+ # 0 Adelie 152
231
+ # 1 Chinstrap 68
232
+ # 2 Gentoo 124
218
233
  #
219
- # group.summarize { mean(:bill_length_mm) }
234
+ # group.summarize { mean(:bill_length_mm) }
220
235
  #
221
- # # =>
222
- # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
223
- # species mean(bill_length_mm)
224
- # <string> <double>
225
- # 0 Adelie 38.79
226
- # 1 Chinstrap 48.83
227
- # 2 Gentoo 47.5
236
+ # # =>
237
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c364>
238
+ # species mean(bill_length_mm)
239
+ # <string> <double>
240
+ # 0 Adelie 38.79
241
+ # 1 Chinstrap 48.83
242
+ # 2 Gentoo 47.5
228
243
  #
229
- # @example Single function only
230
- # group.summarize { mean }
244
+ # @example Single function only
245
+ # group.summarize { mean }
231
246
  #
232
- # # =>
233
- # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
234
- # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
235
- # <string> <double> <double> ... <double>
236
- # 0 Adelie 38.79 18.35 ... 2008.01
237
- # 1 Chinstrap 48.83 18.42 ... 2007.97
238
- # 2 Gentoo 47.5 14.98 ... 2008.08
247
+ # # =>
248
+ # #<RedAmber::DataFrame : 3 x 6 Vectors, 0x000000000000c350>
249
+ # species mean(bill_length_mm) mean(bill_depth_mm) ... mean(year)
250
+ # <string> <double> <double> ... <double>
251
+ # 0 Adelie 38.79 18.35 ... 2008.01
252
+ # 1 Chinstrap 48.83 18.42 ... 2007.97
253
+ # 2 Gentoo 47.5 14.98 ... 2008.08
239
254
  #
240
- # @example Multiple functions
241
- # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
255
+ # @overload summarize
256
+ # Summarize by multiple functions.
242
257
  #
243
- # # =>
244
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
245
- # species min(bill_length_mm) max(bill_length_mm)
246
- # <string> <double> <double>
247
- # 0 Adelie 32.1 46.0
248
- # 1 Chinstrap 40.9 58.0
249
- # 2 Gentoo 40.9 59.6
250
- #
251
- def summarize(&block)
252
- agg = instance_eval(&block)
258
+ # @yieldparam group [Group]
259
+ # passes group object self.
260
+ # @yieldreturn [Array<DataFrame>]
261
+ # an array of aggregated DataFrames.
262
+ # @return [DataFrame]
263
+ # summarized DataFrame.
264
+ # @example Multiple functions
265
+ # group.summarize { [min(:bill_length_mm), max(:bill_length_mm)] }
266
+ #
267
+ # # =>
268
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
269
+ # species min(bill_length_mm) max(bill_length_mm)
270
+ # <string> <double> <double>
271
+ # 0 Adelie 32.1 46.0
272
+ # 1 Chinstrap 40.9 58.0
273
+ # 2 Gentoo 40.9 59.6
274
+ #
275
+ # @overload summarize
276
+ # Summarize by functions and rename the aggregated columns by a Hash.
277
+ #
278
+ # @yieldparam group [Group]
279
+ # passes group object self.
280
+ # @yieldreturn [Hash{Symbol, String => DataFrame}]
281
+ # a Hash of new column names to aggregated DataFrames.
282
+ # Each DataFrame in the Hash must have only one aggregated column.
283
+ # @return [DataFrame]
284
+ # summarized DataFrame.
285
+ # @example Rename column name by Hash
286
+ # group.summarize {
287
+ # {
288
+ # min_bill_length_mm: min(:bill_length_mm),
289
+ # max_bill_length_mm: max(:bill_length_mm),
290
+ # }
291
+ # }
292
+ #
293
+ # # =>
294
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
295
+ # species min_bill_length_mm max_bill_length_mm
296
+ # <string> <double> <double>
297
+ # 0 Adelie 32.1 46.0
298
+ # 1 Chinstrap 40.9 58.0
299
+ # 2 Gentoo 40.9 59.6
300
+ #
301
+ def summarize(*args, &block)
302
+ if block
303
+ agg = instance_eval(&block)
304
+ unless args.empty?
305
+ agg = [agg] if agg.is_a?(DataFrame)
306
+ agg = args.zip(agg).to_h
307
+ end
308
+ else
309
+ agg = args
310
+ end
311
+
253
312
  case agg
254
313
  when DataFrame
255
314
  agg
256
315
  when Array
257
- agg.reduce { |aggregated, df| aggregated.assign(df.to_h) }
316
+ aggregations =
317
+ agg.map do |df|
318
+ v = df.vectors[-1]
319
+ [v.key, v]
320
+ end
321
+ agg[0].assign(aggregations)
322
+ when Hash
323
+ aggregations =
324
+ agg.map do |key, df|
325
+ aggregated_keys = df.keys - @group_keys
326
+ if aggregated_keys.size > 1
327
+ message =
328
+ "accept only one column from the Hash: #{aggregated_keys.join(', ')}"
329
+ raise GroupArgumentError, message
330
+ end
331
+
332
+ v = df.vectors[-1]
333
+ [key, v]
334
+ end
335
+ agg.values[-1].drop(-1).assign(aggregations)
258
336
  else
259
337
  raise GroupArgumentError, "Unknown argument: #{agg}"
260
338
  end
261
339
  end
262
340
 
341
+ # Return grouped DataFrame only for group keys.
342
+ #
343
+ # @return [DataFrame]
344
+ # grouped DataFrame projected only for group_keys.
345
+ # @since 0.5.0
346
+ #
347
+ def grouped_frame
348
+ DataFrame.create(group_table[group_keys])
349
+ end
350
+ alias_method :none, :grouped_frame
351
+
263
352
  # Aggregating summary.
264
353
  #
265
354
  # @api private
@@ -270,37 +359,49 @@ module RedAmber
270
359
 
271
360
  private
272
361
 
273
- def build_aggregation_keys(function_name, summary_keys)
274
- if summary_keys.empty?
275
- [function_name]
276
- else
277
- summary_keys.map { |key| "#{function_name}(#{key})" }
278
- end
279
- end
280
-
281
- # @note `@group_counts.sum == @dataframe.size``
282
- def group_counts
283
- @group_counts ||= filters.map(&:sum)
362
+ def group_table
363
+ @group_table ||= build_aggregated_table
284
364
  end
285
365
 
286
- def base_table
287
- @base_table ||= begin
288
- indexes = filters.map { |filter| filter.index(true) }
289
- @dataframe.table[@group_keys].take(indexes)
366
+ def build_aggregated_table
367
+ keys = @group_keys
368
+ key = keys[0]
369
+ table = @dataframe.table
370
+
371
+ plan = Arrow::ExecutePlan.new
372
+ source_node = plan.build_source_node(table)
373
+
374
+ aggregate_node =
375
+ plan.build_aggregate_node(source_node, {
376
+ aggregations: [{ function: 'hash_count',
377
+ input: key }], keys: keys
378
+ })
379
+ expressions = keys.map { |k| Arrow::FieldExpression.new(k) }
380
+ null_count = Arrow::Function.find('is_null').execute([table[key]]).value.sum
381
+ count_field = Arrow::FieldExpression.new("count(#{key})")
382
+ if null_count.zero?
383
+ expressions << count_field
384
+ else
385
+ is_zero =
386
+ Arrow::CallExpression.new('equal', [count_field, Arrow::Int64Scalar.new(0)])
387
+ null_count_scalar = Arrow::Int64Scalar.new(null_count)
388
+ expressions <<
389
+ Arrow::CallExpression.new('if_else', [
390
+ is_zero, null_count_scalar, count_field
391
+ ])
290
392
  end
291
- end
393
+ options = Arrow::ProjectNodeOptions.new(expressions, keys + [:group_count])
394
+ project_node = plan.build_project_node(aggregate_node, options)
292
395
 
293
- def add_columns_to_table(table, keys, data_arrays)
294
- fields = table.schema.fields
295
- arrays = table.columns.map(&:data)
396
+ sink_and_start_plan(plan, project_node)
397
+ end
296
398
 
297
- keys.zip(data_arrays).each do |key, array|
298
- data = Arrow::ChunkedArray.new([array])
299
- fields << Arrow::Field.new(key, data.value_data_type)
300
- arrays << data
399
+ def build_aggregation_keys(function_name, summary_keys)
400
+ if summary_keys.empty?
401
+ [function_name]
402
+ else
403
+ summary_keys.map { |key| "#{function_name}(#{key})" }
301
404
  end
302
-
303
- Arrow::Table.new(Arrow::Schema.new(fields), arrays)
304
405
  end
305
406
 
306
407
  # Call Vector aggregating function and return an array of arrays:
@@ -78,6 +78,32 @@ module RedAmber
78
78
  Array(range)
79
79
  end
80
80
  end
81
+
82
+ # Create sink node and execute plan
83
+ #
84
+ # @param plan [Arrow::ExecutePlan]
85
+ # Execute plan of Acero.
86
+ # @param node [Arrow::ExecuteNode]
87
+ # Execute node of Acero.
88
+ # @param output_schema [Arrow::Schema, nil]
89
+ # Schema of table to output. If it is nil, output_schema of
90
+ # sink node is used.
91
+ # @return [Arrow::Table]
92
+ # Result of plan.
93
+ # @since 0.5.0
94
+ #
95
+ def sink_and_start_plan(plan, node, output_schema: nil)
96
+ sink_node_options = Arrow::SinkNodeOptions.new
97
+ plan.build_sink_node(node, sink_node_options)
98
+ plan.validate
99
+ plan.start
100
+ plan.wait
101
+ output_schema = node.output_schema if output_schema.nil?
102
+ reader = sink_node_options.get_reader(output_schema)
103
+ table = reader.read_all
104
+ plan.stop
105
+ table
106
+ end
81
107
  end
82
108
 
83
109
  # rubocop:disable Layout/LineLength
@@ -20,6 +20,7 @@ module RedAmber
20
20
  @sizes = []
21
21
  end
22
22
 
23
+ # Generic iterator method
23
24
  def each
24
25
  @selectors.each
25
26
  end
@@ -27,14 +28,20 @@ module RedAmber
27
28
 
28
29
  # Boolean selectors of sub-dataframes
29
30
  class Filters < Selectors
31
+ # Return sizes of filter
32
+ # @return [Array<Integer>]
33
+ # sizes of each sub-dataframe.
34
+ # Counts true for each filter.
30
35
  def sizes
31
- # count true
32
36
  @sizes = @selectors.map { |s| s.to_a.count { _1 } } # rubocop:disable Performance/Size
33
37
  end
34
38
  end
35
39
 
36
40
  # Index selectors of sub-dataframes
37
41
  class Indices < Selectors
42
+ # Return sizes of selector indices.
43
+ # @return [Array<Integer>]
44
+ # sizes of each sub dataframes.
38
45
  def sizes
39
46
  @sizes = @selectors.map(&:size)
40
47
  end
@@ -93,7 +100,7 @@ module RedAmber
93
100
  # @since 0.4.0
94
101
  #
95
102
  def by_group(group)
96
- SubFrames.new(group.dataframe, group.filters)
103
+ SubFrames.by_filters(group.dataframe, group.filters)
97
104
  end
98
105
 
99
106
  # Create a new SubFrames object from a DataFrame and an array of indices.
@@ -291,15 +298,15 @@ module RedAmber
291
298
  selectors = yield(dataframe)
292
299
  end
293
300
 
294
- if dataframe.empty? || selectors.nil? || selectors.empty?
301
+ if dataframe.empty? || selectors.nil? || selectors.size.zero? # rubocop:disable Style/ZeroLengthPredicate
295
302
  @baseframe = DataFrame.new
296
303
  @selectors = Selectors.new([])
297
304
  else
298
305
  @baseframe = dataframe
299
306
  @selectors =
300
- if selectors[0].boolean?
307
+ if selectors.first.boolean?
301
308
  Filters.new(selectors)
302
- elsif selectors[0].numeric?
309
+ elsif selectors.first.numeric?
303
310
  Indices.new(selectors)
304
311
  else
305
312
  raise SubFramesArgumentError, "illegal type: #{selectors}"
@@ -10,21 +10,54 @@ module RedAmber
10
10
  include ArrowFunction
11
11
  include VectorUpdatable
12
12
  include VectorSelectable
13
+ include VectorStringFunction
13
14
 
14
15
  using RefineArrayLike
15
16
 
16
- # Quicker constructor of Vector.
17
+ # Entity of Vector.
17
18
  #
18
- # @param arrow_array [Arrow::Array]
19
- # Arrow::Array object to have in the Vector.
20
- # @return [Vector]
21
- # created Vector.
22
- # @note This method doesn't check argment type.
19
+ # @return [Arrow::Array]
20
+ #
21
+ attr_reader :data
22
+ alias_method :to_arrow_array, :data
23
+
24
+ # Associated key name when self is in a DataFrame.
25
+ #
26
+ # Default Vector is 'head-less' (key-less).
27
+ # @return [Symbol]
23
28
  #
24
- def self.create(arrow_array)
25
- instance = allocate
26
- instance.instance_variable_set(:@data, arrow_array)
27
- instance
29
+ attr_accessor :key
30
+
31
+ class << self
32
+ # Create a Vector (calling `.new`).
33
+ #
34
+ # @param (see #initialize)
35
+ # @return (see #initialize)
36
+ # @example Create an empty Vector.
37
+ # Vector[]
38
+ # # =>
39
+ # #<RedAmber::Vector(:string, size=0):0x000000000000e2cc>
40
+ # []
41
+ #
42
+ # @since 0.5.0
43
+ #
44
+ def [](...)
45
+ new(...)
46
+ end
47
+
48
+ # Quicker constructor of Vector.
49
+ #
50
+ # @param arrow_array [Arrow::Array]
51
+ # Arrow::Array object to have in the Vector.
52
+ # @return [Vector]
53
+ # created Vector.
54
+ # @note This method doesn't check argument type.
55
+ #
56
+ def create(arrow_array)
57
+ instance = allocate
58
+ instance.instance_variable_set(:@data, arrow_array)
59
+ instance
60
+ end
28
61
  end
29
62
 
30
63
  # Create a Vector.
@@ -51,20 +84,6 @@ module RedAmber
51
84
  end
52
85
  end
53
86
 
54
- # Entity of Vector.
55
- #
56
- # @return [Arrow::Array]
57
- #
58
- attr_reader :data
59
- alias_method :to_arrow_array, :data
60
-
61
- # Associated key name when self is in a DataFrame.
62
- #
63
- # Default Vector is 'head-less' (key-less).
64
- # @return [Symbol]
65
- #
66
- attr_accessor :key
67
-
68
87
  # Return other as a Vector which is same data type as self.
69
88
  #
70
89
  # @param other [Vector, Array, Arrow::Array, Arrow::ChunkedArray]
@@ -161,6 +161,22 @@ module RedAmber
161
161
  #
162
162
  define_unary_aggregation :min_max
163
163
 
164
+ # Compute the most common value and its
165
+ # occurrence count.
166
+ #
167
+ # @note Self must be a numeric or a boolean Vector.
168
+ # @note ModeOptions are not supported in 0.5.0.
169
+ # Only one mode value is returned.
170
+ # @api private
171
+ # @return [Hash{'mode'=>mode, 'count'=>count}]
172
+ # mode and count of self in a Hash.
173
+ # @since 0.5.0
174
+ #
175
+ def mode
176
+ datum = find(:mode).execute([data])
177
+ datum.value.to_a.first
178
+ end
179
+
164
180
  # Compute product value of self.
165
181
  #
166
182
  # @note Self must be a numeric Vector.
@@ -241,6 +257,16 @@ module RedAmber
241
257
  # - nearest: returns i or j, whichever is closer.
242
258
  # - midpoint: returns (i + j) / 2.
243
259
 
260
+ # Get a non-nil element in self.
261
+ #
262
+ # @return [Object, nil]
263
+ # first non-nil value detected. If all elements are nil, return nil.
264
+ # @since 0.5.0
265
+ #
266
+ def one
267
+ each.find { !_1.nil? }
268
+ end
269
+
244
270
  # Returns a quantile value.
245
271
  # - 0.5 quantile (median) is returned by default.
246
272
  # - Or return quantile for specified probability (prob).