red_amber 0.4.2 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.devcontainer/Dockerfile +75 -0
  3. data/.devcontainer/devcontainer.json +38 -0
  4. data/.devcontainer/onCreateCommand.sh +22 -0
  5. data/.rubocop.yml +11 -5
  6. data/CHANGELOG.md +141 -17
  7. data/Gemfile +5 -6
  8. data/README.ja.md +271 -0
  9. data/README.md +52 -31
  10. data/Rakefile +55 -0
  11. data/benchmark/group.yml +12 -5
  12. data/doc/Dev_Containers.ja.md +290 -0
  13. data/doc/Dev_Containers.md +292 -0
  14. data/doc/qmd/examples_of_red_amber.qmd +4596 -0
  15. data/doc/qmd/red-amber.qmd +90 -0
  16. data/docker/Dockerfile +2 -2
  17. data/docker/Gemfile +8 -3
  18. data/docker/docker-compose.yml +1 -1
  19. data/docker/readme.md +5 -5
  20. data/lib/red_amber/data_frame.rb +78 -4
  21. data/lib/red_amber/data_frame_combinable.rb +147 -119
  22. data/lib/red_amber/data_frame_displayable.rb +7 -6
  23. data/lib/red_amber/data_frame_loadsave.rb +1 -1
  24. data/lib/red_amber/data_frame_selectable.rb +51 -2
  25. data/lib/red_amber/data_frame_variable_operation.rb +6 -6
  26. data/lib/red_amber/group.rb +476 -127
  27. data/lib/red_amber/helper.rb +26 -0
  28. data/lib/red_amber/subframes.rb +18 -11
  29. data/lib/red_amber/vector.rb +45 -25
  30. data/lib/red_amber/vector_aggregation.rb +26 -0
  31. data/lib/red_amber/vector_selectable.rb +124 -40
  32. data/lib/red_amber/vector_string_function.rb +279 -0
  33. data/lib/red_amber/vector_unary_element_wise.rb +4 -0
  34. data/lib/red_amber/vector_updatable.rb +28 -0
  35. data/lib/red_amber/version.rb +1 -1
  36. data/lib/red_amber.rb +2 -1
  37. data/red_amber.gemspec +3 -3
  38. metadata +19 -14
  39. data/docker/Gemfile.lock +0 -80
  40. data/docker/example +0 -74
  41. data/docker/notebook/examples_of_red_amber.ipynb +0 -8562
  42. data/docker/notebook/red-amber.ipynb +0 -188
@@ -78,6 +78,32 @@ module RedAmber
78
78
  Array(range)
79
79
  end
80
80
  end
81
+
82
+ # Create sink node and execute plan
83
+ #
84
+ # @param plan [Arrow::ExecutePlan]
85
+ # Execute plan of Acero.
86
+ # @param node [Arrow::ExecuteNode]
87
+ # Execute node of Acero.
88
+ # @param output_schema [Arrow::Schema, nil]
89
+ # Schema of table to output. If it is nil, output_schema of
90
+ # sink node is used.
91
+ # @return [Arrow::Table]
92
+ # Result of plan.
93
+ # @since 0.5.0
94
+ #
95
+ def sink_and_start_plan(plan, node, output_schema: nil)
96
+ sink_node_options = Arrow::SinkNodeOptions.new
97
+ plan.build_sink_node(node, sink_node_options)
98
+ plan.validate
99
+ plan.start
100
+ plan.wait
101
+ output_schema = node.output_schema if output_schema.nil?
102
+ reader = sink_node_options.get_reader(output_schema)
103
+ table = reader.read_all
104
+ plan.stop
105
+ table
106
+ end
81
107
  end
82
108
 
83
109
  # rubocop:disable Layout/LineLength
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # class SubFrames treats a set of subsets of a DataFrame
4
+ # class SubFrames treats subsets of a DataFrame
5
5
  # [Experimental feature] Class SubFrames may be removed or be changed in the future.
6
6
  class SubFrames
7
7
  include Enumerable # may change to use Forwardable.
@@ -20,6 +20,7 @@ module RedAmber
20
20
  @sizes = []
21
21
  end
22
22
 
23
+ # Generic iterator method
23
24
  def each
24
25
  @selectors.each
25
26
  end
@@ -27,14 +28,20 @@ module RedAmber
27
28
 
28
29
  # Boolean selectors of sub-dataframes
29
30
  class Filters < Selectors
31
+ # Return sizes of filter
32
+ # @return [Array<Integer>]
33
+ # sizes of the sub-dataframes.
34
+ # Counts true for each filter.
30
35
  def sizes
31
- # count true
32
36
  @sizes = @selectors.map { |s| s.to_a.count { _1 } } # rubocop:disable Performance/Size
33
37
  end
34
38
  end
35
39
 
36
40
  # Index selectors of sub-dataframes
37
41
  class Indices < Selectors
42
+ # Return sizes of selector indices.
43
+ # @return [Array<Integer>]
44
+ # sizes of the sub-dataframes.
38
45
  def sizes
39
46
  @sizes = @selectors.map(&:size)
40
47
  end
@@ -93,7 +100,7 @@ module RedAmber
93
100
  # @since 0.4.0
94
101
  #
95
102
  def by_group(group)
96
- SubFrames.new(group.dataframe, group.filters)
103
+ SubFrames.by_filters(group.dataframe, group.filters)
97
104
  end
98
105
 
99
106
  # Create a new SubFrames object from a DataFrame and an array of indices.
@@ -291,15 +298,15 @@ module RedAmber
291
298
  selectors = yield(dataframe)
292
299
  end
293
300
 
294
- if dataframe.empty? || selectors.nil? || selectors.empty?
301
+ if dataframe.empty? || selectors.nil? || selectors.size.zero? # rubocop:disable Style/ZeroLengthPredicate
295
302
  @baseframe = DataFrame.new
296
303
  @selectors = Selectors.new([])
297
304
  else
298
305
  @baseframe = dataframe
299
306
  @selectors =
300
- if selectors[0].boolean?
307
+ if selectors.first.boolean?
301
308
  Filters.new(selectors)
302
- elsif selectors[0].numeric?
309
+ elsif selectors.first.numeric?
303
310
  Indices.new(selectors)
304
311
  else
305
312
  raise SubFramesArgumentError, "illegal type: #{selectors}"
@@ -427,7 +434,7 @@ module RedAmber
427
434
  # @return [DataFrame]
428
435
  # created DataFrame.
429
436
  # @example Aggregate by key labels in arguments and values from block.
430
- # subframes.aggregate(:y, :sum_x) { [y.first, x.sum] }
437
+ # subframes.aggregate(:y, :sum_x) { [y.one, x.sum] }
431
438
  #
432
439
  # # =>
433
440
  # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
@@ -438,7 +445,7 @@ module RedAmber
438
445
  # 2 C 6
439
446
  #
440
447
  # @example Aggregate by key labels in an Array and values from block.
441
- # subframes.aggregate([:y, :sum_x]) { [y.first, x.sum] }
448
+ # subframes.aggregate([:y, :sum_x]) { [y.one, x.sum] }
442
449
  #
443
450
  # # =>
444
451
  # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
@@ -450,7 +457,7 @@ module RedAmber
450
457
  #
451
458
  # @overload aggregate
452
459
  #
453
- # Aggregate SubFrames creating DataFrame with pairs of key and aggregated value
460
+ # Aggregate SubFrames creating DataFrame with pairs of key and aggregated values
454
461
  # in Hash from the block.
455
462
  #
456
463
  # @yieldparam dataframe [DataFrame]
@@ -463,7 +470,7 @@ module RedAmber
463
470
  # created DataFrame.
464
471
  # @example Aggregate by key and value pairs from block.
465
472
  # subframes.aggregate do
466
- # { y: y.first, sum_x: x.sum }
473
+ # { y: y.one, sum_x: x.sum }
467
474
  # end
468
475
  #
469
476
  # # =>
@@ -705,7 +712,7 @@ module RedAmber
705
712
  # @example
706
713
  # subframes.assign(:sum_x, :frac_x) do
707
714
  # group_sum = x.sum
708
- # [[group_sum] * size, x / s.to_f]
715
+ # [[group_sum] * size, x / group_sum.to_f]
709
716
  # end
710
717
  #
711
718
  # # =>
@@ -10,21 +10,54 @@ module RedAmber
10
10
  include ArrowFunction
11
11
  include VectorUpdatable
12
12
  include VectorSelectable
13
+ include VectorStringFunction
13
14
 
14
15
  using RefineArrayLike
15
16
 
16
- # Quicker constructor of Vector.
17
+ # Entity of Vector.
17
18
  #
18
- # @param arrow_array [Arrow::Array]
19
- # Arrow::Array object to have in the Vector.
20
- # @return [Vector]
21
- # created Vector.
22
- # @note This method doesn't check argment type.
19
+ # @return [Arrow::Array]
20
+ #
21
+ attr_reader :data
22
+ alias_method :to_arrow_array, :data
23
+
24
+ # Associated key name when self is in a DataFrame.
25
+ #
26
+ # Default Vector is 'head-less' (key-less).
27
+ # @return [Symbol]
23
28
  #
24
- def self.create(arrow_array)
25
- instance = allocate
26
- instance.instance_variable_set(:@data, arrow_array)
27
- instance
29
+ attr_accessor :key
30
+
31
+ class << self
32
+ # Create a Vector (calling `.new`).
33
+ #
34
+ # @param (see #initialize)
35
+ # @return (see #initialize)
36
+ # @example Create an empty Vector.
37
+ # Vector[]
38
+ # # =>
39
+ # #<RedAmber::Vector(:string, size=0):0x000000000000e2cc>
40
+ # []
41
+ #
42
+ # @since 0.5.0
43
+ #
44
+ def [](...)
45
+ new(...)
46
+ end
47
+
48
+ # Quicker constructor of Vector.
49
+ #
50
+ # @param arrow_array [Arrow::Array]
51
+ # Arrow::Array object to have in the Vector.
52
+ # @return [Vector]
53
+ # created Vector.
54
+ # @note This method doesn't check argument type.
55
+ #
56
+ def create(arrow_array)
57
+ instance = allocate
58
+ instance.instance_variable_set(:@data, arrow_array)
59
+ instance
60
+ end
28
61
  end
29
62
 
30
63
  # Create a Vector.
@@ -51,20 +84,6 @@ module RedAmber
51
84
  end
52
85
  end
53
86
 
54
- # Entity of Vector.
55
- #
56
- # @return [Arrow::Array]
57
- #
58
- attr_reader :data
59
- alias_method :to_arrow_array, :data
60
-
61
- # Associated key name when self is in a DataFrame.
62
- #
63
- # Default Vector is 'head-less' (key-less).
64
- # @return [Symbol]
65
- #
66
- attr_accessor :key
67
-
68
87
  # Return other as a Vector which is same data type as self.
69
88
  #
70
89
  # @param other [Vector, Array, Arrow::Array, Arrow::ChunkedArray]
@@ -161,7 +180,8 @@ module RedAmber
161
180
  end
162
181
  sio << ']'
163
182
 
164
- format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n",
183
+ chunked = chunked? ? ', chunked' : ''
184
+ format "#<#{self.class}(:#{type}, size=#{size}#{chunked}):0x%016x>\n%s\n",
165
185
  object_id, sio.string
166
186
  end
167
187
  end
@@ -161,6 +161,22 @@ module RedAmber
161
161
  #
162
162
  define_unary_aggregation :min_max
163
163
 
164
+ # Compute the 1 most common values and their respective
165
+ # occurrence counts.
166
+ #
167
+ # @note Self must be a numeric or a boolean Vector.
168
+ # @note ModeOptions are not supported in 0.5.0.
169
+ # Only one mode value is returned.
170
+ # @api private
171
+ # @return [Hash{'mode'=>mode, 'count'=>count}]
172
+ # mode and count of self in a Hash.
173
+ # @since 0.5.0
174
+ #
175
+ def mode
176
+ datum = find(:mode).execute([data])
177
+ datum.value.to_a.first
178
+ end
179
+
164
180
  # Compute product value of self.
165
181
  #
166
182
  # @note Self must be a numeric Vector.
@@ -241,6 +257,16 @@ module RedAmber
241
257
  # - nearest: returns i or j, whichever is closer.
242
258
  # - midpoint: returns (i + j) / 2.
243
259
 
260
+ # Get a non-nil element in self.
261
+ #
262
+ # @return [Object, nil]
263
+ # first non-nil value detected. If all elements are nil, return nil.
264
+ # @since 0.5.0
265
+ #
266
+ def one
267
+ each.find { !_1.nil? }
268
+ end
269
+
244
270
  # Returns a quantile value.
245
271
  # - 0.5 quantile (median) is returned by default.
246
272
  # - Or return quantile for specified probability (prob).
@@ -153,10 +153,23 @@ module RedAmber
153
153
  # @param element
154
154
  # an element of self.
155
155
  # @return [Integer, nil]
156
- # founded position of element. If it is not found, returns nil.
156
+ # position of element. If it is not found, returns nil.
157
157
  #
158
158
  def index(element)
159
- (0...size).find { |i| self[i] == element }
159
+ if element.nil?
160
+ datum = find(:is_null).execute([data])
161
+ value = Arrow::Scalar.resolve(true, :boolean)
162
+ else
163
+ datum = data
164
+ value = Arrow::Scalar.resolve(element, type)
165
+ end
166
+ datum = find(:index).execute([datum], value: value)
167
+ index = get_scalar(datum)
168
+ if index.negative?
169
+ nil
170
+ else
171
+ index
172
+ end
160
173
  end
161
174
 
162
175
  # Returns first element of self.
@@ -229,55 +242,118 @@ module RedAmber
229
242
  take(sort_indices(order: order))
230
243
  end
231
244
 
232
- # Returns numerical rank of self.
245
+ # Returns 1-based numerical rank of self.
233
246
  # - Nil values are considered greater than any value.
234
247
  # - NaN values are considered greater than any value but smaller than nil values.
235
- # - Tiebreakers are ranked in order of appearance.
236
- # - `RankOptions` in C++ function is not implemented in C GLib yet.
237
- # This method is currently fixed to the default behavior.
238
- #
248
+ # - Order of each element is considered as ascending by default. It is
249
+ # changeable by the parameter `order = :descending`.
250
+ # - Tiebreakers are ranked in order of appearance by default or
251
+ # with `tie: :first` option.
252
+ # - Null values (nil and NaN) are placed at end by default.
253
+ # This behavior can be changed by the option `null_placement: :at_start`.
254
+ #
255
+ # @param order [:ascending, '+', :descending, '-']
256
+ # the order in which the elements should be ranked.
257
+ # - :ascending or '+' : rank is computed in ascending order.
258
+ # - :descending or '-' : rank is computed in descending order.
259
+ # @param tie [:first, :min, :max, :dense]
260
+ # configure how ties between equal values are handled.
261
+ # - first: Ranks are assigned in order of when ties appear in the input.
262
+ # - min: Ties get the smallest possible rank in the sorted order.
263
+ # - max: Ties get the largest possible rank in the sorted order.
264
+ # - dense: The ranks span a dense [1, M] interval where M is the number
265
+ # of distinct values in the input.
266
+ # @param null_placement [:at_end, :at_start]
267
+ # configure the position of nulls to be located.
268
+ # Nulls are considered as `NaN < nil`.
239
269
  # @return [Vector]
240
- # 0-based rank of self (0...size in range).
270
+ # 1-based rank in uint64 of self (1..size in range) at maximum.
241
271
  # @example Rank of float Vector
242
- # fv = Vector.new(0.1, nil, Float::NAN, 0.2, 0.1); fv
272
+ # float = Vector[1, 0, nil, Float::NAN, Float::INFINITY, -Float::INFINITY, 3, 2]
273
+ # float
243
274
  #
244
275
  # # =>
245
- # #<RedAmber::Vector(:double, size=5):0x000000000000c65c>
246
- # [0.1, nil, NaN, 0.2, 0.1]
276
+ # #<RedAmber::Vector(:double, size=8):0x0000000000036858>
277
+ # [1.0, 0.0, nil, NaN, Infinity, -Infinity, 3.0, 2.0]
247
278
  #
248
- # fv.rank
279
+ # float.rank
280
+ # # or float.rank(:ascending, tie: :first, null_placement: :at_end)
249
281
  #
250
282
  # # =>
251
- # #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
252
- # [0, 4, 3, 2, 1]
283
+ # #<RedAmber::Vector(:uint64, size=8):0x000000000003af84>
284
+ # [3, 2, 8, 7, 6, 1, 5, 4]
253
285
  #
254
286
  # @example Rank of string Vector
255
- # sv = Vector.new("A", "B", nil, "A", "C"); sv
287
+ # string = Vector["A", "A", nil, nil, "C", "B"]
288
+ # string
289
+ #
290
+ # # =>
291
+ # #<RedAmber::Vector(:string, size=6):0x000000000003d568>
292
+ # ["A", "A", nil, nil, "C", "B"]
293
+ #
294
+ # string.rank
295
+ #
296
+ # # =>
297
+ # #<RedAmber::Vector(:uint64, size=6):0x0000000000049bc4>
298
+ # [1, 2, 5, 6, 4, 3]
299
+ #
300
+ # @example Rank with order = :descending
301
+ # float.rank(:descending) # or float.rank('-')
302
+ #
303
+ # # =>
304
+ # #<RedAmber::Vector(:uint64, size=8):0x000000000006ef00>
305
+ # [4, 5, 8, 7, 1, 6, 2, 3]
306
+ #
307
+ # @example Rank with tie: :min
308
+ # string.rank(tie: :min)
309
+ #
310
+ # # =>
311
+ # #<RedAmber::Vector(:uint64, size=6):0x000000000007a1d4>
312
+ # [1, 1, 5, 5, 4, 3]
313
+ #
314
+ # @example Rank with tie: :max
315
+ # string.rank(tie: :max)
316
+ #
317
+ # # =>
318
+ # #<RedAmber::Vector(:uint64, size=6):0x000000000007cba0>
319
+ # [2, 2, 6, 6, 4, 3]
320
+ #
321
+ # @example Rank with tie: :dense
322
+ # string.rank(tie: :dense)
256
323
  #
257
324
  # # =>
258
- # #<RedAmber::Vector(:string, size=5):0x0000000000003854>
259
- # ["A", "B", nil, "A", "C"]
325
+ # #<RedAmber::Vector(:uint64, size=6):0x0000000000080930>
326
+ # [1, 1, 4, 4, 3, 2]
260
327
  #
261
- # sv.rank
328
+ # @example Rank with null_placement: :at_start
329
+ # float.rank(null_placement: :at_start)
262
330
  #
263
331
  # # =>
264
- # #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
265
- # [0, 2, 4, 1, 3]
332
+ # #<RedAmber::Vector(:uint64, size=8):0x0000000000082104>
333
+ # [5, 4, 1, 2, 8, 3, 7, 6]
266
334
  #
267
335
  # @since 0.4.0
268
336
  #
269
- def rank
270
- datum =
271
- case data
272
- when Arrow::ChunkedArray
273
- Arrow::Function.find(:rank).execute([data.pack])
337
+ def rank(order = :ascending, tie: :first, null_placement: :at_end)
338
+ func = find(:rank)
339
+ options = func.default_options
340
+ order =
341
+ case order.to_sym
342
+ when :+, :ascending, :increasing
343
+ :ascending
344
+ when :-, :descending, :decreasing
345
+ :descending
274
346
  else
275
- Arrow::Function.find(:rank).execute([data])
347
+ raise VectorArgumentError, "illegal order option: #{order}"
276
348
  end
277
- Vector.create(datum.value) - 1
349
+ options.sort_keys = [Arrow::SortKey.resolve('', order)]
350
+ options.tiebreaker = tie
351
+ options.null_placement = null_placement
352
+ Vector.create(func.execute([data], options).value)
278
353
  end
279
354
 
280
355
  # Pick up elements at random.
356
+ # @note This method requires 'arrow-numo-narray' gem.
281
357
  #
282
358
  # @overload sample()
283
359
  # Return a randomly selected element.
@@ -298,12 +374,12 @@ module RedAmber
298
374
  # "C"
299
375
  #
300
376
  # @overload sample(n)
301
- # Pick up n elements at random.
377
+ # Select n elements at random.
302
378
  #
303
379
  # @param n [Integer]
304
- # positive number of elements to pick.
305
- # If n is smaller or equal to size, elements are picked by non-repeating.
306
- # If n is greater than `size`, elements are picked repeatedly.
380
+ # positive number of elements to select.
381
+ # If n is smaller or equal to size, elements are selected by non-repeating.
382
+ # If n is greater than `size`, elements are selected repeatedly.
307
383
  # @return [Vector]
308
384
  # sampled elements.
309
385
  # If n == 1 (in case of `sample(1)`), it returns a Vector of size == 1
@@ -315,7 +391,7 @@ module RedAmber
315
391
  # #<RedAmber::Vector(:string, size=1):0x000000000001a3b0>
316
392
  # ["H"]
317
393
  #
318
- # @example Sample same size of self: every element is picked in random order
394
+ # @example Sample same size of self: every element is selected in random order
319
395
  # v.sample(8)
320
396
  #
321
397
  # # =>
@@ -330,18 +406,18 @@ module RedAmber
330
406
  # ["E", "E", "A", "D", "H", "C", "A", "F", "H"]
331
407
  #
332
408
  # @overload sample(prop)
333
- # Pick up elements by proportion `prop` at random.
409
+ # Select elements by proportion `prop` at random.
334
410
  #
335
411
  # @param prop [Float]
336
- # positive proportion of elements to pick.
337
- # Absolute number of elements to pick:`prop*size` is rounded (by `half: :up``).
338
- # If prop is smaller or equal to 1.0, elements are picked by non-repeating.
339
- # If prop is greater than 1.0, some elements are picked repeatedly.
412
+ # positive proportion of elements to select.
413
+ # Absolute number of elements to select: `prop*size` is truncated toward zero.
414
+ # If prop is smaller or equal to 1.0, elements are selected by non-repeating.
415
+ # If prop is greater than 1.0, some elements are selected repeatedly.
340
416
  # @return [Vector]
341
417
  # sampled elements.
342
- # If picked element is only one, it returns a Vector of size == 1
418
+ # If selected element is only one, it returns a Vector of size == 1
343
419
  # not a scalar.
344
- # @example Sample same size of self: every element is picked in random order
420
+ # @example Sample same size of self: every element is selected in random order
345
421
  # v.sample(1.0)
346
422
  #
347
423
  # # =>
@@ -355,6 +431,14 @@ module RedAmber
355
431
  # #<RedAmber::Vector(:string, size=16):0x00000000000233e8>
356
432
  # ["H", "B", "C", "B", "C", "A", "F", "A", "E", "C", "H", "F", "F", "A", ... ]
357
433
  #
434
+ # @example prop less than 1.0
435
+ # v.sample(0.7)
436
+ #
437
+ # # =>
438
+ # # Take (8 * 0.7).truncate => 5 samples
439
+ # #<RedAmber::Vector(:string, size=5):0x000000000001afe0>
440
+ # ["C", "A", "E", "H", "D"]
441
+ #
358
442
  # @since 0.4.0
359
443
  #
360
444
  def sample(n_or_prop = nil)
@@ -367,7 +451,7 @@ module RedAmber
367
451
  in Integer
368
452
  n_or_prop
369
453
  in Float
370
- (n_or_prop * size).round
454
+ (n_or_prop * size).truncate
371
455
  in nil
372
456
  return to_a.sample
373
457
  else