red_amber 0.4.2 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.devcontainer/Dockerfile +75 -0
  3. data/.devcontainer/devcontainer.json +38 -0
  4. data/.devcontainer/onCreateCommand.sh +22 -0
  5. data/.rubocop.yml +11 -5
  6. data/CHANGELOG.md +141 -17
  7. data/Gemfile +5 -6
  8. data/README.ja.md +271 -0
  9. data/README.md +52 -31
  10. data/Rakefile +55 -0
  11. data/benchmark/group.yml +12 -5
  12. data/doc/Dev_Containers.ja.md +290 -0
  13. data/doc/Dev_Containers.md +292 -0
  14. data/doc/qmd/examples_of_red_amber.qmd +4596 -0
  15. data/doc/qmd/red-amber.qmd +90 -0
  16. data/docker/Dockerfile +2 -2
  17. data/docker/Gemfile +8 -3
  18. data/docker/docker-compose.yml +1 -1
  19. data/docker/readme.md +5 -5
  20. data/lib/red_amber/data_frame.rb +78 -4
  21. data/lib/red_amber/data_frame_combinable.rb +147 -119
  22. data/lib/red_amber/data_frame_displayable.rb +7 -6
  23. data/lib/red_amber/data_frame_loadsave.rb +1 -1
  24. data/lib/red_amber/data_frame_selectable.rb +51 -2
  25. data/lib/red_amber/data_frame_variable_operation.rb +6 -6
  26. data/lib/red_amber/group.rb +476 -127
  27. data/lib/red_amber/helper.rb +26 -0
  28. data/lib/red_amber/subframes.rb +18 -11
  29. data/lib/red_amber/vector.rb +45 -25
  30. data/lib/red_amber/vector_aggregation.rb +26 -0
  31. data/lib/red_amber/vector_selectable.rb +124 -40
  32. data/lib/red_amber/vector_string_function.rb +279 -0
  33. data/lib/red_amber/vector_unary_element_wise.rb +4 -0
  34. data/lib/red_amber/vector_updatable.rb +28 -0
  35. data/lib/red_amber/version.rb +1 -1
  36. data/lib/red_amber.rb +2 -1
  37. data/red_amber.gemspec +3 -3
  38. metadata +19 -14
  39. data/docker/Gemfile.lock +0 -80
  40. data/docker/example +0 -74
  41. data/docker/notebook/examples_of_red_amber.ipynb +0 -8562
  42. data/docker/notebook/red-amber.ipynb +0 -188
@@ -78,6 +78,32 @@ module RedAmber
78
78
  Array(range)
79
79
  end
80
80
  end
81
+
82
+ # Create sink node and execute plan
83
+ #
84
+ # @param plan [Arrow::ExecutePlan]
85
+ # Execute plan of Acero.
86
+ # @param node [Arrow::ExecuteNode]
87
+ # Execute node of Acero.
88
+ # @param output_schema [Arrow::Schema, nil]
89
+ # Schema of table to output. If it is nil, output_schema of
90
+ # sink node is used.
91
+ # @return [Arrow::Table]
92
+ # Result of plan.
93
+ # @since 0.5.0
94
+ #
95
+ def sink_and_start_plan(plan, node, output_schema: nil)
96
+ sink_node_options = Arrow::SinkNodeOptions.new
97
+ plan.build_sink_node(node, sink_node_options)
98
+ plan.validate
99
+ plan.start
100
+ plan.wait
101
+ output_schema = node.output_schema if output_schema.nil?
102
+ reader = sink_node_options.get_reader(output_schema)
103
+ table = reader.read_all
104
+ plan.stop
105
+ table
106
+ end
81
107
  end
82
108
 
83
109
  # rubocop:disable Layout/LineLength
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # class SubFrames treats a set of subsets of a DataFrame
4
+ # class SubFrames treats subsets of a DataFrame
5
5
  # [Experimental feature] Class SubFrames may be removed or be changed in the future.
6
6
  class SubFrames
7
7
  include Enumerable # may change to use Forwardable.
@@ -20,6 +20,7 @@ module RedAmber
20
20
  @sizes = []
21
21
  end
22
22
 
23
+ # Generic iterator method
23
24
  def each
24
25
  @selectors.each
25
26
  end
@@ -27,14 +28,20 @@ module RedAmber
27
28
 
28
29
  # Boolean selectors of sub-dataframes
29
30
  class Filters < Selectors
31
+ # Return sizes of filter
32
+ # @return [Array<Integer>]
33
+ # sizes of each sub dataframes.
34
+ # Counts true for each filter.
30
35
  def sizes
31
- # count true
32
36
  @sizes = @selectors.map { |s| s.to_a.count { _1 } } # rubocop:disable Performance/Size
33
37
  end
34
38
  end
35
39
 
36
40
  # Index selectors of sub-dataframes
37
41
  class Indices < Selectors
42
+ # Return sizes of selector indices.
43
+ # @return [Array<Integer>]
44
+ # sizes of each sub dataframes.
38
45
  def sizes
39
46
  @sizes = @selectors.map(&:size)
40
47
  end
@@ -93,7 +100,7 @@ module RedAmber
93
100
  # @since 0.4.0
94
101
  #
95
102
  def by_group(group)
96
- SubFrames.new(group.dataframe, group.filters)
103
+ SubFrames.by_filters(group.dataframe, group.filters)
97
104
  end
98
105
 
99
106
  # Create a new SubFrames object from a DataFrame and an array of indices.
@@ -291,15 +298,15 @@ module RedAmber
291
298
  selectors = yield(dataframe)
292
299
  end
293
300
 
294
- if dataframe.empty? || selectors.nil? || selectors.empty?
301
+ if dataframe.empty? || selectors.nil? || selectors.size.zero? # rubocop:disable Style/ZeroLengthPredicate
295
302
  @baseframe = DataFrame.new
296
303
  @selectors = Selectors.new([])
297
304
  else
298
305
  @baseframe = dataframe
299
306
  @selectors =
300
- if selectors[0].boolean?
307
+ if selectors.first.boolean?
301
308
  Filters.new(selectors)
302
- elsif selectors[0].numeric?
309
+ elsif selectors.first.numeric?
303
310
  Indices.new(selectors)
304
311
  else
305
312
  raise SubFramesArgumentError, "illegal type: #{selectors}"
@@ -427,7 +434,7 @@ module RedAmber
427
434
  # @return [DataFrame]
428
435
  # created DataFrame.
429
436
  # @example Aggregate by key labels in arguments and values from block.
430
- # subframes.aggregate(:y, :sum_x) { [y.first, x.sum] }
437
+ # subframes.aggregate(:y, :sum_x) { [y.one, x.sum] }
431
438
  #
432
439
  # # =>
433
440
  # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
@@ -438,7 +445,7 @@ module RedAmber
438
445
  # 2 C 6
439
446
  #
440
447
  # @example Aggregate by key labels in an Array and values from block.
441
- # subframes.aggregate([:y, :sum_x]) { [y.first, x.sum] }
448
+ # subframes.aggregate([:y, :sum_x]) { [y.one, x.sum] }
442
449
  #
443
450
  # # =>
444
451
  # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
@@ -450,7 +457,7 @@ module RedAmber
450
457
  #
451
458
  # @overload aggregate
452
459
  #
453
- # Aggregate SubFrames creating DataFrame with pairs of key and aggregated value
460
+ # Aggregate SubFrames creating DataFrame with pairs of key and aggregated values
454
461
  # in Hash from the block.
455
462
  #
456
463
  # @yieldparam dataframe [DataFrame]
@@ -463,7 +470,7 @@ module RedAmber
463
470
  # created DataFrame.
464
471
  # @example Aggregate by key and value pairs from block.
465
472
  # subframes.aggregate do
466
- # { y: y.first, sum_x: x.sum }
473
+ # { y: y.one, sum_x: x.sum }
467
474
  # end
468
475
  #
469
476
  # # =>
@@ -705,7 +712,7 @@ module RedAmber
705
712
  # @example
706
713
  # subframes.assign(:sum_x, :frac_x) do
707
714
  # group_sum = x.sum
708
- # [[group_sum] * size, x / s.to_f]
715
+ # [[group_sum] * size, x / group_sum.to_f]
709
716
  # end
710
717
  #
711
718
  # # =>
@@ -10,21 +10,54 @@ module RedAmber
10
10
  include ArrowFunction
11
11
  include VectorUpdatable
12
12
  include VectorSelectable
13
+ include VectorStringFunction
13
14
 
14
15
  using RefineArrayLike
15
16
 
16
- # Quicker constructor of Vector.
17
+ # Entity of Vector.
17
18
  #
18
- # @param arrow_array [Arrow::Array]
19
- # Arrow::Array object to have in the Vector.
20
- # @return [Vector]
21
- # created Vector.
22
- # @note This method doesn't check argment type.
19
+ # @return [Arrow::Array]
20
+ #
21
+ attr_reader :data
22
+ alias_method :to_arrow_array, :data
23
+
24
+ # Associated key name when self is in a DataFrame.
25
+ #
26
+ # Default Vector is 'head-less' (key-less).
27
+ # @return [Symbol]
23
28
  #
24
- def self.create(arrow_array)
25
- instance = allocate
26
- instance.instance_variable_set(:@data, arrow_array)
27
- instance
29
+ attr_accessor :key
30
+
31
+ class << self
32
+ # Create a Vector (calling `.new`).
33
+ #
34
+ # @param (see #initialize)
35
+ # @return (see #initialize)
36
+ # @example Create an empty Vector.
37
+ # Vector[]
38
+ # # =>
39
+ # #<RedAmber::Vector(:string, size=0):0x000000000000e2cc>
40
+ # []
41
+ #
42
+ # @since 0.5.0
43
+ #
44
+ def [](...)
45
+ new(...)
46
+ end
47
+
48
+ # Quicker constructor of Vector.
49
+ #
50
+ # @param arrow_array [Arrow::Array]
51
+ # Arrow::Array object to have in the Vector.
52
+ # @return [Vector]
53
+ # created Vector.
54
+ # @note This method doesn't check argument type.
55
+ #
56
+ def create(arrow_array)
57
+ instance = allocate
58
+ instance.instance_variable_set(:@data, arrow_array)
59
+ instance
60
+ end
28
61
  end
29
62
 
30
63
  # Create a Vector.
@@ -51,20 +84,6 @@ module RedAmber
51
84
  end
52
85
  end
53
86
 
54
- # Entity of Vector.
55
- #
56
- # @return [Arrow::Array]
57
- #
58
- attr_reader :data
59
- alias_method :to_arrow_array, :data
60
-
61
- # Associated key name when self is in a DataFrame.
62
- #
63
- # Default Vector is 'head-less' (key-less).
64
- # @return [Symbol]
65
- #
66
- attr_accessor :key
67
-
68
87
  # Return other as a Vector which is same data type as self.
69
88
  #
70
89
  # @param other [Vector, Array, Arrow::Array, Arrow::ChunkedArray]
@@ -161,7 +180,8 @@ module RedAmber
161
180
  end
162
181
  sio << ']'
163
182
 
164
- format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n",
183
+ chunked = chunked? ? ', chunked' : ''
184
+ format "#<#{self.class}(:#{type}, size=#{size}#{chunked}):0x%016x>\n%s\n",
165
185
  object_id, sio.string
166
186
  end
167
187
  end
@@ -161,6 +161,22 @@ module RedAmber
161
161
  #
162
162
  define_unary_aggregation :min_max
163
163
 
164
+ # Compute the 1 most common values and their respective
165
+ # occurrence counts.
166
+ #
167
+ # @note Self must be a numeric or a boolean Vector.
168
+ # @note ModeOptions are not supported in 0.5.0 .
169
+ # Only one mode value is returned.
170
+ # @api private
171
+ # @return [Hash{'mode'=>mode, 'count'=>count}]
172
+ # mode and count of self in a Hash.
173
+ # @since 0.5.0
174
+ #
175
+ def mode
176
+ datum = find(:mode).execute([data])
177
+ datum.value.to_a.first
178
+ end
179
+
164
180
  # Compute product value of self.
165
181
  #
166
182
  # @note Self must be a numeric Vector.
@@ -241,6 +257,16 @@ module RedAmber
241
257
  # - nearest: returns i or j, whichever is closer.
242
258
  # - midpoint: returns (i + j) / 2.
243
259
 
260
+ # Get a non-nil element in self.
261
+ #
262
+ # @return [Object, nil]
263
+ # first non-nil value detected. If all elements are nil, return nil.
264
+ # @since 0.5.0
265
+ #
266
+ def one
267
+ each.find { !_1.nil? }
268
+ end
269
+
244
270
  # Returns a quantile value.
245
271
  # - 0.5 quantile (median) is returned by default.
246
272
  # - Or return quantile for specified probability (prob).
@@ -153,10 +153,23 @@ module RedAmber
153
153
  # @param element
154
154
  # an element of self.
155
155
  # @return [integer, nil]
156
- # founded position of element. If it is not found, returns nil.
156
+ # position of element. If it is not found, returns nil.
157
157
  #
158
158
  def index(element)
159
- (0...size).find { |i| self[i] == element }
159
+ if element.nil?
160
+ datum = find(:is_null).execute([data])
161
+ value = Arrow::Scalar.resolve(true, :boolean)
162
+ else
163
+ datum = data
164
+ value = Arrow::Scalar.resolve(element, type)
165
+ end
166
+ datum = find(:index).execute([datum], value: value)
167
+ index = get_scalar(datum)
168
+ if index.negative?
169
+ nil
170
+ else
171
+ index
172
+ end
160
173
  end
161
174
 
162
175
  # Returns first element of self.
@@ -229,55 +242,118 @@ module RedAmber
229
242
  take(sort_indices(order: order))
230
243
  end
231
244
 
232
- # Returns numerical rank of self.
245
+ # Returns 1-based numerical rank of self.
233
246
  # - Nil values are considered greater than any value.
234
247
  # - NaN values are considered greater than any value but smaller than nil values.
235
- # - Tiebreakers are ranked in order of appearance.
236
- # - `RankOptions` in C++ function is not implemented in C GLib yet.
237
- # This method is currently fixed to the default behavior.
238
- #
248
+ # - Order of each element is considered as ascending by default. It is
249
+ # changeable by the parameter `order = :descending`.
250
+ # - Tiebreakers are ranked in order of appearance by default or
251
+ # with `tie: :first` option.
252
+ # - Null values (nil and NaN) are placed at end by default.
253
+ # This behavior can be changed by the option `null_placement: :at_start`.
254
+ #
255
+ # @param order [:ascending, '+', :descending, '-']
256
+ # the order of the elements should be ranked in.
257
+ # - :ascending or '+' : rank is computed in ascending order.
258
+ # - :descending or '-' : rank is computed in descending order.
259
+ # @param tie [:first, :min, :max, :dense]
260
+ # configure how ties between equal values are handled.
261
+ # - first: Ranks are assigned in order of when ties appear in the input.
262
+ # - min: Ties get the smallest possible rank in the sorted order.
263
+ # - max: Ties get the largest possible rank in the sorted order.
264
+ # - dense: The ranks span a dense [1, M] interval where M is the number
265
+ # of distinct values in the input.
266
+ # @param null_placement [:at_end, :at_start]
267
+ # configure the position of nulls to be located.
268
+ # Nulls are considered as `NaN < nil`.
239
269
  # @return [Vector]
240
- # 0-based rank of self (0...size in range).
270
+ # 1-based rank in uint64 of self (1..size in range) at maximum.
241
271
  # @example Rank of float Vector
242
- # fv = Vector.new(0.1, nil, Float::NAN, 0.2, 0.1); fv
272
+ # float = Vector[1, 0, nil, Float::NAN, Float::INFINITY, -Float::INFINITY, 3, 2]
273
+ # float
243
274
  #
244
275
  # # =>
245
- # #<RedAmber::Vector(:double, size=5):0x000000000000c65c>
246
- # [0.1, nil, NaN, 0.2, 0.1]
276
+ # #<RedAmber::Vector(:double, size=8):0x0000000000036858>
277
+ # [1.0, 0.0, nil, NaN, Infinity, -Infinity, 3.0, 2.0]
247
278
  #
248
- # fv.rank
279
+ # float.rank
280
+ # # or float.rank(:ascending, tie: :first, null_placement: :at_end)
249
281
  #
250
282
  # # =>
251
- # #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
252
- # [0, 4, 3, 2, 1]
283
+ # #<RedAmber::Vector(:uint64, size=8):0x000000000003af84>
284
+ # [3, 2, 8, 7, 6, 1, 5, 4]
253
285
  #
254
286
  # @example Rank of string Vector
255
- # sv = Vector.new("A", "B", nil, "A", "C"); sv
287
+ # string = Vector["A", "A", nil, nil, "C", "B"]
288
+ # string
289
+ #
290
+ # # =>
291
+ # #<RedAmber::Vector(:string, size=6):0x000000000003d568>
292
+ # ["A", "A", nil, nil, "C", "B"]
293
+ #
294
+ # string.rank
295
+ #
296
+ # # =>
297
+ # #<RedAmber::Vector(:uint64, size=6):0x0000000000049bc4>
298
+ # [1, 2, 5, 6, 4, 3]
299
+ #
300
+ # @example Rank with order = :descending
301
+ # float.rank(:descending) # or float.rank('-')
302
+ #
303
+ # # =>
304
+ # #<RedAmber::Vector(:uint64, size=8):0x000000000006ef00>
305
+ # [4, 5, 8, 7, 1, 6, 2, 3]
306
+ #
307
+ # @example Rank with tie: :min
308
+ # string.rank(tie: :min)
309
+ #
310
+ # # =>
311
+ # #<RedAmber::Vector(:uint64, size=6):0x000000000007a1d4>
312
+ # [1, 1, 5, 5, 4, 3]
313
+ #
314
+ # @example Rank with tie: :max
315
+ # string.rank(tie: :max)
316
+ #
317
+ # # =>
318
+ # #<RedAmber::Vector(:uint64, size=6):0x000000000007cba0>
319
+ # [2, 2, 6, 6, 4, 3]
320
+ #
321
+ # @example Rank with tie: :dense
322
+ # string.rank(tie: :dense)
256
323
  #
257
324
  # # =>
258
- # #<RedAmber::Vector(:string, size=5):0x0000000000003854>
259
- # ["A", "B", nil, "A", "C"]
325
+ # #<RedAmber::Vector(:uint64, size=6):0x0000000000080930>
326
+ # [1, 1, 4, 4, 3, 2]
260
327
  #
261
- # sv.rank
328
+ # @example Rank with null_placement: :at_start
329
+ # float.rank(null_placement: :at_start)
262
330
  #
263
331
  # # =>
264
- # #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
265
- # [0, 2, 4, 1, 3]
332
+ # #<RedAmber::Vector(:uint64, size=8):0x0000000000082104>
333
+ # [5, 4, 1, 2, 8, 3, 7, 6]
266
334
  #
267
335
  # @since 0.4.0
268
336
  #
269
- def rank
270
- datum =
271
- case data
272
- when Arrow::ChunkedArray
273
- Arrow::Function.find(:rank).execute([data.pack])
337
+ def rank(order = :ascending, tie: :first, null_placement: :at_end)
338
+ func = find(:rank)
339
+ options = func.default_options
340
+ order =
341
+ case order.to_sym
342
+ when :+, :ascending, :increasing
343
+ :ascending
344
+ when :-, :descending, :decreasing
345
+ :descending
274
346
  else
275
- Arrow::Function.find(:rank).execute([data])
347
+ raise VectorArgumentError, "illegal order option: #{order}"
276
348
  end
277
- Vector.create(datum.value) - 1
349
+ options.sort_keys = [Arrow::SortKey.resolve('', order)]
350
+ options.tiebreaker = tie
351
+ options.null_placement = null_placement
352
+ Vector.create(func.execute([data], options).value)
278
353
  end
279
354
 
280
355
  # Pick up elements at random.
356
+ # @note This method requires 'arrow-numo-narray' gem.
281
357
  #
282
358
  # @overload sample()
283
359
  # Return a randomly selected element.
@@ -298,12 +374,12 @@ module RedAmber
298
374
  # "C"
299
375
  #
300
376
  # @overload sample(n)
301
- # Pick up n elements at random.
377
+ # Select n elements at random.
302
378
  #
303
379
  # @param n [Integer]
304
- # positive number of elements to pick.
305
- # If n is smaller or equal to size, elements are picked by non-repeating.
306
- # If n is greater than `size`, elements are picked repeatedly.
380
+ # positive number of elements to select.
381
+ # If n is smaller or equal to size, elements are selected by non-repeating.
382
+ # If n is greater than `size`, elements are selected repeatedly.
307
383
  # @return [Vector]
308
384
  # sampled elements.
309
385
  # If n == 1 (in case of `sample(1)`), it returns a Vector of size == 1
@@ -315,7 +391,7 @@ module RedAmber
315
391
  # #<RedAmber::Vector(:string, size=1):0x000000000001a3b0>
316
392
  # ["H"]
317
393
  #
318
- # @example Sample same size of self: every element is picked in random order
394
+ # @example Sample same size of self: every element is selected in random order
319
395
  # v.sample(8)
320
396
  #
321
397
  # # =>
@@ -330,18 +406,18 @@ module RedAmber
330
406
  # ["E", "E", "A", "D", "H", "C", "A", "F", "H"]
331
407
  #
332
408
  # @overload sample(prop)
333
- # Pick up elements by proportion `prop` at random.
409
+ # Select elements by proportion `prop` at random.
334
410
  #
335
411
  # @param prop [Float]
336
- # positive proportion of elements to pick.
337
- # Absolute number of elements to pick:`prop*size` is rounded (by `half: :up``).
338
- # If prop is smaller or equal to 1.0, elements are picked by non-repeating.
339
- # If prop is greater than 1.0, some elements are picked repeatedly.
412
+ # positive proportion of elements to select.
413
+ # Absolute number of elements to select: `prop*size` is truncated toward zero.
414
+ # If prop is smaller or equal to 1.0, elements are selected by non-repeating.
415
+ # If prop is greater than 1.0, some elements are selected repeatedly.
340
416
  # @return [Vector]
341
417
  # sampled elements.
342
- # If picked element is only one, it returns a Vector of size == 1
418
+ # If selected element is only one, it returns a Vector of size == 1
343
419
  # not a scalar.
344
- # @example Sample same size of self: every element is picked in random order
420
+ # @example Sample same size of self: every element is selected in random order
345
421
  # v.sample(1.0)
346
422
  #
347
423
  # # =>
@@ -355,6 +431,14 @@ module RedAmber
355
431
  # #<RedAmber::Vector(:string, size=16):0x00000000000233e8>
356
432
  # ["H", "B", "C", "B", "C", "A", "F", "A", "E", "C", "H", "F", "F", "A", ... ]
357
433
  #
434
+ # @example prop less than 1.0
435
+ # v.sample(0.7)
436
+ #
437
+ # # =>
438
+ # # Take (8 * 0.7).truncate => 5 samples
439
+ # #<RedAmber::Vector(:string, size=5):0x000000000001afe0>
440
+ # ["C", "A", "E", "H", "D"]
441
+ #
358
442
  # @since 0.4.0
359
443
  #
360
444
  def sample(n_or_prop = nil)
@@ -367,7 +451,7 @@ module RedAmber
367
451
  in Integer
368
452
  n_or_prop
369
453
  in Float
370
- (n_or_prop * size).round
454
+ (n_or_prop * size).truncate
371
455
  in nil
372
456
  return to_a.sample
373
457
  else