red_amber 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -82,7 +82,7 @@ module RedAmber
82
82
  enum =
83
83
  Enumerator.new(subset_indices.size) do |y|
84
84
  subset_indices.each do |i|
85
- y.yield dataframe.take(i)
85
+ y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.take(i))
86
86
  end
87
87
  end
88
88
  instance.instance_variable_set(:@enum, enum)
@@ -108,7 +108,7 @@ module RedAmber
108
108
  enum =
109
109
  Enumerator.new(subset_filters.size) do |y|
110
110
  subset_filters.each do |i|
111
- y.yield dataframe.filter(i)
111
+ y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.filter(i))
112
112
  end
113
113
  end
114
114
  instance.instance_variable_set(:@enum, enum)
@@ -139,7 +139,7 @@ module RedAmber
139
139
  y.yield i
140
140
  end
141
141
  end
142
- instance.instance_variable_set(:@baseframe, enum.reduce(&:concatenate))
142
+ instance.instance_variable_set(:@baseframe, enum.lazy)
143
143
  end
144
144
  instance.instance_variable_set(:@enum, enum)
145
145
  instance
@@ -160,11 +160,13 @@ module RedAmber
160
160
  # @return [SubFrames]
161
161
  # a new SubFrames.
162
162
  #
163
+ # @since 0.4.0
164
+ #
163
165
  def define_subframable_method(method)
164
166
  define_method(method) do |&block|
165
167
  return enum_for(:each) { size } unless block # rubocop:disable Lint/ToEnumArguments
166
168
 
167
- self.class.by_dataframes(super(&block))
169
+ SubFrames.by_dataframes(super(&block))
168
170
  end
169
171
  end
170
172
  end
@@ -195,25 +197,31 @@ module RedAmber
195
197
  # 4 5 B true
196
198
  # 5 6 C false
197
199
  #
198
- # SubFrames.new(dataframe, [[0, 2, 3], [4, 1]])
200
+ # # --- This object is used as common source in this class ---
201
+ # subframes = SubFrames.new(dataframe, [[0, 1], [2, 3, 4], [5]])
199
202
  #
200
203
  # # =>
201
- # #<RedAmber::SubFrames : 0x0000000000003a34>
202
- # @baseframe=#<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000003a48>
203
- # 2 SubFrames: [3, 2] in sizes.
204
+ # #<RedAmber::SubFrames : 0x000000000000cf6c>
205
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000cf80>
206
+ # 3 SubFrames: [2, 3, 1] in sizes.
204
207
  # ---
205
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003a5c>
208
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000cf94>
206
209
  # x y z
207
210
  # <uint8> <string> <boolean>
208
211
  # 0 1 A false
209
- # 1 3 B false
210
- # 2 4 B (nil)
212
+ # 1 2 A true
211
213
  # ---
212
- # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003a70>
214
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000cfa8>
213
215
  # x y z
214
216
  # <uint8> <string> <boolean>
215
- # 0 5 B true
216
- # 1 2 A true
217
+ # 0 3 B false
218
+ # 1 4 B (nil)
219
+ # 2 5 B true
220
+ # ---
221
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000cfbc>
222
+ # x y z
223
+ # <uint8> <string> <boolean>
224
+ # 0 6 C false
217
225
  #
218
226
  # @overload initialize(dataframe)
219
227
  # Create a new SubFrames object by block.
@@ -283,13 +291,13 @@ module RedAmber
283
291
  else
284
292
  raise SubFramesArgumentError, "illegal type: #{i}"
285
293
  end
286
- yielder.yield df
294
+ yielder.yield DataFrame.new_dataframe_with_schema(dataframe, df)
287
295
  end
288
296
  end
289
297
  end
290
298
  end
291
299
 
292
- # Return concatenated SubFrames as a DataDrame.
300
+ # Return concatenated SubFrames as a DataFrame.
293
301
  #
294
302
  # Once evaluated, memorize it as @baseframe.
295
303
  # @return [DataFrame]
@@ -297,7 +305,11 @@ module RedAmber
297
305
  # @since 0.4.0
298
306
  #
299
307
  def baseframe
300
- @baseframe ||= reduce(&:concatenate)
308
+ if @baseframe.nil? || @baseframe.is_a?(Enumerator)
309
+ @baseframe = reduce(&:concatenate)
310
+ else
311
+ @baseframe
312
+ end
301
313
  end
302
314
  alias_method :concatenate, :baseframe
303
315
  alias_method :concat, :baseframe
@@ -325,13 +337,13 @@ module RedAmber
325
337
  # returns self.
326
338
  #
327
339
  # @example Returns Enumerator
328
- # sf.each
340
+ # subframes.each
329
341
  #
330
342
  # # =>
331
343
  # #<Enumerator: ...>
332
344
  #
333
345
  # @example `to_a` from Enumerable.
334
- # sf.to_a
346
+ # subframes.to_a
335
347
  #
336
348
  # # =>
337
349
  # [#<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000002a120>
@@ -354,7 +366,7 @@ module RedAmber
354
366
  # ]
355
367
  #
356
368
  # @example Concatenate SubFrames. This example is used in #concatenate.
357
- # sf.reduce(&:concatenate)
369
+ # subframes.reduce(&:concatenate)
358
370
  #
359
371
  # # =>
360
372
  # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000004883c>
@@ -378,13 +390,102 @@ module RedAmber
378
390
 
379
391
  # Aggregate SubFrames to create a DataFrame.
380
392
  #
381
- # This method will check if built-in aggregation function is used.
382
- # @todo Support user-defined aggregation functions.
393
+ # This method creates a DataFrame with one row corresponding to one sub dataframe.
394
+ # @note This method does not check whether an aggregation function is used.
395
+ #
396
+ # @overload aggregate(keys)
397
+ #
398
+ # Aggregate SubFrames creating DataFrame with label `keys` and
399
+ # its column values by block.
400
+ #
401
+ # @param keys [Symbol, Array<Symbol>]
402
+ # a key or keys of result. Key names may be renamed to new label.
403
+ # @yieldparam dataframe [DataFrame]
404
+ # passes each dataframe in self to the block. Block is called by instance_eval,
405
+ # so inside of the block is the context of passed dataframe.
406
+ # @yieldreturn [Array]
407
+ # aggregated values from the columns of passed dataframe.
408
+ # @return [DataFrame]
409
+ # created DataFrame.
410
+ # @example Aggregate by key labels in arguments and values from block.
411
+ # subframes.aggregate(:y, :sum_x) { [y.first, x.sum] }
412
+ #
413
+ # # =>
414
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
415
+ # y sum_x
416
+ # <string> <uint8>
417
+ # 0 A 3
418
+ # 1 B 12
419
+ # 2 C 6
420
+ #
421
+ # @example Aggregate by key labels in an Array and values from block.
422
+ # subframes.aggregate([:y, :sum_x]) { [y.first, x.sum] }
423
+ #
424
+ # # =>
425
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
426
+ # y sum_x
427
+ # <string> <uint8>
428
+ # 0 A 3
429
+ # 1 B 12
430
+ # 2 C 6
431
+ #
432
+ # @overload aggregate
433
+ #
434
+ # Aggregate SubFrames creating DataFrame with pairs of key and aggregated value
435
+ # in Hash from the block.
436
+ #
437
+ # @yieldparam dataframe [DataFrame]
438
+ # passes each dataframe in self to the block. Block is called by instance_eval,
439
+ # so inside of the block is the context of passed dataframe.
440
+ # @yieldreturn [Hash<key => aggregated_value>]
441
+ # pairs of key name and aggregated values from the columns of passed dataframe.
442
+ # Key names may be renamed to new label in the result.
443
+ # @return [DataFrame]
444
+ # created DataFrame.
445
+ # @example Aggregate by key and value pairs from block.
446
+ # subframes.aggregate do
447
+ # { y: y.first, sum_x: x.sum }
448
+ # end
449
+ #
450
+ # # =>
451
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
452
+ # y sum_x
453
+ # <string> <uint8>
454
+ # 0 A 3
455
+ # 1 B 12
456
+ # 2 C 6
457
+ #
458
+ # @overload aggregate
459
+ #
460
+ # Aggregate SubFrames creating DataFrame with an Array of key and aggregated value
461
+ # from the block.
462
+ #
463
+ # @yieldparam dataframe [DataFrame]
464
+ # passes each dataframe in self to the block. Block is called by instance_eval,
465
+ # so inside of the block is the context of passed dataframe.
466
+ # @yieldreturn [Array<key, aggregated_value>]
467
+ # pairs of key name and aggregated values from the columns of passed dataframe.
468
+ # Key names may be renamed to new label in the result.
469
+ # @return [DataFrame]
470
+ # created DataFrame.
471
+ # @example Aggregate by key and value arrays from block.
472
+ # subframes.aggregate do
473
+ # [[:y, y.first], [:sum_x, x.sum]]
474
+ # end
475
+ #
476
+ # # =>
477
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
478
+ # y sum_x
479
+ # <string> <uint8>
480
+ # 0 A 3
481
+ # 1 B 12
482
+ # 2 C 6
383
483
  #
384
484
  # @overload aggregate(group_keys, aggregations)
385
485
  #
386
486
  # Aggregate SubFrames for first values of the columns of
387
487
  # `group_keys` and the aggregated results of key-function pairs.
488
+ # [Experimental] This API may be changed in the future.
388
489
  #
389
490
  # @param group_keys [Symbol, String, Array<Symbol, String>]
390
491
  # group key name(s) to output values.
@@ -393,47 +494,23 @@ module RedAmber
393
494
  # Vector aggregate function name to apply.
394
495
  # @return [DataFrame]
395
496
  # an aggregated DataFrame.
396
- # @example
397
- # subframes
398
- #
399
- # # =>
400
- # #<RedAmber::SubFrames : 0x0000000000003980>
401
- # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x0000000000003994>
402
- # 3 SubFrames: [2, 3, 1] in sizes.
403
- # ---
404
- # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x00000000000039a8>
405
- # x y z
406
- # <uint8> <string> <boolean>
407
- # 0 1 A false
408
- # 1 2 A true
409
- # ---
410
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000039bc>
411
- # x y z
412
- # <uint8> <string> <boolean>
413
- # 0 3 B false
414
- # 1 4 B (nil)
415
- # 2 5 B true
416
- # ---
417
- # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x00000000000039d0>
418
- # x y z
419
- # <uint8> <string> <boolean>
420
- # 0 6 C false
421
- #
422
- # subframes.aggregate(:y, { x: :sum })
497
+ # @example Aggregate with a group key and key function pairs by a Hash.
498
+ # subframes.aggregate(:y, { x: :sum, z: :count })
423
499
  #
424
500
  # # =>
425
501
  # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
426
- # y sum_x
427
- # <string> <uint8>
428
- # 0 A 3
429
- # 1 B 12
430
- # 2 C 6
502
+ # y sum_x count_z
503
+ # <string> <uint8> <uint8>
504
+ # 0 A 3 2
505
+ # 1 B 12 2
506
+ # 2 C 6 1
431
507
  #
432
508
  # @overload aggregate(group_keys, aggregations)
433
509
  #
434
510
  # Aggregate SubFrames for first values of the columns of
435
511
  # `group_keys` and the aggregated results of all combinations
436
512
  # of supplied keys and functions.
513
+ # [Experimental] This API may be changed in the future.
437
514
  #
438
515
  # @param group_keys [Symbol, String, Array<Symbol, String>]
439
516
  # group key name(s) to output values.
@@ -442,83 +519,60 @@ module RedAmber
442
519
  # Array of Vector aggregate function names to apply.
443
520
  # @return [DataFrame]
444
521
  # an aggregated DataFrame.
445
- # @example
522
+ # @example Aggregate with group keys and keys and functions by an Array.
446
523
  # subframes.aggregate(:y, [[:x, :z], [:count, :sum]])
447
524
  #
448
525
  # # =>
449
526
  # #<RedAmber::DataFrame : 3 x 5 Vectors, 0x000000000000fcbc>
450
- # y count_x count_z sum_x sum_z
527
+ # y count_x sum_x count_z sum_z
451
528
  # <string> <uint8> <uint8> <uint8> <uint8>
452
- # 0 A 2 2 3 1
453
- # 1 B 3 2 12 1
454
- # 2 C 1 1 6 0
529
+ # 0 A 2 3 2 1
530
+ # 1 B 3 12 2 1
531
+ # 2 C 1 6 1 0
455
532
  #
456
533
  # @since 0.4.0
457
534
  #
458
- def aggregate(group_keys, aggregations)
535
+ def aggregate(*args, &block)
459
536
  aggregator =
460
- case aggregations
461
- in Hash
462
- sf = self
463
- aggregations.map do |key, func|
464
- unless Vector.aggregate?(func)
465
- raise SubFramesArgumentError, "not an aggregation function: #{func}"
537
+ if block
538
+ if args.empty?
539
+ # aggregate { {key => value} or [[key, value], ...] }
540
+ each_with_object(Hash.new { |h, k| h[k] = [] }) do |df, hash|
541
+ df.instance_eval(&block).to_h.each do |k, v|
542
+ hash[k] << v
543
+ end
466
544
  end
467
-
468
- ["#{func}_#{key}", sf.each.map { |df| df[key].send(func) }]
545
+ else
546
+ # aggregate(keys) { values }
547
+ values = each.map { |df| Array(df.instance_eval(&block)) }.transpose
548
+ args.flatten.zip(values)
469
549
  end
470
- in [Array => keys, Array => functions]
471
- functions.each do |func|
472
- unless Vector.aggregate?(func)
473
- raise SubFramesArgumentError, "not an aggregation function: #{func}"
474
- end
550
+ else
551
+ # These functions may be removed in the future.
552
+ case args
553
+ in [group_keys1, Hash => h]
554
+ # aggregate(group_keys, { key => func })
555
+ ary = Array(group_keys1).map { |key| [:first, key] }
556
+ ary.concat(h.to_a.map { [_2, _1] }) # rubocop:disable Style/NumberedParametersLimit
557
+ in [group_keys2, [Array => keys, Array => funcs]]
558
+ # aggregate(group_keys, [keys, funcs])
559
+ ary = Array(group_keys2).map { |key| [:first, key] }
560
+ ary.concat(funcs.product(keys))
561
+ else
562
+ raise SubFramesArgumentError, "invalid argument: #{args}"
475
563
  end
476
564
  sf = self
477
- functions.product(keys).map do |func, key|
478
- ["#{func}_#{key}", sf.each.map { |df| df[key].send(func) }]
565
+ ary.map do |func, key|
566
+ label = func == :first ? key : "#{func}_#{key}"
567
+ [label, sf.each.map { |df| df[key].send(func) }]
479
568
  end
480
- else
481
- raise SubFramesArgumentError, "invalid argument: #{aggregations}"
482
569
  end
483
-
484
- if group_keys.empty?
485
- DataFrame.new(aggregator)
486
- else
487
- baseframe
488
- .pick(group_keys)
489
- .slice(offset_indices)
490
- .assign(aggregator)
491
- end
570
+ DataFrame.new(aggregator)
492
571
  end
493
572
 
494
573
  # Returns a SubFrames containing DataFrames returned by the block.
495
574
  #
496
575
  # @example Map as it is.
497
- # subframes
498
- #
499
- # # =>
500
- # #<RedAmber::SubFrames : 0x000000000001359c>
501
- # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000135b0>
502
- # 3 SubFrames: [2, 3, 1] in sizes.
503
- # ---
504
- # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x00000000000135c4>
505
- # x y z
506
- # <uint8> <string> <boolean>
507
- # 0 1 A false
508
- # 1 2 A true
509
- # ---
510
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000135d8>
511
- # x y z
512
- # <uint8> <string> <boolean>
513
- # 0 3 B false
514
- # 1 4 B (nil)
515
- # 2 5 B true
516
- # ---
517
- # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x00000000000135ec>
518
- # x y z
519
- # <uint8> <string> <boolean>
520
- # 0 6 C false
521
- #
522
576
  # subframes.map { _1 }
523
577
  #
524
578
  # # This will create a new SubFrame and a new baseframe,
@@ -593,31 +647,6 @@ module RedAmber
593
647
  # @return [SubFrames]
594
648
  # a new SubFrames object with updated DataFrames.
595
649
  # @example
596
- # subframes
597
- #
598
- # # =>
599
- # #<RedAmber::SubFrames : 0x000000000000c33c>
600
- # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000c350>
601
- # 3 SubFrames: [2, 3, 1] in sizes.
602
- # ---
603
- # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000c364>
604
- # x y z
605
- # <uint8> <string> <boolean>
606
- # 0 1 A false
607
- # 1 2 A true
608
- # ---
609
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
610
- # x y z
611
- # <uint8> <string> <boolean>
612
- # 0 3 B false
613
- # 1 4 B (nil)
614
- # 2 5 B true
615
- # ---
616
- # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000c38c>
617
- # x y z
618
- # <uint8> <string> <boolean>
619
- # 0 6 C false
620
- #
621
650
  # subframes.assign(:x_plus1) { x + 1 }
622
651
  #
623
652
  # # =>
@@ -912,7 +941,7 @@ module RedAmber
912
941
  # @return [Array<Integer>]
913
942
  # indices of offset of each sub DataFrames.
914
943
  # @example When `sizes` is [2, 3, 1].
915
- # sf.offset_indices # => [0, 2, 5]
944
+ # subframes.offset_indices # => [0, 2, 5]
916
945
  # @since 0.4.0
917
946
  #
918
947
  def offset_indices
@@ -1036,9 +1065,15 @@ module RedAmber
1036
1065
  # @since 0.4.0
1037
1066
  #
1038
1067
  def inspect(limit: 16)
1068
+ shape =
1069
+ if @baseframe.is_a?(Enumerator)
1070
+ "Enumerator::Lazy:size=#{@baseframe.size}"
1071
+ else
1072
+ baseframe.shape_str(with_id: true)
1073
+ end
1039
1074
  sizes_truncated = (size > limit ? sizes.take(limit) << '...' : sizes).join(', ')
1040
1075
  "#<#{self.class} : #{format('0x%016x', object_id)}>\n" \
1041
- "@baseframe=#<#{baseframe.shape_str(with_id: true)}>\n" \
1076
+ "@baseframe=#<#{shape}>\n" \
1042
1077
  "#{size} SubFrame#{pl(size)}: " \
1043
1078
  "[#{sizes_truncated}] in size#{pl(size)}.\n" \
1044
1079
  "---\n#{_to_s(limit: limit, with_id: true)}"
@@ -27,28 +27,6 @@ module RedAmber
27
27
  instance
28
28
  end
29
29
 
30
- # Return true if it is an aggregation function.
31
- #
32
- # @param function [Symbol]
33
- # function name to test.
34
- # @return [Booleans]
35
- # true if function is a aggregation function, otherwise false.
36
- #
37
- # @example
38
- # Vector.aggregate?(:mean) # => true
39
- #
40
- # Vector.aggregate?(:round) # => false
41
- #
42
- # @since 0.4.0
43
- #
44
- def self.aggregate?(function)
45
- %i[
46
- all all? any any? approximate_median count count_distinct count_uniq
47
- max mean median min min_max product quantile sd std stddev sum
48
- unbiased_variance var variance
49
- ].include?(function.to_sym)
50
- end
51
-
52
30
  # Create a Vector.
53
31
  #
54
32
  # @param array [Array, Vector, Range, Arrow::Array, #to_arrow_array]
@@ -532,13 +510,10 @@ module RedAmber
532
510
 
533
511
  yield self
534
512
  else
535
- function = function&.to_sym
536
- unless function && respond_to?(function) && Vector.aggregate?(function)
537
- raise VectorArgumentError, "illegal function: #{function.inspect}"
538
- end
539
-
540
- send(function)
513
+ send(function&.to_sym)
541
514
  end
515
+ raise VectorArgumentError, 'not an aggregation function' if value.is_a?(Vector)
516
+
542
517
  Vector.new([value] * size)
543
518
  end
544
519
  alias_method :expand, :propagate
@@ -555,8 +530,10 @@ module RedAmber
555
530
  case other
556
531
  when Vector
557
532
  find(function).execute([data, other.data], options)
558
- when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar,
559
- Array, Numeric, String, TrueClass, FalseClass
533
+ when NilClass
534
+ nils = data.class.new([nil] * size)
535
+ find(function).execute([data, nils], options)
536
+ else
560
537
  find(function).execute([data, other], options)
561
538
  end
562
539
  end