red_amber 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -82,7 +82,7 @@ module RedAmber
82
82
  enum =
83
83
  Enumerator.new(subset_indices.size) do |y|
84
84
  subset_indices.each do |i|
85
- y.yield dataframe.take(i)
85
+ y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.take(i))
86
86
  end
87
87
  end
88
88
  instance.instance_variable_set(:@enum, enum)
@@ -108,7 +108,7 @@ module RedAmber
108
108
  enum =
109
109
  Enumerator.new(subset_filters.size) do |y|
110
110
  subset_filters.each do |i|
111
- y.yield dataframe.filter(i)
111
+ y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.filter(i))
112
112
  end
113
113
  end
114
114
  instance.instance_variable_set(:@enum, enum)
@@ -139,7 +139,7 @@ module RedAmber
139
139
  y.yield i
140
140
  end
141
141
  end
142
- instance.instance_variable_set(:@baseframe, enum.reduce(&:concatenate))
142
+ instance.instance_variable_set(:@baseframe, enum.lazy)
143
143
  end
144
144
  instance.instance_variable_set(:@enum, enum)
145
145
  instance
@@ -160,11 +160,13 @@ module RedAmber
160
160
  # @return [SubFrames]
161
161
  # a new SubFrames.
162
162
  #
163
+ # @since 0.4.0
164
+ #
163
165
  def define_subframable_method(method)
164
166
  define_method(method) do |&block|
165
167
  return enum_for(:each) { size } unless block # rubocop:disable Lint/ToEnumArguments
166
168
 
167
- self.class.by_dataframes(super(&block))
169
+ SubFrames.by_dataframes(super(&block))
168
170
  end
169
171
  end
170
172
  end
@@ -195,25 +197,31 @@ module RedAmber
195
197
  # 4 5 B true
196
198
  # 5 6 C false
197
199
  #
198
- # SubFrames.new(dataframe, [[0, 2, 3], [4, 1]])
200
+ # # --- This object is used as common source in this class ---
201
+ # subframes = SubFrames.new(dataframe, [[0, 1], [2, 3, 4], [5]])
199
202
  #
200
203
  # # =>
201
- # #<RedAmber::SubFrames : 0x0000000000003a34>
202
- # @baseframe=#<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000003a48>
203
- # 2 SubFrames: [3, 2] in sizes.
204
+ # #<RedAmber::SubFrames : 0x000000000000cf6c>
205
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000cf80>
206
+ # 3 SubFrames: [2, 3, 1] in sizes.
204
207
  # ---
205
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003a5c>
208
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000cf94>
206
209
  # x y z
207
210
  # <uint8> <string> <boolean>
208
211
  # 0 1 A false
209
- # 1 3 B false
210
- # 2 4 B (nil)
212
+ # 1 2 A true
211
213
  # ---
212
- # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003a70>
214
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000cfa8>
213
215
  # x y z
214
216
  # <uint8> <string> <boolean>
215
- # 0 5 B true
216
- # 1 2 A true
217
+ # 0 3 B false
218
+ # 1 4 B (nil)
219
+ # 2 5 B true
220
+ # ---
221
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000cfbc>
222
+ # x y z
223
+ # <uint8> <string> <boolean>
224
+ # 0 6 C false
217
225
  #
218
226
  # @overload initialize(dataframe)
219
227
  # Create a new SubFrames object by block.
@@ -283,13 +291,13 @@ module RedAmber
283
291
  else
284
292
  raise SubFramesArgumentError, "illegal type: #{i}"
285
293
  end
286
- yielder.yield df
294
+ yielder.yield DataFrame.new_dataframe_with_schema(dataframe, df)
287
295
  end
288
296
  end
289
297
  end
290
298
  end
291
299
 
292
- # Return concatenated SubFrames as a DataDrame.
300
+ # Return concatenated SubFrames as a DataFrame.
293
301
  #
294
302
  # Once evaluated, memorize it as @baseframe.
295
303
  # @return [DataFrame]
@@ -297,7 +305,11 @@ module RedAmber
297
305
  # @since 0.4.0
298
306
  #
299
307
  def baseframe
300
- @baseframe ||= reduce(&:concatenate)
308
+ if @baseframe.nil? || @baseframe.is_a?(Enumerator)
309
+ @baseframe = reduce(&:concatenate)
310
+ else
311
+ @baseframe
312
+ end
301
313
  end
302
314
  alias_method :concatenate, :baseframe
303
315
  alias_method :concat, :baseframe
@@ -325,13 +337,13 @@ module RedAmber
325
337
  # returns self.
326
338
  #
327
339
  # @example Returns Enumerator
328
- # sf.each
340
+ # subframes.each
329
341
  #
330
342
  # # =>
331
343
  # #<Enumerator: ...>
332
344
  #
333
345
  # @example `to_a` from Enumerable.
334
- # sf.to_a
346
+ # subframes.to_a
335
347
  #
336
348
  # # =>
337
349
  # [#<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000002a120>
@@ -354,7 +366,7 @@ module RedAmber
354
366
  # ]
355
367
  #
356
368
  # @example Concatenate SubFrames. This example is used in #concatenate.
357
- # sf.reduce(&:concatenate)
369
+ # subframes.reduce(&:concatenate)
358
370
  #
359
371
  # # =>
360
372
  # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000004883c>
@@ -378,13 +390,102 @@ module RedAmber
378
390
 
379
391
  # Aggregate SubFrames to create a DataFrame.
380
392
  #
381
- # This method will check if built-in aggregation function is used.
382
- # @todo Support user-defined aggregation functions.
393
+ # This method creates a DataFrame with one row corresponding to one sub dataframe.
394
+ # @note This method does not check whether an aggregation function is used.
395
+ #
396
+ # @overload aggregate(keys)
397
+ #
398
+ # Aggregate SubFrames creating DataFrame with label `keys` and
399
+ # its column values by block.
400
+ #
401
+ # @param keys [Symbol, Array<Symbol>]
402
+ # a key or keys of result. Key names may be renamed to new label.
403
+ # @yieldparam dataframe [DataFrame]
404
+ # passes each dataframe in self to the block. Block is called by instance_eval,
405
+ # so inside of the block is the context of passed dataframe.
406
+ # @yieldreturn [Array]
407
+ # aggregated values from the columns of passed dataframe.
408
+ # @return [DataFrame]
409
+ # created DataFrame.
410
+ # @example Aggregate by key labels in arguments and values from block.
411
+ # subframes.aggregate(:y, :sum_x) { [y.first, x.sum] }
412
+ #
413
+ # # =>
414
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
415
+ # y sum_x
416
+ # <string> <uint8>
417
+ # 0 A 3
418
+ # 1 B 12
419
+ # 2 C 6
420
+ #
421
+ # @example Aggregate by key labels in an Array and values from block.
422
+ # subframes.aggregate([:y, :sum_x]) { [y.first, x.sum] }
423
+ #
424
+ # # =>
425
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
426
+ # y sum_x
427
+ # <string> <uint8>
428
+ # 0 A 3
429
+ # 1 B 12
430
+ # 2 C 6
431
+ #
432
+ # @overload aggregate
433
+ #
434
+ # Aggregate SubFrames creating DataFrame with pairs of key and aggregated value
435
+ # in Hash from the block.
436
+ #
437
+ # @yieldparam dataframe [DataFrame]
438
+ # passes each dataframe in self to the block. Block is called by instance_eval,
439
+ # so inside of the block is the context of passed dataframe.
440
+ # @yieldreturn [Hash<key => aggregated_value>]
441
+ # pairs of key name and aggregated values from the columns of passed dataframe.
442
+ # Key names may be renamed to new label in the result.
443
+ # @return [DataFrame]
444
+ # created DataFrame.
445
+ # @example Aggregate by key and value pairs from block.
446
+ # subframes.aggregate do
447
+ # { y: y.first, sum_x: x.sum }
448
+ # end
449
+ #
450
+ # # =>
451
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
452
+ # y sum_x
453
+ # <string> <uint8>
454
+ # 0 A 3
455
+ # 1 B 12
456
+ # 2 C 6
457
+ #
458
+ # @overload aggregate
459
+ #
460
+ # Aggregate SubFrames creating DataFrame with an Array of key and aggregated value
461
+ # from the block.
462
+ #
463
+ # @yieldparam dataframe [DataFrame]
464
+ # passes each dataframe in self to the block. Block is called by instance_eval,
465
+ # so inside of the block is the context of passed dataframe.
466
+ # @yieldreturn [Array<key, aggregated_value>]
467
+ # pairs of key name and aggregated values from the columns of passed dataframe.
468
+ # Key names may be renamed to new label in the result.
469
+ # @return [DataFrame]
470
+ # created DataFrame.
471
+ # @example Aggregate by key and value arrays from block.
472
+ # subframes.aggregate do
473
+ # [[:y, y.first], [:sum_x, x.sum]]
474
+ # end
475
+ #
476
+ # # =>
477
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
478
+ # y sum_x
479
+ # <string> <uint8>
480
+ # 0 A 3
481
+ # 1 B 12
482
+ # 2 C 6
383
483
  #
384
484
  # @overload aggregate(group_keys, aggregations)
385
485
  #
386
486
  # Aggregate SubFrames for first values of the columns of
387
487
  # `group_keys` and the aggregated results of key-function pairs.
488
+ # [Experimental] This API may be changed in the future.
388
489
  #
389
490
  # @param group_keys [Symbol, String, Array<Symbol, String>]
390
491
  # group key name(s) to output values.
@@ -393,47 +494,23 @@ module RedAmber
393
494
  # Vector aggregate function name to apply.
394
495
  # @return [DataFrame]
395
496
  # an aggregated DataFrame.
396
- # @example
397
- # subframes
398
- #
399
- # # =>
400
- # #<RedAmber::SubFrames : 0x0000000000003980>
401
- # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x0000000000003994>
402
- # 3 SubFrames: [2, 3, 1] in sizes.
403
- # ---
404
- # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x00000000000039a8>
405
- # x y z
406
- # <uint8> <string> <boolean>
407
- # 0 1 A false
408
- # 1 2 A true
409
- # ---
410
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000039bc>
411
- # x y z
412
- # <uint8> <string> <boolean>
413
- # 0 3 B false
414
- # 1 4 B (nil)
415
- # 2 5 B true
416
- # ---
417
- # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x00000000000039d0>
418
- # x y z
419
- # <uint8> <string> <boolean>
420
- # 0 6 C false
421
- #
422
- # subframes.aggregate(:y, { x: :sum })
497
+ # @example Aggregate with a group key and key function pairs by a Hash.
498
+ # subframes.aggregate(:y, { x: :sum, z: :count })
423
499
  #
424
500
  # # =>
425
501
  # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
426
- # y sum_x
427
- # <string> <uint8>
428
- # 0 A 3
429
- # 1 B 12
430
- # 2 C 6
502
+ # y sum_x count_z
503
+ # <string> <uint8> <uint8>
504
+ # 0 A 3 2
505
+ # 1 B 12 2
506
+ # 2 C 6 1
431
507
  #
432
508
  # @overload aggregate(group_keys, aggregations)
433
509
  #
434
510
  # Aggregate SubFrames for first values of the columns of
435
511
  # `group_keys` and the aggregated results of all combinations
436
512
  # of supplied keys and functions.
513
+ # [Experimental] This API may be changed in the future.
437
514
  #
438
515
  # @param group_keys [Symbol, String, Array<Symbol, String>]
439
516
  # group key name(s) to output values.
@@ -442,83 +519,60 @@ module RedAmber
442
519
  # Array of Vector aggregate function names to apply.
443
520
  # @return [DataFrame]
444
521
  # an aggregated DataFrame.
445
- # @example
522
+ # @example Aggregate with group keys and keys and functions by an Array.
446
523
  # subframes.aggregate(:y, [[:x, :z], [:count, :sum]])
447
524
  #
448
525
  # # =>
449
526
  # #<RedAmber::DataFrame : 3 x 5 Vectors, 0x000000000000fcbc>
450
- # y count_x count_z sum_x sum_z
527
+ # y count_x sum_x count_z sum_z
451
528
  # <string> <uint8> <uint8> <uint8> <uint8>
452
- # 0 A 2 2 3 1
453
- # 1 B 3 2 12 1
454
- # 2 C 1 1 6 0
529
+ # 0 A 2 3 2 1
530
+ # 1 B 3 12 2 1
531
+ # 2 C 1 6 1 0
455
532
  #
456
533
  # @since 0.4.0
457
534
  #
458
- def aggregate(group_keys, aggregations)
535
+ def aggregate(*args, &block)
459
536
  aggregator =
460
- case aggregations
461
- in Hash
462
- sf = self
463
- aggregations.map do |key, func|
464
- unless Vector.aggregate?(func)
465
- raise SubFramesArgumentError, "not an aggregation function: #{func}"
537
+ if block
538
+ if args.empty?
539
+ # aggregate { {key => value} or [[key, value], ...] }
540
+ each_with_object(Hash.new { |h, k| h[k] = [] }) do |df, hash|
541
+ df.instance_eval(&block).to_h.each do |k, v|
542
+ hash[k] << v
543
+ end
466
544
  end
467
-
468
- ["#{func}_#{key}", sf.each.map { |df| df[key].send(func) }]
545
+ else
546
+ # aggregate(keys) { values }
547
+ values = each.map { |df| Array(df.instance_eval(&block)) }.transpose
548
+ args.flatten.zip(values)
469
549
  end
470
- in [Array => keys, Array => functions]
471
- functions.each do |func|
472
- unless Vector.aggregate?(func)
473
- raise SubFramesArgumentError, "not an aggregation function: #{func}"
474
- end
550
+ else
551
+ # These functions may be removed in the future.
552
+ case args
553
+ in [group_keys1, Hash => h]
554
+ # aggregate(group_keys, { key => func })
555
+ ary = Array(group_keys1).map { |key| [:first, key] }
556
+ ary.concat(h.to_a.map { [_2, _1] }) # rubocop:disable Style/NumberedParametersLimit
557
+ in [group_keys2, [Array => keys, Array => funcs]]
558
+ # aggregate(group_keys, [keys, funcs])
559
+ ary = Array(group_keys2).map { |key| [:first, key] }
560
+ ary.concat(funcs.product(keys))
561
+ else
562
+ raise SubFramesArgumentError, "invalid argument: #{args}"
475
563
  end
476
564
  sf = self
477
- functions.product(keys).map do |func, key|
478
- ["#{func}_#{key}", sf.each.map { |df| df[key].send(func) }]
565
+ ary.map do |func, key|
566
+ label = func == :first ? key : "#{func}_#{key}"
567
+ [label, sf.each.map { |df| df[key].send(func) }]
479
568
  end
480
- else
481
- raise SubFramesArgumentError, "invalid argument: #{aggregations}"
482
569
  end
483
-
484
- if group_keys.empty?
485
- DataFrame.new(aggregator)
486
- else
487
- baseframe
488
- .pick(group_keys)
489
- .slice(offset_indices)
490
- .assign(aggregator)
491
- end
570
+ DataFrame.new(aggregator)
492
571
  end
493
572
 
494
573
  # Returns a SubFrames containing DataFrames returned by the block.
495
574
  #
496
575
  # @example Map as it is.
497
- # subframes
498
- #
499
- # # =>
500
- # #<RedAmber::SubFrames : 0x000000000001359c>
501
- # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000135b0>
502
- # 3 SubFrames: [2, 3, 1] in sizes.
503
- # ---
504
- # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x00000000000135c4>
505
- # x y z
506
- # <uint8> <string> <boolean>
507
- # 0 1 A false
508
- # 1 2 A true
509
- # ---
510
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000135d8>
511
- # x y z
512
- # <uint8> <string> <boolean>
513
- # 0 3 B false
514
- # 1 4 B (nil)
515
- # 2 5 B true
516
- # ---
517
- # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x00000000000135ec>
518
- # x y z
519
- # <uint8> <string> <boolean>
520
- # 0 6 C false
521
- #
522
576
  # subframes.map { _1 }
523
577
  #
524
578
  # # This will create a new SubFrame and a new baseframe,
@@ -593,31 +647,6 @@ module RedAmber
593
647
  # @return [SubFrames]
594
648
  # a new SubFrames object with updated DataFrames.
595
649
  # @example
596
- # subframes
597
- #
598
- # # =>
599
- # #<RedAmber::SubFrames : 0x000000000000c33c>
600
- # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000c350>
601
- # 3 SubFrames: [2, 3, 1] in sizes.
602
- # ---
603
- # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000c364>
604
- # x y z
605
- # <uint8> <string> <boolean>
606
- # 0 1 A false
607
- # 1 2 A true
608
- # ---
609
- # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
610
- # x y z
611
- # <uint8> <string> <boolean>
612
- # 0 3 B false
613
- # 1 4 B (nil)
614
- # 2 5 B true
615
- # ---
616
- # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000c38c>
617
- # x y z
618
- # <uint8> <string> <boolean>
619
- # 0 6 C false
620
- #
621
650
  # subframes.assign(:x_plus1) { x + 1 }
622
651
  #
623
652
  # # =>
@@ -912,7 +941,7 @@ module RedAmber
912
941
  # @return [Array<Integer>]
913
942
  # offset indices of each sub DataFrame.
914
943
  # @example When `sizes` is [2, 3, 1].
915
- # sf.offset_indices # => [0, 2, 5]
944
+ # subframes.offset_indices # => [0, 2, 5]
916
945
  # @since 0.4.0
917
946
  #
918
947
  def offset_indices
@@ -1036,9 +1065,15 @@ module RedAmber
1036
1065
  # @since 0.4.0
1037
1066
  #
1038
1067
  def inspect(limit: 16)
1068
+ shape =
1069
+ if @baseframe.is_a?(Enumerator)
1070
+ "Enumerator::Lazy:size=#{@baseframe.size}"
1071
+ else
1072
+ baseframe.shape_str(with_id: true)
1073
+ end
1039
1074
  sizes_truncated = (size > limit ? sizes.take(limit) << '...' : sizes).join(', ')
1040
1075
  "#<#{self.class} : #{format('0x%016x', object_id)}>\n" \
1041
- "@baseframe=#<#{baseframe.shape_str(with_id: true)}>\n" \
1076
+ "@baseframe=#<#{shape}>\n" \
1042
1077
  "#{size} SubFrame#{pl(size)}: " \
1043
1078
  "[#{sizes_truncated}] in size#{pl(size)}.\n" \
1044
1079
  "---\n#{_to_s(limit: limit, with_id: true)}"
@@ -27,28 +27,6 @@ module RedAmber
27
27
  instance
28
28
  end
29
29
 
30
- # Return true if it is an aggregation function.
31
- #
32
- # @param function [Symbol]
33
- # function name to test.
34
- # @return [Booleans]
35
- # true if function is a aggregation function, otherwise false.
36
- #
37
- # @example
38
- # Vector.aggregate?(:mean) # => true
39
- #
40
- # Vector.aggregate?(:round) # => false
41
- #
42
- # @since 0.4.0
43
- #
44
- def self.aggregate?(function)
45
- %i[
46
- all all? any any? approximate_median count count_distinct count_uniq
47
- max mean median min min_max product quantile sd std stddev sum
48
- unbiased_variance var variance
49
- ].include?(function.to_sym)
50
- end
51
-
52
30
  # Create a Vector.
53
31
  #
54
32
  # @param array [Array, Vector, Range, Arrow::Array, #to_arrow_array]
@@ -532,13 +510,10 @@ module RedAmber
532
510
 
533
511
  yield self
534
512
  else
535
- function = function&.to_sym
536
- unless function && respond_to?(function) && Vector.aggregate?(function)
537
- raise VectorArgumentError, "illegal function: #{function.inspect}"
538
- end
539
-
540
- send(function)
513
+ send(function&.to_sym)
541
514
  end
515
+ raise VectorArgumentError, 'not an aggregation function' if value.is_a?(Vector)
516
+
542
517
  Vector.new([value] * size)
543
518
  end
544
519
  alias_method :expand, :propagate
@@ -555,8 +530,10 @@ module RedAmber
555
530
  case other
556
531
  when Vector
557
532
  find(function).execute([data, other.data], options)
558
- when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar,
559
- Array, Numeric, String, TrueClass, FalseClass
533
+ when NilClass
534
+ nils = data.class.new([nil] * size)
535
+ find(function).execute([data, nils], options)
536
+ else
560
537
  find(function).execute([data, other], options)
561
538
  end
562
539
  end