red_amber 0.3.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +56 -22
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +29 -30
  8. data/benchmark/basic.yml +7 -7
  9. data/benchmark/combine.yml +3 -3
  10. data/benchmark/dataframe.yml +15 -9
  11. data/benchmark/group.yml +6 -6
  12. data/benchmark/reshape.yml +6 -6
  13. data/benchmark/vector.yml +6 -3
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +454 -85
  20. data/lib/red_amber/data_frame_combinable.rb +609 -115
  21. data/lib/red_amber/data_frame_displayable.rb +313 -34
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +623 -70
  26. data/lib/red_amber/data_frame_variable_operation.rb +452 -35
  27. data/lib/red_amber/group.rb +186 -22
  28. data/lib/red_amber/helper.rb +74 -14
  29. data/lib/red_amber/refinements.rb +26 -6
  30. data/lib/red_amber/subframes.rb +1101 -0
  31. data/lib/red_amber/vector.rb +362 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +506 -0
  34. data/lib/red_amber/vector_selectable.rb +265 -23
  35. data/lib/red_amber/vector_unary_element_wise.rb +529 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -0,0 +1,1101 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # class SubFrames treats a set of subsets of a DataFrame
5
+ # [Experimental feature] Class SubFrames may be removed or be changed in the future.
6
+ class SubFrames
7
+ include Enumerable # may change to use Forwardable.
8
+ include Helper
9
+
10
+ using RefineArray
11
+ using RefineArrayLike
12
+
13
+ class << self
14
+ # Create SubFrames from a Group.
15
+ #
16
+ # [Experimental feature] this method may be removed or be changed in the future.
17
+ # @param group [Group]
18
+ # a Group to be used to create SubFrames.
19
+ # @return [SubFrames]
20
+ # a created SubFrames.
21
+ # @example
22
+ # dataframe
23
+ #
24
+ # # =>
25
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000fba4>
26
+ # x y z
27
+ # <uint8> <string> <boolean>
28
+ # 0 1 A false
29
+ # 1 2 A true
30
+ # 2 3 B false
31
+ # 3 4 B (nil)
32
+ # 4 5 B true
33
+ # 5 6 C false
34
+ #
35
+ # group = Group.new(dataframe, [:y])
36
+ # sf = SubFrames.by_group(group)
37
+ #
38
+ # # =>
39
+ # #<RedAmber::SubFrames : 0x000000000000fbb8>
40
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000fb7c>
41
+ # 3 SubFrames: [2, 3, 1] in sizes.
42
+ # ---
43
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000fbcc>
44
+ # x y z
45
+ # <uint8> <string> <boolean>
46
+ # 0 1 A false
47
+ # 1 2 A true
48
+ # ---
49
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000fbe0>
50
+ # x y z
51
+ # <uint8> <string> <boolean>
52
+ # 0 3 B false
53
+ # 1 4 B (nil)
54
+ # 2 5 B true
55
+ # ---
56
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000fbf4>
57
+ # x y z
58
+ # <uint8> <string> <boolean>
59
+ # 0 6 C false
60
+ #
61
+ # @since 0.4.0
62
+ #
63
def by_group(group)
  # Delegate to the regular constructor, handing over the Group's source
  # DataFrame together with its filters as the subset specifier.
  source = group.dataframe
  filters = group.filters
  SubFrames.new(source, filters)
end
66
+
67
+ # Create a new SubFrames object from a DataFrame and an array of indices.
68
+ #
69
+ # @api private
70
+ # @note this method doesn't check arguments.
71
+ # @param dataframe [DataFrame]
72
+ # a source dataframe.
73
+ # @param subset_indices [Array, Array<Vector>]
74
+ # an Array of numeric indices to create subsets of DataFrame.
75
+ # @return [SubFrames]
76
+ # a new SubFrames object.
77
+ # @since 0.4.0
78
+ #
79
def by_indices(dataframe, subset_indices)
  # Sub DataFrames are produced lazily: nothing is taken from `dataframe`
  # until the Enumerator is iterated.
  lazy_frames =
    Enumerator.new(subset_indices.size) do |yielder|
      subset_indices.each do |indices|
        subset = dataframe.take(indices)
        yielder.yield DataFrame.new_dataframe_with_schema(dataframe, subset)
      end
    end

  # `allocate` bypasses #initialize, so no argument checking happens here.
  allocate.tap do |subframes|
    subframes.instance_variable_set(:@baseframe, dataframe)
    subframes.instance_variable_set(:@enum, lazy_frames)
  end
end
91
+
92
+ # Create a new SubFrames object from a DataFrame and an array of filters.
93
+ #
94
+ # @api private
95
+ # @note this method doesn't check arguments.
96
+ # @param dataframe [DataFrame]
97
+ # a source dataframe.
98
+ # @param subset_filters [Array, Array<Vector>]
99
+ # an Array of booleans to specify subsets of DataFrame.
100
+ # Each filter must have the same length as dataframe.
101
+ # @return [SubFrames]
102
+ # a new SubFrames object.
103
+ # @since 0.4.0
104
+ #
105
def by_filters(dataframe, subset_filters)
  # Filtering is deferred until the Enumerator is iterated.
  filtered_frames =
    Enumerator.new(subset_filters.size) do |yielder|
      subset_filters.each do |booleans|
        selected = dataframe.filter(booleans)
        yielder.yield DataFrame.new_dataframe_with_schema(dataframe, selected)
      end
    end

  # `allocate` bypasses #initialize, so the arguments are not validated.
  allocate.tap do |subframes|
    subframes.instance_variable_set(:@baseframe, dataframe)
    subframes.instance_variable_set(:@enum, filtered_frames)
  end
end
117
+
118
+ # Create a new SubFrames from an Array of DataFrames.
119
+ #
120
+ # @api private
121
+ # @note dataframes must have same schema.
122
+ # @param dataframes [Array<DataFrame>]
123
+ # an array of DataFrames which have same schema.
124
+ # @return [SubFrames]
125
+ # a new SubFrames object.
126
+ # @since 0.4.0
127
+ #
128
def by_dataframes(dataframes)
  instance = allocate
  case Array(dataframes)
  when [], [nil]
    # Nothing to hold (nil, [] or [nil]): build an empty SubFrames.
    # The former `when [] || [nil]` evaluated to `when []` only, because
    # `[]` is truthy, so `[nil]` was never matched.
    instance.instance_variable_set(:@baseframe, DataFrame.new)
    instance.instance_variable_set(:@frames, [])
    enum = [].each
  else
    enum =
      Enumerator.new(dataframes.size) do |yielder|
        dataframes.each { |dataframe| yielder.yield dataframe }
      end
    # An Enumerator stored in @baseframe acts as a placeholder; #baseframe
    # replaces it with the concatenated DataFrame on first access.
    instance.instance_variable_set(:@baseframe, enum.lazy)
  end
  instance.instance_variable_set(:@enum, enum)
  instance
end
147
+
148
+ private
149
+
150
+ # This method upgrades an iterating method from Enumerable to return SubFrames.
151
+
152
+ # @!macro [attach] define_subframable_method
153
+ #
154
+ # [Returns SubFrames] Use `#each.$1` if you want to get DataFrames by Array.
155
+ # Returns an Enumerator with no block given.
156
+ # @yieldparam dataframe [DataFrame]
157
+ # gives each element.
158
+ # @yieldreturn [Array<DataFrame>]
159
+ # the block should return DataFrames with same schema.
160
+ # @return [SubFrames]
161
+ # a new SubFrames.
162
+ #
163
+ # @since 0.4.0
164
+ #
165
def define_subframable_method(method)
  define_method(method) do |&block|
    if block
      # Run the Enumerable implementation, then wrap its result
      # back into a SubFrames.
      SubFrames.by_dataframes(super(&block))
    else
      # Without a block, hand back a sized Enumerator over #each.
      enum_for(:each) { size } # rubocop:disable Lint/ToEnumArguments
    end
  end
end
172
+ end
173
+
174
+ # Create a new SubFrames object from a DataFrame and an array of indices or filters.
175
+ #
176
+ # @overload initialize(dataframe, subset_specifier)
177
+ # Create a new SubFrames object.
178
+ #
179
+ # @param dataframe [DataFrame]
180
+ # a source dataframe.
181
+ # @param subset_specifier [Array<Vector>, Array<array-like>]
182
+ # an Array of numeric indices or boolean filters
183
+ # to create subsets of DataFrame.
184
+ # @return [SubFrames]
185
+ # new SubFrames.
186
+ # @example
187
+ # dataframe
188
+ #
189
+ # # =>
190
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000039e4>
191
+ # x y z
192
+ # <uint8> <string> <boolean>
193
+ # 0 1 A false
194
+ # 1 2 A true
195
+ # 2 3 B false
196
+ # 3 4 B (nil)
197
+ # 4 5 B true
198
+ # 5 6 C false
199
+ #
200
+ # # --- This object is used as common source in this class ---
201
+ # subframes = SubFrames.new(dataframe, [[0 ,1], [2, 3, 4], [5]])
202
+ #
203
+ # # =>
204
+ # #<RedAmber::SubFrames : 0x000000000000cf6c>
205
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000cf80>
206
+ # 3 SubFrames: [2, 3, 1] in sizes.
207
+ # ---
208
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000cf94>
209
+ # x y z
210
+ # <uint8> <string> <boolean>
211
+ # 0 1 A false
212
+ # 1 2 A true
213
+ # ---
214
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000cfa8>
215
+ # x y z
216
+ # <uint8> <string> <boolean>
217
+ # 0 3 B false
218
+ # 1 4 B (nil)
219
+ # 2 5 B true
220
+ # ---
221
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000cfbc>
222
+ # x y z
223
+ # <uint8> <string> <boolean>
224
+ # 0 6 C false
225
+ #
226
+ # @overload initialize(dataframe)
227
+ # Create a new SubFrames object by block.
228
+ #
229
+ # @param dataframe [DataFrame]
230
+ # a source dataframe.
231
+ # @yieldparam dataframe [DataFrame]
232
+ # the block is called with `dataframe`.
233
+ # @yieldreturn [Array<numeric_array_like>, Array<boolean_array_like>]
234
+ # an Array of index or boolean array-likes to create subsets of DataFrame.
235
+ # All array-likes are responsible to #numeric? or #boolean?.
236
+ # @return [SubFrames]
237
+ # a new SubFrames object.
238
+ # @example
239
+ # SubFrames.new(dataframe) do |df|
240
+ # booleans = df[:z]
241
+ # [booleans, !booleans]
242
+ # end
243
+ #
244
+ # # =>
245
+ # #<RedAmber::SubFrames : 0x0000000000003aac>
246
+ # @baseframe=#<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000003ac0>
247
+ # 2 SubFrames: [2, 3] in sizes.
248
+ # ---
249
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003ad4>
250
+ # x y z
251
+ # <uint8> <string> <boolean>
252
+ # 0 2 A true
253
+ # 1 5 B true
254
+ # ---
255
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003ae8>
256
+ # x y z
257
+ # <uint8> <string> <boolean>
258
+ # 0 1 A false
259
+ # 1 3 B false
260
+ # 2 6 C false
261
+ #
262
+ # @since 0.4.0
263
+ #
264
def initialize(dataframe, subset_specifier = nil, &block)
  unless dataframe.is_a?(DataFrame)
    raise SubFramesArgumentError, "not a DataFrame: #{dataframe}"
  end

  if block
    unless subset_specifier.nil?
      raise SubFramesArgumentError, 'Must not specify both arguments and block.'
    end

    # The block receives the source DataFrame and returns the specifier.
    subset_specifier = yield(dataframe)
  end

  if dataframe.empty? || subset_specifier.nil? || subset_specifier.empty?
    # Nothing to split: become an empty SubFrames with an empty baseframe.
    @baseframe = DataFrame.new
    @frames = []
    @enum = @frames.each
  else
    # nil marks "not built yet"; #baseframe concatenates on first access.
    @baseframe = nil
    @enum =
      Enumerator.new(subset_specifier.size) do |yielder|
        # `each`, not `map`: elements are delivered through the yielder, so
        # collecting the block results would only build a throwaway Array.
        subset_specifier.each do |specifier|
          df =
            if specifier.numeric?
              dataframe.take(specifier)
            elsif specifier.boolean?
              dataframe.filter(specifier)
            else
              raise SubFramesArgumentError, "illegal type: #{specifier}"
            end
          yielder.yield DataFrame.new_dataframe_with_schema(dataframe, df)
        end
      end
  end
end
299
+
300
+ # Return concatenated SubFrames as a DataFrame.
301
+ #
302
+ # Once evaluated, memorize it as @baseframe.
303
+ # @return [DataFrame]
304
+ # a concatenated DataFrame.
305
+ # @since 0.4.0
306
+ #
307
def baseframe
  # @baseframe is nil or an Enumerator placeholder until first use;
  # anything else is the already concatenated (cached) result.
  return @baseframe unless @baseframe.nil? || @baseframe.is_a?(Enumerator)

  @baseframe = reduce(&:concatenate)
end
314
+ alias_method :concatenate, :baseframe
315
+ alias_method :concat, :baseframe
316
+
317
+ # Iterates over sub DataFrames or returns an Enumerator.
318
+ #
319
+ # This method will memorize sub DataFrames and always returns the same object.
320
+ # The Class SubFrames is including Enumerable module.
321
+ # So many methods in Enumerable are available.
322
+ #
323
+ # @overload each
324
+ # Returns a new Enumerator if no block given.
325
+ #
326
+ # @return [Enumerator]
327
+ # Enumerator of each elements.
328
+ #
329
+ # @overload each
330
+ # When a block given, passes each sub DataFrames to the block.
331
+ #
332
+ # @yieldparam subframe [DataFrame]
333
+ # passes sub DataFrame by a block parameter.
334
+ # @yieldreturn [Object]
335
+ # evaluated result value from the block.
336
+ # @return [nil]
337
+ # returns nil.
338
+ #
339
+ # @example Returns Enumerator
340
+ # subframes.each
341
+ #
342
+ # # =>
343
+ # #<Enumerator: ...>
344
+ #
345
+ # @example `to_a` from Enumerable.
346
+ # subframes.to_a
347
+ #
348
+ # # =>
349
+ # [#<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000002a120>
350
+ # x y z
351
+ # <uint8> <string> <boolean>
352
+ # 0 1 A false
353
+ # 1 2 A true
354
+ # ,
355
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000002a134>
356
+ # x y z
357
+ # <uint8> <string> <boolean>
358
+ # 0 3 B false
359
+ # 1 4 B (nil)
360
+ # 2 5 B true
361
+ # ,
362
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000002a148>
363
+ # x y z
364
+ # <uint8> <string> <boolean>
365
+ # 0 6 C false
366
+ # ]
367
+ #
368
+ # @example Concatenate SubFrames. This example is used in #concatenate.
369
+ # subframes.reduce(&:concatenate)
370
+ #
371
+ # # =>
372
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000004883c>
373
+ # x y z
374
+ # <uint8> <string> <boolean>
375
+ # 0 1 A false
376
+ # 1 2 A true
377
+ # 2 3 B false
378
+ # 3 4 B (nil)
379
+ # 4 5 B true
380
+ # 5 6 C false
381
+ #
382
+ # @since 0.4.0
383
+ #
384
def each(&block)
  if block
    frames.each(&block)
    # Deliberately nil, not the iterated collection.
    nil
  else
    # No block: a sized Enumerator over this same method.
    enum_for(__method__) { size }
  end
end
390
+
391
+ # Aggregate SubFrames to create a DataFrame.
392
+ #
393
+ # This method creates a DataFrame with one row corresponding to one sub dataframe.
394
+ # @note This method does not check if aggregation function is used.
395
+ #
396
+ # @overload aggregate(keys)
397
+ #
398
+ # Aggregate SubFrames creating DataFrame with label `keys` and
399
+ # its column values by block.
400
+ #
401
+ # @param keys [Symbol, Array<Symbol>]
402
+ # a key or keys of result. Key names may be renamed to new label.
403
+ # @yieldparam dataframe [DataFrame]
404
+ # passes each dataframe in self to the block. Block is called by instance_eval,
405
+ # so inside of the block is the context of passed dataframe.
406
+ # @yieldreturn [Array]
407
+ # aggregated values from the columns of passed dataframe.
408
+ # @return [DataFrame]
409
+ # created DataFrame.
410
+ # @example Aggregate by key labels in arguments and values from block.
411
+ # subframes.aggregate(:y, :sum_x) { [y.first, x.sum] }
412
+ #
413
+ # # =>
414
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
415
+ # y sum_x
416
+ # <string> <uint8>
417
+ # 0 A 3
418
+ # 1 B 12
419
+ # 2 C 6
420
+ #
421
+ # @example Aggregate by key labels in an Array and values from block.
422
+ # subframes.aggregate([:y, :sum_x]) { [y.first, x.sum] }
423
+ #
424
+ # # =>
425
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
426
+ # y sum_x
427
+ # <string> <uint8>
428
+ # 0 A 3
429
+ # 1 B 12
430
+ # 2 C 6
431
+ #
432
+ # @overload aggregate
433
+ #
434
+ # Aggregate SubFrames creating DataFrame with pairs of key and aggregated value
435
+ # in Hash from the block.
436
+ #
437
+ # @yieldparam dataframe [DataFrame]
438
+ # passes each dataframe in self to the block. Block is called by instance_eval,
439
+ # so inside of the block is the context of passed dataframe.
440
+ # @yieldreturn [Hash<key => aggregated_value>]
441
+ # pairs of key name and aggregated values from the columns of passed dataframe.
442
+ # Key names may be renamed to new label in the result.
443
+ # @return [DataFrame]
444
+ # created DataFrame.
445
+ # @example Aggregate by key and value pairs from block.
446
+ # subframes.aggregate do
447
+ # { y: y.first, sum_x: x.sum }
448
+ # end
449
+ #
450
+ # # =>
451
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
452
+ # y sum_x
453
+ # <string> <uint8>
454
+ # 0 A 3
455
+ # 1 B 12
456
+ # 2 C 6
457
+ #
458
+ # @overload aggregate
459
+ #
460
+ # Aggregate SubFrames creating DataFrame with an Array of key and aggregated value
461
+ # from the block.
462
+ #
463
+ # @yieldparam dataframe [DataFrame]
464
+ # passes each dataframe in self to the block. Block is called by instance_eval,
465
+ # so inside of the block is the context of passed dataframe.
466
+ # @yieldreturn [Array<key, aggregated_value>]
467
+ # pairs of key name and aggregated values from the columns of passed dataframe.
468
+ # Key names may be renamed to new label in the result.
469
+ # @return [DataFrame]
470
+ # created DataFrame.
471
+ # @example Aggregate by key and value arrays from block.
472
+ # subframes.aggregate do
473
+ # [[:y, y.first], [:sum_x, x.sum]]
474
+ # end
475
+ #
476
+ # # =>
477
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
478
+ # y sum_x
479
+ # <string> <uint8>
480
+ # 0 A 3
481
+ # 1 B 12
482
+ # 2 C 6
483
+ #
484
+ # @overload aggregate(group_keys, aggregations)
485
+ #
486
+ # Aggregate SubFrames for first values of the columns of
487
+ # `group_keys` and the aggregated results of key-function pairs.
488
+ # [Experimental] This API may be changed in the future.
489
+ #
490
+ # @param group_keys [Symbol, String, Array<Symbol, String>]
491
+ # group key name(s) to output values.
492
+ # @param aggregations [Hash<Array<Symbol, String> => Array<:Symbol>>]
493
+ # a Hash of variable (column) name and
494
+ # Vector aggregate function name to apply.
495
+ # @return [DataFrame]
496
+ # an aggregated DataFrame.
497
+ # @example Aggregate with a group key and key function pairs by a Hash.
498
+ # subframes.aggregate(:y, { x: :sum, z: :count })
499
+ #
500
+ # # =>
501
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003b24>
502
+ # y sum_x count_z
503
+ # <string> <uint8> <uint8>
504
+ # 0 A 3 2
505
+ # 1 B 12 2
506
+ # 2 C 6 1
507
+ #
508
+ # @overload aggregate(group_keys, aggregations)
509
+ #
510
+ # Aggregate SubFrames for first values of the columns of
511
+ # `group_keys` and the aggregated results of all combinations
512
+ # of supplied keys and functions.
513
+ # [Experimental] This API may be changed in the future.
514
+ #
515
+ # @param group_keys [Symbol, String, Array<Symbol, String>]
516
+ # group key name(s) to output values.
517
+ # @param aggregations [Array[Array<Symbol, String>, Array<:Symbol>]]
518
+ # an Array of Array of variable (column) names and
519
+ # Array of Vector aggregate function names to apply.
520
+ # @return [DataFrame]
521
+ # an aggregated DataFrame.
522
+ # @example Aggregate with group keys and keys and functions by an Array.
523
+ # sf.aggregate(:y, [[:x, :z], [:count, :sum]])
524
+ #
525
+ # # =>
526
+ # #<RedAmber::DataFrame : 3 x 5 Vectors, 0x000000000000fcbc>
527
+ # y count_x sum_x count_z sum_z
528
+ # <string> <uint8> <uint8> <uint8> <uint8>
529
+ # 0 A 2 3 2 1
530
+ # 1 B 3 12 2 1
531
+ # 2 C 1 6 1 0
532
+ #
533
+ # @since 0.4.0
534
+ #
535
def aggregate(*args, &block)
  columns =
    if block && args.empty?
      # aggregate { {key => value} or [[key, value], ...] }
      # Gather the values of every key across all sub DataFrames.
      collected = Hash.new { |hash, key| hash[key] = [] }
      each do |df|
        df.instance_eval(&block).to_h.each { |key, value| collected[key] << value }
      end
      collected
    elsif block
      # aggregate(keys) { values }
      # `each.map` rather than `map`: #map is redefined to return SubFrames.
      rows = each.map { |df| Array(df.instance_eval(&block)) }
      args.flatten.zip(rows.transpose)
    else
      # These functions may be removed in the future.
      func_key_pairs =
        case args
        in [group_keys1, Hash => key_to_func]
          # aggregate(group_keys, { key => func })
          Array(group_keys1).map { |key| [:first, key] } +
            key_to_func.map { |key, func| [func, key] }
        in [group_keys2, [Array => keys, Array => funcs]]
          # aggregate(group_keys, [keys, funcs])
          Array(group_keys2).map { |key| [:first, key] } + funcs.product(keys)
        else
          raise SubFramesArgumentError, "invalid argument: #{args}"
        end
      # Group keys keep their own name; other columns are labelled "func_key".
      func_key_pairs.map do |func, key|
        label = func == :first ? key : "#{func}_#{key}"
        [label, each.map { |df| df[key].send(func) }]
      end
    end
  DataFrame.new(columns)
end
572
+
573
+ # Returns a SubFrames containing DataFrames returned by the block.
574
+ #
575
+ # @example Map as it is.
576
+ # subframes.map { _1 }
577
+ #
578
+ # # This will create a new SubFrames and a new baseframe,
579
+ # # but each element DataFrame is re-used.
580
+ # # =>
581
+ # #<RedAmber::SubFrames : 0x000000000001e6cc>
582
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000001e6e0>
583
+ # 3 SubFrames: [2, 3, 1] in sizes.
584
+ # ---
585
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x00000000000135c4>
586
+ # x y z
587
+ # <uint8> <string> <boolean>
588
+ # 0 1 A false
589
+ # 1 2 A true
590
+ # ---
591
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000135d8>
592
+ # x y z
593
+ # <uint8> <string> <boolean>
594
+ # 0 3 B false
595
+ # 1 4 B (nil)
596
+ # 2 5 B true
597
+ # ---
598
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x00000000000135ec>
599
+ # x y z
600
+ # <uint8> <string> <boolean>
601
+ # 0 6 C false
602
+ #
603
+ # @example Assign a new column.
604
+ # subframes.map { |df| df.assign(x_plus1: df[:x] + 1) }
605
+ #
606
+ # # =>
607
+ # #<RedAmber::SubFrames : 0x0000000000040948>
608
+ # @baseframe=#<RedAmber::DataFrame : 6 x 4 Vectors, 0x000000000004095c>
609
+ # 3 SubFrames: [2, 3, 1] in sizes.
610
+ # ---
611
+ # #<RedAmber::DataFrame : 2 x 4 Vectors, 0x0000000000040970>
612
+ # x y z x_plus1
613
+ # <uint8> <string> <boolean> <uint8>
614
+ # 0 1 A false 2
615
+ # 1 2 A true 3
616
+ # ---
617
+ # #<RedAmber::DataFrame : 3 x 4 Vectors, 0x0000000000040984>
618
+ # x y z x_plus1
619
+ # <uint8> <string> <boolean> <uint8>
620
+ # 0 3 B false 4
621
+ # 1 4 B (nil) 5
622
+ # 2 5 B true 6
623
+ # ---
624
+ # #<RedAmber::DataFrame : 1 x 4 Vectors, 0x0000000000040998>
625
+ # x y z x_plus1
626
+ # <uint8> <string> <boolean> <uint8>
627
+ # 0 6 C false 7
628
+ #
629
+ # @since 0.4.0
630
+ #
631
+ define_subframable_method :map
632
+ alias_method :collect, :map
633
+
634
+ # Update existing column(s) or create new columns(s) for each DataFrames in self.
635
+ #
636
+ # Column values are updated by an overloaded common operation.
637
+ #
638
+ # @overload assign(key)
639
+ # Assign a column by argument and block.
640
+ #
641
+ # @param key [Symbol, String]
642
+ # a key of column to assign.
643
+ # @yieldparam dataframe [DataFrame]
644
+ # gives overloaded dataframe in self to the block.
645
+ # @yieldreturn [Vector, Array, Arrow::Array]
646
+ # an updated column value which are overloaded.
647
+ # @return [SubFrames]
648
+ # a new SubFrames object with updated DataFrames.
649
+ # @example
650
+ # subframes.assign(:x_plus1) { x + 1 }
651
+ #
652
+ # # =>
653
+ # #<RedAmber::SubFrames : 0x000000000000c3a0>
654
+ # @baseframe=#<RedAmber::DataFrame : 6 x 4 Vectors, 0x000000000000c3b4>
655
+ # 3 SubFrames: [2, 3, 1] in sizes.
656
+ # ---
657
+ # #<RedAmber::DataFrame : 2 x 4 Vectors, 0x000000000000c3c8>
658
+ # x y z x_plus1
659
+ # <uint8> <string> <boolean> <uint8>
660
+ # 0 1 A false 2
661
+ # 1 2 A true 3
662
+ # ---
663
+ # #<RedAmber::DataFrame : 3 x 4 Vectors, 0x000000000000c3dc>
664
+ # x y z x_plus1
665
+ # <uint8> <string> <boolean> <uint8>
666
+ # 0 3 B false 4
667
+ # 1 4 B (nil) 5
668
+ # 2 5 B true 6
669
+ # ---
670
+ # #<RedAmber::DataFrame : 1 x 4 Vectors, 0x000000000000c3f0>
671
+ # x y z x_plus1
672
+ # <uint8> <string> <boolean> <uint8>
673
+ # 0 6 C false 7
674
+ #
675
+ # @overload assign(keys)
676
+ # Assign columns by arguments and block.
677
+ #
678
+ # @param keys [Array<Symbol, String>]
679
+ # keys of columns to assign.
680
+ # @yieldparam dataframe [DataFrame]
681
+ # gives overloaded dataframes in self to the block.
682
+ # @yieldreturn [Array<Vector, Array, Arrow::Array>]
683
+ # an updated column values which are overloaded.
684
+ # @return [SubFrames]
685
+ # a new SubFrames object with updated DataFrames.
686
+ # @example
687
+ # subframes.assign(:sum_x, :frac_x) do
688
+ # group_sum = x.sum
689
+ # [[group_sum] * size, x / group_sum.to_f]
690
+ # end
691
+ #
692
+ # # =>
693
+ # #<RedAmber::SubFrames : 0x000000000000fce4>
694
+ # @baseframe=#<RedAmber::DataFrame : 6 x 5 Vectors, 0x000000000000fcf8>
695
+ # 3 SubFrames: [2, 3, 1] in sizes.
696
+ # ---
697
+ # #<RedAmber::DataFrame : 2 x 5 Vectors, 0x000000000000fd0c>
698
+ # x y z sum_x frac_x
699
+ # <uint8> <string> <boolean> <uint8> <double>
700
+ # 0 1 A false 3 0.33
701
+ # 1 2 A true 3 0.67
702
+ # ---
703
+ # #<RedAmber::DataFrame : 3 x 5 Vectors, 0x000000000000fd20>
704
+ # x y z sum_x frac_x
705
+ # <uint8> <string> <boolean> <uint8> <double>
706
+ # 0 3 B false 12 0.25
707
+ # 1 4 B (nil) 12 0.33
708
+ # 2 5 B true 12 0.42
709
+ # ---
710
+ # #<RedAmber::DataFrame : 1 x 5 Vectors, 0x000000000000fd34>
711
+ # x y z sum_x frac_x
712
+ # <uint8> <string> <boolean> <uint8> <double>
713
+ # 0 6 C false 6 1.0
714
+ #
715
+ # @overload assign
716
+ # Assign column(s) by block.
717
+ #
718
+ # @yieldparam dataframe [DataFrame]
719
+ # gives overloaded dataframes in self to the block.
720
+ # @yieldreturn [Hash, Array]
721
+ # pairs of keys and column values which are overloaded.
722
+ # @return [SubFrames]
723
+ # a new SubFrames object with updated DataFrames.
724
+ # @example Compute 'x * z' when (true, not_true) = (1, 0) in z
725
+ # subframes.assign do
726
+ # { 'x*z': x * z.if_else(1, 0) }
727
+ # end
728
+ #
729
+ # # =>
730
+ # #<RedAmber::SubFrames : 0x000000000000fd98>
731
+ # @baseframe=#<RedAmber::DataFrame : 6 x 4 Vectors, 0x000000000000fdac>
732
+ # 3 SubFrames: [2, 3, 1] in sizes.
733
+ # ---
734
+ # #<RedAmber::DataFrame : 2 x 4 Vectors, 0x000000000000fdc0>
735
+ # x y z x*z
736
+ # <uint8> <string> <boolean> <uint8>
737
+ # 0 1 A false 0
738
+ # 1 2 A true 2
739
+ # ---
740
+ # #<RedAmber::DataFrame : 3 x 4 Vectors, 0x000000000000fdd4>
741
+ # x y z x*z
742
+ # <uint8> <string> <boolean> <uint8>
743
+ # 0 3 B false 0
744
+ # 1 4 B (nil) (nil)
745
+ # 2 5 B true 5
746
+ # ---
747
+ # #<RedAmber::DataFrame : 1 x 4 Vectors, 0x000000000000fde8>
748
+ # x y z x*z
749
+ # <uint8> <string> <boolean> <uint8>
750
+ # 0 6 C false 0
751
+ #
752
+ # @since 0.4.0
753
+ #
754
def assign(...)
  # Forward every argument (and the block) to DataFrame#assign on each
  # element; #map collects the results.
  map do |df|
    df.assign(...)
  end
end
757
+
758
+ # Returns a SubFrames containing DataFrames selected by the block.
759
+ #
760
+ # With a block given, calls the block with successive DataFrames;
761
+ # returns a SubFrames of those DataFrames for
762
+ # which the block returns a truthy value.
763
+ #
764
+ # @example Select all.
765
+ # subframes.select { true }
766
+ #
767
+ # # =>
768
+ # #<RedAmber::SubFrames : 0x0000000000003a84>
769
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x0000000000003a98>
770
+ # 3 SubFrames: [2, 3, 1] in sizes.
771
+ # ---
772
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003a0c>
773
+ # x y z
774
+ # <uint8> <string> <boolean>
775
+ # 0 1 A false
776
+ # 1 2 A true
777
+ # ---
778
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003a20>
779
+ # x y z
780
+ # <uint8> <string> <boolean>
781
+ # 0 3 B false
782
+ # 1 4 B (nil)
783
+ # 2 5 B true
784
+ # ---
785
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x0000000000003a34>
786
+ # x y z
787
+ # <uint8> <string> <boolean>
788
+ # 0 6 C false
789
+ #
790
+ # @example Select nothing.
791
+ # subframes.select { false }
792
+ #
793
+ # # =>
794
+ # #<RedAmber::SubFrames : 0x00000000000238c0>
795
+ # @baseframe=#<RedAmber::DataFrame : (empty), 0x00000000000238d4>
796
+ # 0 SubFrame: [] in size.
797
+ # ---
798
+ #
799
+ # @example Select if Vector `:z` has any true.
800
+ # subframes.select { |df| df[:z].any? }
801
+ #
802
+ # # =>
803
+ # #<RedAmber::SubFrames : 0x000000000000fba4>
804
+ # @baseframe=#<RedAmber::DataFrame : 5 x 3 Vectors, 0x000000000000fbb8>
805
+ # 2 SubFrames: [2, 3] in sizes.
806
+ # ---
807
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003a0c>
808
+ # x y z
809
+ # <uint8> <string> <boolean>
810
+ # 0 1 A false
811
+ # 1 2 A true
812
+ # ---
813
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003a20>
814
+ # x y z
815
+ # <uint8> <string> <boolean>
816
+ # 0 3 B false
817
+ # 1 4 B (nil)
818
+ # 2 5 B true
819
+ #
820
+ # @since 0.4.0
821
+ #
822
+ define_subframable_method :select
823
+ alias_method :filter, :select
824
+ alias_method :find_all, :select
825
+
826
+ # Returns a SubFrames containing DataFrames rejected by the block.
827
+ #
828
+ # With a block given, calls the block with successive DataFrames;
829
+ # returns a SubFrames of those DataFrames for
830
+ # which the block returns nil or false.
831
+ # @example Reject all.
832
+ # subframes.reject { true }
833
+ #
834
+ # # =>
835
+ # #<RedAmber::SubFrames : 0x00000000000238c0>
836
+ # @baseframe=#<RedAmber::DataFrame : (empty), 0x00000000000238d4>
837
+ # 0 SubFrame: [] in size.
838
+ # ---
839
+ #
840
+ # @example Reject nothing.
841
+ # subframes.reject { false }
842
+ #
843
+ # # =>
844
+ # #<RedAmber::SubFrames : 0x0000000000003a84>
845
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x0000000000003a98>
846
+ # 3 SubFrames: [2, 3, 1] in sizes.
847
+ # ---
848
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003a0c>
849
+ # x y z
850
+ # <uint8> <string> <boolean>
851
+ # 0 1 A false
852
+ # 1 2 A true
853
+ # ---
854
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003a20>
855
+ # x y z
856
+ # <uint8> <string> <boolean>
857
+ # 0 3 B false
858
+ # 1 4 B (nil)
859
+ # 2 5 B true
860
+ # ---
861
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x0000000000003a34>
862
+ # x y z
863
+ # <uint8> <string> <boolean>
864
+ # 0 6 C false
865
+ #
866
+ # @example Reject if Vector `:z` has any true.
867
+ # subframes.reject { |df| df[:z].any? }
868
+ #
869
+ # # =>
870
+ # #<RedAmber::SubFrames : 0x0000000000038d74>
871
+ # @baseframe=#<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000001ad10>
872
+ # 1 SubFrame: [1] in size.
873
+ # ---
874
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000001ad10>
875
+ # x y z
876
+ # <uint8> <string> <boolean>
877
+ # 0 6 C false
878
+ #
879
+ # @since 0.4.0
880
+ #
881
+ define_subframable_method :reject
882
+
883
+ # Returns a SubFrames containing truthy DataFrames returned by the block.
884
+ #
885
+ # With a block given, calls the block with successive DataFrames;
886
+ # returns a SubFrames of the DataFrames returned by the block,
887
+ # omitting results which are nil or false.
888
+ # @example Filter for size is larger than 1 and append number to column 'y'.
889
+ # subframes.filter_map do |df|
890
+ # if df.size > 1
891
+ # df.assign(:y) do
892
+ # y.merge(indices('1'), sep: '')
893
+ # end
894
+ # end
895
+ # end
896
+ #
897
+ # # =>
898
+ # #<RedAmber::SubFrames : 0x000000000001da88>
899
+ # @baseframe=#<RedAmber::DataFrame : 5 x 3 Vectors, 0x000000000001da9c>
900
+ # 2 SubFrames: [2, 3] in sizes.
901
+ # ---
902
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000001dab0>
903
+ # x y z
904
+ # <uint8> <string> <boolean>
905
+ # 0 1 A1 false
906
+ # 1 2 A2 true
907
+ # ---
908
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000001dac4>
909
+ # x y z
910
+ # <uint8> <string> <boolean>
911
+ # 0 3 B1 false
912
+ # 1 4 B2 (nil)
913
+ # 2 5 B3 true
914
+ #
915
+ # @since 0.4.0
916
+ #
917
+ define_subframable_method :filter_map
918
+
919
+ # Number of subsets.
920
+ #
921
+ # @return [Integer]
922
+ # number of subsets in self.
923
+ # @since 0.4.0
924
+ #
925
def size
  # Memoized: the Enumerator is asked for its size only until a
  # truthy value has been stored.
  return @size if @size

  @size = @enum.size
end
928
+
929
+ # Size list of subsets.
930
+ #
931
+ # @return [Array<Integer>]
932
+ # sizes of sub DataFrames.
933
+ # @since 0.4.0
934
+ #
935
def sizes
  # Memoized list holding the size of every element of @enum.
  return @sizes if @sizes

  @sizes = @enum.map { |subframe| subframe.size }
end
938
+
939
+ # Indices at the top of each sub DataFrames.
940
+ #
941
+ # @return [Array<Integer>]
942
+ # indices of offset of each sub DataFrames.
943
+ # @example When `sizes` is [2, 3, 1].
944
+ # subframes.offset_indices # => [0, 2, 5]
945
+ # @since 0.4.0
946
+ #
947
+ def offset_indices
948
+ sum = 0 # running total of the sizes visited so far
949
+ sizes.map do |size| # the block parameter `size` shadows the #size method inside this block
950
+ sum += size
951
+ sum - size # total before this element was added, i.e. where this element starts
952
+ end
953
+ end
954
+
955
+ # Test if self has no subsets.
956
+ #
957
+ # @return [true, false]
958
+ # true if self contains no subsets.
959
+ # @since 0.4.0
960
+ #
961
+ def empty?
962
+ size.zero? # relies on #size, the number of subsets
963
+ end
964
+
965
+ # Test if self has only one subset and it is comprehensive.
966
+ #
967
+ # @return [true, false]
968
+ # true if only member of self is equal to universal DataFrame.
969
+ # @since 0.4.0
970
+ #
971
+ def universal?
972
+ size == 1 && @enum.first == baseframe # exactly one subset, and it compares == to baseframe (reader defined outside this view)
973
+ end
974
+
975
+ # Return string representation of self.
976
+ #
977
+ # @param limit [Integer]
978
+ # maximum number of DataFrames to show.
979
+ # @return [String]
980
+ # return string representation of each sub DataFrame.
981
+ # @example
982
+ # df
983
+ #
984
+ # # =>
985
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000caa8>
986
+ # x y z
987
+ # <uint8> <string> <boolean>
988
+ # 0 1 A false
989
+ # 1 2 A true
990
+ # 2 3 B false
991
+ # 3 4 B (nil)
992
+ # 4 5 B true
993
+ # 5 6 C false
994
+ #
995
+ # puts SubFrames.new(df, [[0, 1], [2, 3, 4], [5]])
996
+ #
997
+ # # =>
998
+ # x y z
999
+ # <uint8> <string> <boolean>
1000
+ # 0 1 A false
1001
+ # 1 2 A true
1002
+ # ---
1003
+ # x y z
1004
+ # <uint8> <string> <boolean>
1005
+ # 0 3 B false
1006
+ # 1 4 B (nil)
1007
+ # 2 5 B true
1008
+ # ---
1009
+ # x y z
1010
+ # <uint8> <string> <boolean>
1011
+ # 0 6 C false
1012
+ #
1013
+ # @since 0.4.0
1014
+ #
1015
+ def to_s(limit: 16)
1016
+ _to_s(limit: limit) # delegates to the private formatter; with_id keeps its default (false), so no shape/id header lines
1017
+ end
1018
+
1019
+ # Return summary information of self.
1020
+ #
1021
+ # @param limit [Integer]
1022
+ # maximum number of DataFrames to show.
1023
+ # @return [String]
1024
+ # return class name, object id, universal DataFrame,
1025
+ # size and subset sizes in a String.
1026
+ # @example
1027
+ # df
1028
+ #
1029
+ # # =>
1030
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000caa8>
1031
+ # x y z
1032
+ # <uint8> <string> <boolean>
1033
+ # 0 1 A false
1034
+ # 1 2 A true
1035
+ # 2 3 B false
1036
+ # 3 4 B (nil)
1037
+ # 4 5 B true
1038
+ # 5 6 C false
1039
+ #
1040
+ # SubFrames.new(df, [[0, 1], [2, 3, 4], [5]])
1041
+ #
1042
+ # # =>
1043
+ # #<RedAmber::SubFrames : 0x000000000000c1fc>
1044
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000c170>
1045
+ # 3 SubFrames: [2, 3, 1] in sizes.
1046
+ # ---
1047
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000002a120>
1048
+ # x y z
1049
+ # <uint8> <string> <boolean>
1050
+ # 0 1 A false
1051
+ # 1 2 A true
1052
+ # ---
1053
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000002a134>
1054
+ # x y z
1055
+ # <uint8> <string> <boolean>
1056
+ # 0 3 B false
1057
+ # 1 4 B (nil)
1058
+ # 2 5 B true
1059
+ # ---
1060
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000002a148>
1061
+ # x y z
1062
+ # <uint8> <string> <boolean>
1063
+ # 0 6 C false
1064
+ #
1065
+ # @since 0.4.0
1066
+ #
1067
+ def inspect(limit: 16)
1068
+ shape =
1069
+ if @baseframe.is_a?(Enumerator) # @baseframe may hold an Enumerator instead of a DataFrame
1070
+ "Enumerator::Lazy:size=#{@baseframe.size}" # NOTE(review): label always says Enumerator::Lazy, even for a plain Enumerator — confirm intended
1071
+ else
1072
+ baseframe.shape_str(with_id: true) # shape_str is defined outside this view
1073
+ end
1074
+ sizes_truncated = (size > limit ? sizes.take(limit) << '...' : sizes).join(', ') # take returns a new Array, so '...' is not appended to the memoized sizes
1075
+ "#<#{self.class} : #{format('0x%016x', object_id)}>\n" \
1076
+ "@baseframe=#<#{shape}>\n" \
1077
+ "#{size} SubFrame#{pl(size)}: " \
1078
+ "[#{sizes_truncated}] in size#{pl(size)}.\n" \
1079
+ "---\n#{_to_s(limit: limit, with_id: true)}" # pl is defined outside this view; presumably returns a plural suffix — confirm
1080
+ end
1081
+
1082
+ private
1083
+
1084
+ def frames
1085
+ @frames ||= @enum.to_a # memoized Array of every element yielded by @enum
1086
+ end
1087
+
1088
+ def _to_s(limit: 16, with_id: false)
1089
+ a = take(limit).map do |df| # take is called on self (presumably Enumerable#take — confirm); at most `limit` frames are rendered
1090
+ if with_id # prepend a "#<shape, id>" header line to each frame
1091
+ "#<#{df.shape_str(with_id: with_id)}>\n" \
1092
+ "#{df.to_s(head: 2, tail: 2)}"
1093
+ else
1094
+ df.to_s(head: 2, tail: 2) # head:/tail: are passed to the frame's #to_s; presumably they cap the rows shown — confirm
1095
+ end
1096
+ end
1097
+ a << "+ #{size - limit} more DataFrame#{pl(size - limit)}.\n" if size > limit # trailing note counting the frames not rendered
1098
+ a.join("---\n") # frames (and the trailing note) are separated by a "---" line
1099
+ end
1100
+ end
1101
+ end