red_amber 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +56 -22
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +29 -30
  8. data/benchmark/basic.yml +7 -7
  9. data/benchmark/combine.yml +3 -3
  10. data/benchmark/dataframe.yml +15 -9
  11. data/benchmark/group.yml +6 -6
  12. data/benchmark/reshape.yml +6 -6
  13. data/benchmark/vector.yml +6 -3
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +454 -85
  20. data/lib/red_amber/data_frame_combinable.rb +609 -115
  21. data/lib/red_amber/data_frame_displayable.rb +313 -34
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +623 -70
  26. data/lib/red_amber/data_frame_variable_operation.rb +452 -35
  27. data/lib/red_amber/group.rb +186 -22
  28. data/lib/red_amber/helper.rb +74 -14
  29. data/lib/red_amber/refinements.rb +26 -6
  30. data/lib/red_amber/subframes.rb +1101 -0
  31. data/lib/red_amber/vector.rb +362 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +506 -0
  34. data/lib/red_amber/vector_selectable.rb +265 -23
  35. data/lib/red_amber/vector_unary_element_wise.rb +529 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -0,0 +1,1101 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # class SubFrames treats a set of subsets of a DataFrame
5
+ # [Experimental feature] Class SubFrames may be removed or be changed in the future.
6
+ class SubFrames
7
+ include Enumerable # may change to use Forwardable.
8
+ include Helper
9
+
10
+ using RefineArray
11
+ using RefineArrayLike
12
+
13
+ class << self
14
+ # Create SubFrames from a Group.
15
+ #
16
+ # [Experimental feature] this method may be removed or be changed in the future.
17
+ # @param group [Group]
18
+ # a Group to be used to create SubFrames.
19
+ # @return [SubFrames]
20
+ # a created SubFrames.
21
+ # @example
22
+ # dataframe
23
+ #
24
+ # # =>
25
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000fba4>
26
+ # x y z
27
+ # <uint8> <string> <boolean>
28
+ # 0 1 A false
29
+ # 1 2 A true
30
+ # 2 3 B false
31
+ # 3 4 B (nil)
32
+ # 4 5 B true
33
+ # 5 6 C false
34
+ #
35
+ # group = Group.new(dataframe, [:y])
36
+ # sf = SubFrames.by_group(group)
37
+ #
38
+ # # =>
39
+ # #<RedAmber::SubFrames : 0x000000000000fbb8>
40
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000fb7c>
41
+ # 3 SubFrames: [2, 3, 1] in sizes.
42
+ # ---
43
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000fbcc>
44
+ # x y z
45
+ # <uint8> <string> <boolean>
46
+ # 0 1 A false
47
+ # 1 2 A true
48
+ # ---
49
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000fbe0>
50
+ # x y z
51
+ # <uint8> <string> <boolean>
52
+ # 0 3 B false
53
+ # 1 4 B (nil)
54
+ # 2 5 B true
55
+ # ---
56
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000fbf4>
57
+ # x y z
58
+ # <uint8> <string> <boolean>
59
+ # 0 6 C false
60
+ #
61
+ # @since 0.4.0
62
+ #
63
+ def by_group(group)
64
+ SubFrames.new(group.dataframe, group.filters) # goes through #initialize with the group's dataframe and its filters
65
+ end
66
+
67
+ # Create a new SubFrames object from a DataFrame and an array of indices.
68
+ #
69
+ # @api private
70
+ # @note this method doesn't check arguments.
71
+ # @param dataframe [DataFrame]
72
+ # a source dataframe.
73
+ # @param subset_indices [Array, Array<Vector>]
74
+ # an Array of numeric indices to create subsets of DataFrame.
75
+ # @return [SubFrames]
76
+ # a new SubFrames object.
77
+ # @since 0.4.0
78
+ #
79
+ def by_indices(dataframe, subset_indices)
80
+ instance = allocate # bypasses #initialize, so none of its argument checks run
81
+ instance.instance_variable_set(:@baseframe, dataframe) # the source dataframe is stored as-is
82
+ enum =
83
+ Enumerator.new(subset_indices.size) do |y| # size is given up front, so Enumerator#size needs no iteration
84
+ subset_indices.each do |i|
85
+ y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.take(i)) # one sub DataFrame per index set
86
+ end
87
+ end
88
+ instance.instance_variable_set(:@enum, enum) # sub DataFrames are built only when the enumerator is iterated
89
+ instance
90
+ end
91
+
92
+ # Create a new SubFrames object from a DataFrame and an array of filters.
93
+ #
94
+ # @api private
95
+ # @note this method doesn't check arguments.
96
+ # @param dataframe [DataFrame]
97
+ # a source dataframe.
98
+ # @param subset_filters [Array, Array<Vector>]
99
+ # an Array of booleans to specify subsets of DataFrame.
100
+ # Each filter must have the same length as dataframe.
101
+ # @return [SubFrames]
102
+ # a new SubFrames object.
103
+ # @since 0.4.0
104
+ #
105
+ def by_filters(dataframe, subset_filters)
106
+ instance = allocate # bypasses #initialize, so none of its argument checks run
107
+ instance.instance_variable_set(:@baseframe, dataframe) # the source dataframe is stored as-is
108
+ enum =
109
+ Enumerator.new(subset_filters.size) do |y| # size is given up front, so Enumerator#size needs no iteration
110
+ subset_filters.each do |i|
111
+ y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.filter(i)) # one sub DataFrame per boolean filter
112
+ end
113
+ end
114
+ instance.instance_variable_set(:@enum, enum) # sub DataFrames are built only when the enumerator is iterated
115
+ instance
116
+ end
117
+
118
+ # Create a new SubFrames from an Array of DataFrames.
119
+ #
120
+ # @api private
121
+ # @note dataframes must have same schema.
122
+ # @param dataframes [Array<DataFrame>]
123
+ # an array of DataFrames which have same schema.
124
+ # @return [SubFrames]
125
+ # a new SubFrames object.
126
+ # @since 0.4.0
127
+ #
128
+ def by_dataframes(dataframes)
129
+ instance = allocate
130
+ case Array(dataframes)
131
+ when [] || [nil]
132
+ instance.instance_variable_set(:@baseframe, DataFrame.new)
133
+ instance.instance_variable_set(:@frames, [])
134
+ enum = [].each
135
+ else
136
+ enum =
137
+ Enumerator.new(dataframes.size) do |y|
138
+ dataframes.each do |i|
139
+ y.yield i
140
+ end
141
+ end
142
+ instance.instance_variable_set(:@baseframe, enum.lazy)
143
+ end
144
+ instance.instance_variable_set(:@enum, enum)
145
+ instance
146
+ end
147
+
148
+ private
149
+
150
+ # This method upgrades an iterating method from Enumerable to return SubFrames.
151
+
152
+ # @!macro [attach] define_subframable_method
153
+ #
154
+ # [Returns SubFrames] Use `#each.$1` if you want to get DataFrames by Array.
155
+ # Returns an Enumerator with no block given.
156
+ # @yieldparam dataframe [DataFrame]
157
+ # gives each element.
158
+ # @yieldreturn [Array<DataFrame>]
159
+ # the block should return DataFrames with same schema.
160
+ # @return [SubFrames]
161
+ # a new SubFrames.
162
+ #
163
+ # @since 0.4.0
164
+ #
165
+ def define_subframable_method(method)
166
+ define_method(method) do |&block| # defines an instance method named by `method`
167
+ return enum_for(:each) { size } unless block # rubocop:disable Lint/ToEnumArguments
168
+ # NOTE: without a block the Enumerator above iterates #each, whatever `method` is.
169
+ SubFrames.by_dataframes(super(&block)) # run the inherited method, then wrap its result in a SubFrames
170
+ end
171
+ end
172
+ end
173
+
174
+ # Create a new SubFrames object from a DataFrame and an array of indices or filters.
175
+ #
176
+ # @overload initialize(dataframe, subset_specifier)
177
+ # Create a new SubFrames object.
178
+ #
179
+ # @param dataframe [DataFrame]
180
+ # a source dataframe.
181
+ # @param subset_specifier [Array<Vector>, Array<array-like>]
182
+ # an Array of numeric indices or boolean filters
183
+ # to create subsets of DataFrame.
184
+ # @return [SubFrames]
185
+ # new SubFrames.
186
+ # @example
187
+ # dataframe
188
+ #
189
+ # # =>
190
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000039e4>
191
+ # x y z
192
+ # <uint8> <string> <boolean>
193
+ # 0 1 A false
194
+ # 1 2 A true
195
+ # 2 3 B false
196
+ # 3 4 B (nil)
197
+ # 4 5 B true
198
+ # 5 6 C false
199
+ #
200
+ # # --- This object is used as common source in this class ---
201
+ # subframes = SubFrames.new(dataframe, [[0 ,1], [2, 3, 4], [5]])
202
+ #
203
+ # # =>
204
+ # #<RedAmber::SubFrames : 0x000000000000cf6c>
205
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000cf80>
206
+ # 3 SubFrames: [2, 3, 1] in sizes.
207
+ # ---
208
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000cf94>
209
+ # x y z
210
+ # <uint8> <string> <boolean>
211
+ # 0 1 A false
212
+ # 1 2 A true
213
+ # ---
214
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000cfa8>
215
+ # x y z
216
+ # <uint8> <string> <boolean>
217
+ # 0 3 B false
218
+ # 1 4 B (nil)
219
+ # 2 5 B true
220
+ # ---
221
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000cfbc>
222
+ # x y z
223
+ # <uint8> <string> <boolean>
224
+ # 0 6 C false
225
+ #
226
+ # @overload initialize(dataframe)
227
+ # Create a new SubFrames object by block.
228
+ #
229
+ # @param dataframe [DataFrame]
230
+ # a source dataframe.
231
+ # @yieldparam dataframe [DataFrame]
232
+ # the block is called with `dataframe`.
233
+ # @yieldreturn [Array<numeric_array_like>, Array<boolean_array_like>]
234
+ # an Array of index or boolean array-likes to create subsets of DataFrame.
235
+ # All array-likes must respond to #numeric? or #boolean?.
236
+ # @return [SubFrames]
237
+ # a new SubFrames object.
238
+ # @example
239
+ # SubFrames.new(dataframe) do |df|
240
+ # booleans = df[:z]
241
+ # [booleans, !booleans]
242
+ # end
243
+ #
244
+ # # =>
245
+ # #<RedAmber::SubFrames : 0x0000000000003aac>
246
+ # @baseframe=#<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000003ac0>
247
+ # 2 SubFrames: [2, 3] in sizes.
248
+ # ---
249
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003ad4>
250
+ # x y z
251
+ # <uint8> <string> <boolean>
252
+ # 0 2 A true
253
+ # 1 5 B true
254
+ # ---
255
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003ae8>
256
+ # x y z
257
+ # <uint8> <string> <boolean>
258
+ # 0 1 A false
259
+ # 1 3 B false
260
+ # 2 6 C false
261
+ #
262
+ # @since 0.4.0
263
+ #
264
+ def initialize(dataframe, subset_specifier = nil, &block)
265
+ unless dataframe.is_a?(DataFrame)
266
+ raise SubFramesArgumentError, "not a DataFrame: #{dataframe}"
267
+ end
268
+
269
+ if block
270
+ unless subset_specifier.nil?
271
+ raise SubFramesArgumentError, 'Must not specify both arguments and block.'
272
+ end
273
+
274
+ subset_specifier = yield(dataframe)
275
+ end
276
+
277
+ if dataframe.empty? || subset_specifier.nil? || subset_specifier.empty?
278
+ @baseframe = DataFrame.new
279
+ @frames = []
280
+ @enum = @frames.each
281
+ else
282
+ @baseframe = nil
283
+ @enum =
284
+ Enumerator.new(subset_specifier.size) do |yielder|
285
+ subset_specifier.map do |i|
286
+ df =
287
+ if i.numeric?
288
+ dataframe.take(i)
289
+ elsif i.boolean?
290
+ dataframe.filter(i)
291
+ else
292
+ raise SubFramesArgumentError, "illegal type: #{i}"
293
+ end
294
+ yielder.yield DataFrame.new_dataframe_with_schema(dataframe, df)
295
+ end
296
+ end
297
+ end
298
+ end
299
+
300
+ # Return concatenated SubFrames as a DataFrame.
301
+ #
302
+ # Once evaluated, memorize it as @baseframe.
303
+ # @return [DataFrame]
304
+ # a concatenated DataFrame.
305
+ # @since 0.4.0
306
+ #
307
+ def baseframe
308
+ if @baseframe.nil? || @baseframe.is_a?(Enumerator) # nil or an Enumerator placeholder means "not concatenated yet"
309
+ @baseframe = reduce(&:concatenate) # concatenate every sub DataFrame and cache the result
310
+ else
311
+ @baseframe
312
+ end
313
+ end
314
+ alias_method :concatenate, :baseframe # SubFrames#concatenate and #concat are the same method as #baseframe
315
+ alias_method :concat, :baseframe
316
+
317
+ # Iterates over sub DataFrames or returns an Enumerator.
318
+ #
319
+ # This method will memorize sub DataFrames and always returns the same object.
320
+ # The Class SubFrames is including Enumerable module.
321
+ # So many methods in Enumerable are available.
322
+ #
323
+ # @overload each
324
+ # Returns a new Enumerator if no block given.
325
+ #
326
+ # @return [Enumerator]
327
+ # Enumerator of each elements.
328
+ #
329
+ # @overload each
330
+ # When a block given, passes each sub DataFrames to the block.
331
+ #
332
+ # @yieldparam subframe [DataFrame]
333
+ # passes sub DataFrame by a block parameter.
334
+ # @yieldreturn [Object]
335
+ # evaluated result value from the block.
336
+ # @return [nil]
337
+ # returns nil.
338
+ #
339
+ # @example Returns Enumerator
340
+ # subframes.each
341
+ #
342
+ # # =>
343
+ # #<Enumerator: ...>
344
+ #
345
+ # @example `to_a` from Enumerable.
346
+ # subframes.to_a
347
+ #
348
+ # # =>
349
+ # [#<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000002a120>
350
+ # x y z
351
+ # <uint8> <string> <boolean>
352
+ # 0 1 A false
353
+ # 1 2 A true
354
+ # ,
355
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000002a134>
356
+ # x y z
357
+ # <uint8> <string> <boolean>
358
+ # 0 3 B false
359
+ # 1 4 B (nil)
360
+ # 2 5 B true
361
+ # ,
362
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000002a148>
363
+ # x y z
364
+ # <uint8> <string> <boolean>
365
+ # 0 6 C false
366
+ # ]
367
+ #
368
+ # @example Concatenate SubFrames. This example is used in #concatenate.
369
+ # subframes.reduce(&:concatenate)
370
+ #
371
+ # # =>
372
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000004883c>
373
+ # x y z
374
+ # <uint8> <string> <boolean>
375
+ # 0 1 A false
376
+ # 1 2 A true
377
+ # 2 3 B false
378
+ # 3 4 B (nil)
379
+ # 4 5 B true
380
+ # 5 6 C false
381
+ #
382
+ # @since 0.4.0
383
+ #
384
+ def each(&block)
385
+ return enum_for(__method__) { size } unless block # no block: Enumerator whose size is the number of subsets
386
+
387
+ frames.each(&block) # `frames` is defined outside this view
388
+ nil # with a block the return value is nil, not self
389
+ end
390
+
391
+ # Aggregate SubFrames to create a DataFrame.
392
+ #
393
+ # This method creates a DataFrame with one row corresponding to one sub dataframe.
394
+ # @note This method does not check if aggregation function is used.
395
+ #
396
+ # @overload aggregate(keys)
397
+ #
398
+ # Aggregate SubFrames creating DataFrame with label `keys` and
399
+ # its column values by block.
400
+ #
401
+ # @param keys [Symbol, Array<Symbol>]
402
+ # a key or keys of result. Key names may be renamed to new label.
403
+ # @yieldparam dataframe [DataFrame]
404
+ # passes each dataframe in self to the block. Block is called by instance_eval,
405
+ # so inside of the block is the context of passed dataframe.
406
+ # @yieldreturn [Array]
407
+ # aggregated values from the columns of passed dataframe.
408
+ # @return [DataFrame]
409
+ # created DataFrame.
410
+ # @example Aggregate by key labels in arguments and values from block.
411
+ # subframes.aggregate(:y, :sum_x) { [y.first, x.sum] }
412
+ #
413
+ # # =>
414
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
415
+ # y sum_x
416
+ # <string> <uint8>
417
+ # 0 A 3
418
+ # 1 B 12
419
+ # 2 C 6
420
+ #
421
+ # @example Aggregate by key labels in an Array and values from block.
422
+ # subframes.aggregate([:y, :sum_x]) { [y.first, x.sum] }
423
+ #
424
+ # # =>
425
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
426
+ # y sum_x
427
+ # <string> <uint8>
428
+ # 0 A 3
429
+ # 1 B 12
430
+ # 2 C 6
431
+ #
432
+ # @overload aggregate
433
+ #
434
+ # Aggregate SubFrames creating DataFrame with pairs of key and aggregated value
435
+ # in Hash from the block.
436
+ #
437
+ # @yieldparam dataframe [DataFrame]
438
+ # passes each dataframe in self to the block. Block is called by instance_eval,
439
+ # so inside of the block is the context of passed dataframe.
440
+ # @yieldreturn [Hash<key => aggregated_value>]
441
+ # pairs of key name and aggregated values from the columns of passed dataframe.
442
+ # Key names may be renamed to new label in the result.
443
+ # @return [DataFrame]
444
+ # created DataFrame.
445
+ # @example Aggregate by key and value pairs from block.
446
+ # subframes.aggregate do
447
+ # { y: y.first, sum_x: x.sum }
448
+ # end
449
+ #
450
+ # # =>
451
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
452
+ # y sum_x
453
+ # <string> <uint8>
454
+ # 0 A 3
455
+ # 1 B 12
456
+ # 2 C 6
457
+ #
458
+ # @overload aggregate
459
+ #
460
+ # Aggregate SubFrames creating DataFrame with an Array of key and aggregated value
461
+ # from the block.
462
+ #
463
+ # @yieldparam dataframe [DataFrame]
464
+ # passes each dataframe in self to the block. Block is called by instance_eval,
465
+ # so inside of the block is the context of passed dataframe.
466
+ # @yieldreturn [Array<key, aggregated_value>]
467
+ # pairs of key name and aggregated values from the columns of passed dataframe.
468
+ # Key names may be renamed to new label in the result.
469
+ # @return [DataFrame]
470
+ # created DataFrame.
471
+ # @example Aggregate by key and value arrays from block.
472
+ # subframes.aggregate do
473
+ # [[:y, y.first], [:sum_x, x.sum]]
474
+ # end
475
+ #
476
+ # # =>
477
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
478
+ # y sum_x
479
+ # <string> <uint8>
480
+ # 0 A 3
481
+ # 1 B 12
482
+ # 2 C 6
483
+ #
484
+ # @overload aggregate(group_keys, aggregations)
485
+ #
486
+ # Aggregate SubFrames for first values of the columns of
487
+ # `group_keys` and the aggregated results of key-function pairs.
488
+ # [Experimental] This API may be changed in the future.
489
+ #
490
+ # @param group_keys [Symbol, String, Array<Symbol, String>]
491
+ # group key name(s) to output values.
492
+ # @param aggregations [Hash<Array<Symbol, String> => Array<:Symbol>>]
493
+ # a Hash of variable (column) name and
494
+ # Vector aggregate function name to apply.
495
+ # @return [DataFrame]
496
+ # an aggregated DataFrame.
497
+ # @example Aggregate with a group key and key function pairs by a Hash.
498
+ # subframes.aggregate(:y, { x: :sum, z: :count })
499
+ #
500
+ # # =>
501
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003b24>
502
+ # y sum_x count_z
503
+ # <string> <uint8> <uint8>
504
+ # 0 A 3 2
505
+ # 1 B 12 2
506
+ # 2 C 6 1
507
+ #
508
+ # @overload aggregate(group_keys, aggregations)
509
+ #
510
+ # Aggregate SubFrames for first values of the columns of
511
+ # `group_keys` and the aggregated results of all combinations
512
+ # of supplied keys and functions.
513
+ # [Experimental] This API may be changed in the future.
514
+ #
515
+ # @param group_keys [Symbol, String, Array<Symbol, String>]
516
+ # group key name(s) to output values.
517
+ # @param aggregations [Array[Array<Symbol, String>, Array<:Symbol>]]
518
+ # an Array of Array of variable (column) names and
519
+ # Array of Vector aggregate function names to apply.
520
+ # @return [DataFrame]
521
+ # an aggregated DataFrame.
522
+ # @example Aggregate with group keys and keys and functions by an Array.
523
+ # sf.aggregate(:y, [[:x, :z], [:count, :sum]])
524
+ #
525
+ # # =>
526
+ # #<RedAmber::DataFrame : 3 x 5 Vectors, 0x000000000000fcbc>
527
+ # y count_x sum_x count_z sum_z
528
+ # <string> <uint8> <uint8> <uint8> <uint8>
529
+ # 0 A 2 3 2 1
530
+ # 1 B 3 12 2 1
531
+ # 2 C 1 6 1 0
532
+ #
533
+ # @since 0.4.0
534
+ #
535
+ def aggregate(*args, &block)
536
+ aggregator =
537
+ if block
538
+ if args.empty?
539
+ # aggregate { {key => value} or [[key, value], ...] }
540
+ each_with_object(Hash.new { |h, k| h[k] = [] }) do |df, hash| # every key gets its own fresh Array
541
+ df.instance_eval(&block).to_h.each do |k, v| # the block runs in the context of each sub DataFrame
542
+ hash[k] << v # collect one value per sub DataFrame under each key
543
+ end
544
+ end
545
+ else
546
+ # aggregate(keys) { values }
547
+ values = each.map { |df| Array(df.instance_eval(&block)) }.transpose # rows per frame -> columns per key
548
+ args.flatten.zip(values) # pair each key with its column of values
549
+ end
550
+ else
551
+ # These functions may be removed in the future.
552
+ case args
553
+ in [group_keys1, Hash => h]
554
+ # aggregate(group_keys, { key => func })
555
+ ary = Array(group_keys1).map { |key| [:first, key] } # group keys are reported via their first value
556
+ ary.concat(h.to_a.map { [_2, _1] }) # rubocop:disable Style/NumberedParametersLimit
557
+ in [group_keys2, [Array => keys, Array => funcs]]
558
+ # aggregate(group_keys, [keys, funcs])
559
+ ary = Array(group_keys2).map { |key| [:first, key] } # group keys are reported via their first value
560
+ ary.concat(funcs.product(keys)) # [func, key] pairs, function-major: all keys for the first func, then the next func
561
+ else
562
+ raise SubFramesArgumentError, "invalid argument: #{args}"
563
+ end
564
+ sf = self
565
+ ary.map do |func, key|
566
+ label = func == :first ? key : "#{func}_#{key}" # group keys keep their name; other columns are named func_key
567
+ [label, sf.each.map { |df| df[key].send(func) }] # apply the function to that column of every sub DataFrame
568
+ end
569
+ end
570
+ DataFrame.new(aggregator) # aggregator is a Hash or an Array of [label, values] pairs
571
+ end
572
+
573
+ # Returns a SubFrames containing DataFrames returned by the block.
574
+ #
575
+ # @example Map as it is.
576
+ # subframes.map { _1 }
577
+ #
578
+ # # This will create a new SubFrames and a new baseframe,
579
+ # # but each element DataFrame is re-used.
580
+ # # =>
581
+ # #<RedAmber::SubFrames : 0x000000000001e6cc>
582
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000001e6e0>
583
+ # 3 SubFrames: [2, 3, 1] in sizes.
584
+ # ---
585
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x00000000000135c4>
586
+ # x y z
587
+ # <uint8> <string> <boolean>
588
+ # 0 1 A false
589
+ # 1 2 A true
590
+ # ---
591
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000135d8>
592
+ # x y z
593
+ # <uint8> <string> <boolean>
594
+ # 0 3 B false
595
+ # 1 4 B (nil)
596
+ # 2 5 B true
597
+ # ---
598
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x00000000000135ec>
599
+ # x y z
600
+ # <uint8> <string> <boolean>
601
+ # 0 6 C false
602
+ #
603
+ # @example Assign a new column.
604
+ # subframes.map { |df| df.assign(x_plus1: df[:x] + 1) }
605
+ #
606
+ # # =>
607
+ # #<RedAmber::SubFrames : 0x0000000000040948>
608
+ # @baseframe=#<RedAmber::DataFrame : 6 x 4 Vectors, 0x000000000004095c>
609
+ # 3 SubFrames: [2, 3, 1] in sizes.
610
+ # ---
611
+ # #<RedAmber::DataFrame : 2 x 4 Vectors, 0x0000000000040970>
612
+ # x y z x_plus1
613
+ # <uint8> <string> <boolean> <uint8>
614
+ # 0 1 A false 2
615
+ # 1 2 A true 3
616
+ # ---
617
+ # #<RedAmber::DataFrame : 3 x 4 Vectors, 0x0000000000040984>
618
+ # x y z x_plus1
619
+ # <uint8> <string> <boolean> <uint8>
620
+ # 0 3 B false 4
621
+ # 1 4 B (nil) 5
622
+ # 2 5 B true 6
623
+ # ---
624
+ # #<RedAmber::DataFrame : 1 x 4 Vectors, 0x0000000000040998>
625
+ # x y z x_plus1
626
+ # <uint8> <string> <boolean> <uint8>
627
+ # 0 6 C false 7
628
+ #
629
+ # @since 0.4.0
630
+ #
631
+ define_subframable_method :map # generates #map, whose block form returns a SubFrames
632
+ alias_method :collect, :map # #collect is the same generated method
633
+
634
+ # Update existing column(s) or create new columns(s) for each DataFrames in self.
635
+ #
636
+ # Column values are updated by an overloaded common operation.
637
+ #
638
+ # @overload assign(key)
639
+ # Assign a column by argument and block.
640
+ #
641
+ # @param key [Symbol, String]
642
+ # a key of column to assign.
643
+ # @yieldparam dataframe [DataFrame]
644
+ # gives overloaded dataframe in self to the block.
645
+ # @yieldreturn [Vector, Array, Arrow::Array]
646
+ # an updated column value which are overloaded.
647
+ # @return [SubFrames]
648
+ # a new SubFrames object with updated DataFrames.
649
+ # @example
650
+ # subframes.assign(:x_plus1) { x + 1 }
651
+ #
652
+ # # =>
653
+ # #<RedAmber::SubFrames : 0x000000000000c3a0>
654
+ # @baseframe=#<RedAmber::DataFrame : 6 x 4 Vectors, 0x000000000000c3b4>
655
+ # 3 SubFrames: [2, 3, 1] in sizes.
656
+ # ---
657
+ # #<RedAmber::DataFrame : 2 x 4 Vectors, 0x000000000000c3c8>
658
+ # x y z x_plus1
659
+ # <uint8> <string> <boolean> <uint8>
660
+ # 0 1 A false 2
661
+ # 1 2 A true 3
662
+ # ---
663
+ # #<RedAmber::DataFrame : 3 x 4 Vectors, 0x000000000000c3dc>
664
+ # x y z x_plus1
665
+ # <uint8> <string> <boolean> <uint8>
666
+ # 0 3 B false 4
667
+ # 1 4 B (nil) 5
668
+ # 2 5 B true 6
669
+ # ---
670
+ # #<RedAmber::DataFrame : 1 x 4 Vectors, 0x000000000000c3f0>
671
+ # x y z x_plus1
672
+ # <uint8> <string> <boolean> <uint8>
673
+ # 0 6 C false 7
674
+ #
675
+ # @overload assign(keys)
676
+ # Assign columns by arguments and block.
677
+ #
678
+ # @param keys [Array<Symbol, String>]
679
+ # keys of columns to assign.
680
+ # @yieldparam dataframe [DataFrame]
681
+ # gives overloaded dataframes in self to the block.
682
+ # @yieldreturn [Array<Vector, Array, Arrow::Array>]
683
+ # an updated column values which are overloaded.
684
+ # @return [SubFrames]
685
+ # a new SubFrames object with updated DataFrames.
686
+ # @example
687
+ # subframes.assign(:sum_x, :frac_x) do
688
+ # group_sum = x.sum
689
+ # [[group_sum] * size, x / group_sum.to_f]
690
+ # end
691
+ #
692
+ # # =>
693
+ # #<RedAmber::SubFrames : 0x000000000000fce4>
694
+ # @baseframe=#<RedAmber::DataFrame : 6 x 5 Vectors, 0x000000000000fcf8>
695
+ # 3 SubFrames: [2, 3, 1] in sizes.
696
+ # ---
697
+ # #<RedAmber::DataFrame : 2 x 5 Vectors, 0x000000000000fd0c>
698
+ # x y z sum_x frac_x
699
+ # <uint8> <string> <boolean> <uint8> <double>
700
+ # 0 1 A false 3 0.33
701
+ # 1 2 A true 3 0.67
702
+ # ---
703
+ # #<RedAmber::DataFrame : 3 x 5 Vectors, 0x000000000000fd20>
704
+ # x y z sum_x frac_x
705
+ # <uint8> <string> <boolean> <uint8> <double>
706
+ # 0 3 B false 12 0.25
707
+ # 1 4 B (nil) 12 0.33
708
+ # 2 5 B true 12 0.42
709
+ # ---
710
+ # #<RedAmber::DataFrame : 1 x 5 Vectors, 0x000000000000fd34>
711
+ # x y z sum_x frac_x
712
+ # <uint8> <string> <boolean> <uint8> <double>
713
+ # 0 6 C false 6 1.0
714
+ #
715
+ # @overload assign
716
+ # Assign column(s) by block.
717
+ #
718
+ # @yieldparam dataframe [DataFrame]
719
+ # gives overloaded dataframes in self to the block.
720
+ # @yieldreturn [Hash, Array]
721
+ # pairs of keys and column values which are overloaded.
722
+ # @return [SubFrames]
723
+ # a new SubFrames object with updated DataFrames.
724
+ # @example Compute 'x * z' when (true, not_true) = (1, 0) in z
725
+ # subframes.assign do
726
+ # { 'x*z': x * z.if_else(1, 0) }
727
+ # end
728
+ #
729
+ # # =>
730
+ # #<RedAmber::SubFrames : 0x000000000000fd98>
731
+ # @baseframe=#<RedAmber::DataFrame : 6 x 4 Vectors, 0x000000000000fdac>
732
+ # 3 SubFrames: [2, 3, 1] in sizes.
733
+ # ---
734
+ # #<RedAmber::DataFrame : 2 x 4 Vectors, 0x000000000000fdc0>
735
+ # x y z x*z
736
+ # <uint8> <string> <boolean> <uint8>
737
+ # 0 1 A false 0
738
+ # 1 2 A true 2
739
+ # ---
740
+ # #<RedAmber::DataFrame : 3 x 4 Vectors, 0x000000000000fdd4>
741
+ # x y z x*z
742
+ # <uint8> <string> <boolean> <uint8>
743
+ # 0 3 B false 0
744
+ # 1 4 B (nil) (nil)
745
+ # 2 5 B true 5
746
+ # ---
747
+ # #<RedAmber::DataFrame : 1 x 4 Vectors, 0x000000000000fde8>
748
+ # x y z x*z
749
+ # <uint8> <string> <boolean> <uint8>
750
+ # 0 6 C false 0
751
+ #
752
+ # @since 0.4.0
753
+ #
754
+ def assign(...)
755
+ map { |df| df.assign(...) } # forward every argument and the block to each sub DataFrame's #assign
756
+ end
757
+
758
+ # Returns a SubFrames containing DataFrames selected by the block.
759
+ #
760
+ # With a block given, calls the block with successive DataFrames;
761
+ # returns a SubFrames of those DataFrames for
762
+ # which the block returns a truthy value.
763
+ #
764
+ # @example Select all.
765
+ # subframes.select { true }
766
+ #
767
+ # # =>
768
+ # #<RedAmber::SubFrames : 0x0000000000003a84>
769
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x0000000000003a98>
770
+ # 3 SubFrames: [2, 3, 1] in sizes.
771
+ # ---
772
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003a0c>
773
+ # x y z
774
+ # <uint8> <string> <boolean>
775
+ # 0 1 A false
776
+ # 1 2 A true
777
+ # ---
778
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003a20>
779
+ # x y z
780
+ # <uint8> <string> <boolean>
781
+ # 0 3 B false
782
+ # 1 4 B (nil)
783
+ # 2 5 B true
784
+ # ---
785
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x0000000000003a34>
786
+ # x y z
787
+ # <uint8> <string> <boolean>
788
+ # 0 6 C false
789
+ #
790
+ # @example Select nothing.
791
+ # subframes.select { false }
792
+ #
793
+ # # =>
794
+ # #<RedAmber::SubFrames : 0x00000000000238c0>
795
+ # @baseframe=#<RedAmber::DataFrame : (empty), 0x00000000000238d4>
796
+ # 0 SubFrame: [] in size.
797
+ # ---
798
+ #
799
+ # @example Select if Vector `:z` has any true.
800
+ # subframes.select { |df| df[:z].any? }
801
+ #
802
+ # # =>
803
+ # #<RedAmber::SubFrames : 0x000000000000fba4>
804
+ # @baseframe=#<RedAmber::DataFrame : 5 x 3 Vectors, 0x000000000000fbb8>
805
+ # 2 SubFrames: [2, 3] in sizes.
806
+ # ---
807
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003a0c>
808
+ # x y z
809
+ # <uint8> <string> <boolean>
810
+ # 0 1 A false
811
+ # 1 2 A true
812
+ # ---
813
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003a20>
814
+ # x y z
815
+ # <uint8> <string> <boolean>
816
+ # 0 3 B false
817
+ # 1 4 B (nil)
818
+ # 2 5 B true
819
+ #
820
+ # @since 0.4.0
821
+ #
822
+ define_subframable_method :select # generates #select, whose block form returns a SubFrames
823
+ alias_method :filter, :select # #filter and #find_all are the same generated method
824
+ alias_method :find_all, :select
825
+
826
+ # Returns a SubFrames containing DataFrames rejected by the block.
827
+ #
828
+ # With a block given, calls the block with successive DataFrames;
829
+ # returns a SubFrames of those DataFrames for
830
+ # which the block returns nil or false.
831
+ # @example Reject all.
832
+ # subframes.reject { true }
833
+ #
834
+ # # =>
835
+ # #<RedAmber::SubFrames : 0x00000000000238c0>
836
+ # @baseframe=#<RedAmber::DataFrame : (empty), 0x00000000000238d4>
837
+ # 0 SubFrame: [] in size.
838
+ # ---
839
+ #
840
+ # @example Reject nothing.
841
+ # subframes.reject { false }
842
+ #
843
+ # # =>
844
+ # #<RedAmber::SubFrames : 0x0000000000003a84>
845
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x0000000000003a98>
846
+ # 3 SubFrames: [2, 3, 1] in sizes.
847
+ # ---
848
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003a0c>
849
+ # x y z
850
+ # <uint8> <string> <boolean>
851
+ # 0 1 A false
852
+ # 1 2 A true
853
+ # ---
854
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003a20>
855
+ # x y z
856
+ # <uint8> <string> <boolean>
857
+ # 0 3 B false
858
+ # 1 4 B (nil)
859
+ # 2 5 B true
860
+ # ---
861
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x0000000000003a34>
862
+ # x y z
863
+ # <uint8> <string> <boolean>
864
+ # 0 6 C false
865
+ #
866
+ # @example Reject if Vector `:z` has any true.
867
+ # subframes.reject { |df| df[:z].any? }
868
+ #
869
+ # # =>
870
+ # #<RedAmber::SubFrames : 0x0000000000038d74>
871
+ # @baseframe=#<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000001ad10>
872
+ # 1 SubFrame: [1] in size.
873
+ # ---
874
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000001ad10>
875
+ # x y z
876
+ # <uint8> <string> <boolean>
877
+ # 0 6 C false
878
+ #
879
+ # @since 0.4.0
880
+ #
881
+ define_subframable_method :reject # generates #reject, whose block form returns a SubFrames
882
+
883
+ # Returns a SubFrames containing truthy DataFrames returned by the block.
884
+ #
885
+ # With a block given, calls the block with successive DataFrames;
886
+ # returns a SubFrames of the DataFrames returned by
887
+ # the block, dropping results that are nil or false.
888
+ # @example Filter for size is larger than 1 and append number to column 'y'.
889
+ # subframes.filter_map do |df|
890
+ # if df.size > 1
891
+ # df.assign(:y) do
892
+ # y.merge(indices('1'), sep: '')
893
+ # end
894
+ # end
895
+ # end
896
+ #
897
+ # # =>
898
+ # #<RedAmber::SubFrames : 0x000000000001da88>
899
+ # @baseframe=#<RedAmber::DataFrame : 5 x 3 Vectors, 0x000000000001da9c>
900
+ # 2 SubFrames: [2, 3] in sizes.
901
+ # ---
902
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000001dab0>
903
+ # x y z
904
+ # <uint8> <string> <boolean>
905
+ # 0 1 A1 false
906
+ # 1 2 A2 true
907
+ # ---
908
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000001dac4>
909
+ # x y z
910
+ # <uint8> <string> <boolean>
911
+ # 0 3 B1 false
912
+ # 1 4 B2 (nil)
913
+ # 2 5 B3 true
914
+ #
915
+ # @since 0.4.0
916
+ #
917
+ define_subframable_method :filter_map # generates #filter_map, whose block form returns a SubFrames
918
+
919
+ # Number of subsets.
920
+ #
921
+ # @return [Integer]
922
+ # number of subsets in self.
923
+ # @since 0.4.0
924
+ #
925
+ def size
926
+ @size ||= @enum.size # asked of the enumerator once, then cached in @size
927
+ end
928
+
929
+ # Size list of subsets.
930
+ #
931
+ # @return [Array<Integer>]
932
+ # sizes of sub DataFrames.
933
+ # @since 0.4.0
934
+ #
935
+ def sizes
936
+ @sizes ||= @enum.map(&:size) # iterates the enumerator once, then cached in @sizes
937
+ end
938
+
939
+ # Indices at the top of each sub DataFrames.
940
+ #
941
+ # @return [Array<Integer>]
942
+ # indices of offset of each sub DataFrames.
943
+ # @example When `sizes` is [2, 3, 1].
944
+ # subframes.offset_indices # => [0, 2, 5]
945
+ # @since 0.4.0
946
+ #
947
def offset_indices
  running_total = 0
  sizes.map do |count|
    # The offset of a subset is the total size of all subsets before it.
    offset = running_total
    running_total += count
    offset
  end
end
954
+
955
+ # Test if self has no subsets.
956
+ #
957
+ # @return [true, false]
958
+ # true if self is an empty subset.
959
+ # @since 0.4.0
960
+ #
961
def empty?
  # Empty means there are no subsets at all.
  subset_count = size
  subset_count.zero?
end
964
+
965
+ # Test if self has only one subset and it is comprehensive.
966
+ #
967
+ # @return [true, false]
968
+ # true if only member of self is equal to universal DataFrame.
969
+ # @since 0.4.0
970
+ #
971
def universal?
  # More or fewer than one subset can never be universal.
  return false unless size == 1

  # The only subset must compare equal to the base DataFrame.
  @enum.first == baseframe
end
974
+
975
+ # Return string representation of self.
976
+ #
977
+ # @param limit [Integer]
978
+ # maximum number of DataFrames to show.
979
+ # @return [String]
980
+ # return string representation of each sub DataFrame.
981
+ # @example
982
+ # df
983
+ #
984
+ # # =>
985
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000caa8>
986
+ # x y z
987
+ # <uint8> <string> <boolean>
988
+ # 0 1 A false
989
+ # 1 2 A true
990
+ # 2 3 B false
991
+ # 3 4 B (nil)
992
+ # 4 5 B true
993
+ # 5 6 C false
994
+ #
995
+ # puts SubFrames.new(df, [[0, 1], [2, 3, 4], [5]])
996
+ #
997
+ # # =>
998
+ # x y z
999
+ # <uint8> <string> <boolean>
1000
+ # 0 1 A false
1001
+ # 1 2 A true
1002
+ # ---
1003
+ # x y z
1004
+ # <uint8> <string> <boolean>
1005
+ # 0 3 B false
1006
+ # 1 4 B (nil)
1007
+ # 2 5 B true
1008
+ # ---
1009
+ # x y z
1010
+ # <uint8> <string> <boolean>
1011
+ # 0 6 C false
1012
+ #
1013
+ # @since 0.4.0
1014
+ #
1015
def to_s(limit: 16)
  # Plain rendering: hand off to the shared formatter, leaving its
  # with_id option at the default so no per-DataFrame header is added.
  max_frames = limit
  _to_s(limit: max_frames)
end
1018
+
1019
+ # Return summary information of self.
1020
+ #
1021
+ # @param limit [Integer]
1022
+ # maximum number of DataFrames to show.
1023
+ # @return [String]
1024
+ # return class name, object id, universal DataFrame,
1025
+ # size and subset sizes in a String.
1026
+ # @example
1027
+ # df
1028
+ #
1029
+ # # =>
1030
+ # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000caa8>
1031
+ # x y z
1032
+ # <uint8> <string> <boolean>
1033
+ # 0 1 A false
1034
+ # 1 2 A true
1035
+ # 2 3 B false
1036
+ # 3 4 B (nil)
1037
+ # 4 5 B true
1038
+ # 5 6 C false
1039
+ #
1040
+ # SubFrames.new(df, [[0, 1], [2, 3, 4], [5]])
1041
+ #
1042
+ # # =>
1043
+ # #<RedAmber::SubFrames : 0x000000000000c1fc>
1044
+ # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000c170>
1045
+ # 3 SubFrames: [2, 3, 1] in sizes.
1046
+ # ---
1047
+ # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000002a120>
1048
+ # x y z
1049
+ # <uint8> <string> <boolean>
1050
+ # 0 1 A false
1051
+ # 1 2 A true
1052
+ # ---
1053
+ # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000002a134>
1054
+ # x y z
1055
+ # <uint8> <string> <boolean>
1056
+ # 0 3 B false
1057
+ # 1 4 B (nil)
1058
+ # 2 5 B true
1059
+ # ---
1060
+ # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000002a148>
1061
+ # x y z
1062
+ # <uint8> <string> <boolean>
1063
+ # 0 6 C false
1064
+ #
1065
+ # @since 0.4.0
1066
+ #
1067
def inspect(limit: 16)
  # Describe the base: an Enumerator is summarized by its size,
  # anything else by its own shape string (with object id).
  base_desc =
    case @baseframe
    when Enumerator
      "Enumerator::Lazy:size=#{@baseframe.size}"
    else
      baseframe.shape_str(with_id: true)
    end

  # Show at most `limit` subset sizes; mark the cut with '...'.
  shown_sizes = size > limit ? [*sizes.take(limit), '...'] : sizes

  [
    "#<#{self.class} : #{format('0x%016x', object_id)}>",
    "@baseframe=#<#{base_desc}>",
    "#{size} SubFrame#{pl(size)}: [#{shown_sizes.join(', ')}] in size#{pl(size)}.",
    "---\n#{_to_s(limit: limit, with_id: true)}"
  ].join("\n")
end
1081
+
1082
+ private
1083
+
1084
def frames
  # Materialize the enumerator only once and keep the resulting Array.
  return @frames if @frames

  @frames = @enum.to_a
end
1087
+
1088
def _to_s(limit: 16, with_id: false)
  # Render at most `limit` DataFrames, each abbreviated to 2 head / 2 tail rows.
  sections = take(limit).map do |df|
    next df.to_s(head: 2, tail: 2) unless with_id

    # With ids requested, prefix each table with the DataFrame's shape line.
    header = "#<#{df.shape_str(with_id: with_id)}>\n"
    "#{header}#{df.to_s(head: 2, tail: 2)}"
  end

  # Report how many DataFrames were left out, if any.
  if size > limit
    hidden = size - limit
    sections.push("+ #{hidden} more DataFrame#{pl(hidden)}.\n")
  end

  sections.join("---\n")
end
1100
+ end
1101
+ end