red_amber 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +39 -20
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +113 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +25 -26
  8. data/benchmark/basic.yml +2 -2
  9. data/benchmark/combine.yml +2 -2
  10. data/benchmark/dataframe.yml +2 -2
  11. data/benchmark/group.yml +2 -2
  12. data/benchmark/reshape.yml +2 -2
  13. data/benchmark/vector.yml +3 -0
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +429 -75
  20. data/lib/red_amber/data_frame_combinable.rb +516 -66
  21. data/lib/red_amber/data_frame_displayable.rb +244 -14
  22. data/lib/red_amber/data_frame_indexable.rb +121 -18
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +622 -66
  26. data/lib/red_amber/data_frame_variable_operation.rb +446 -34
  27. data/lib/red_amber/group.rb +187 -22
  28. data/lib/red_amber/helper.rb +70 -10
  29. data/lib/red_amber/refinements.rb +12 -5
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +385 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +217 -12
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -5,23 +5,57 @@ module RedAmber
5
5
  # @data : holds Arrow::ChunkedArray
6
6
  class Vector
7
7
  # mix-in
8
- include VectorFunctions
8
+ include Enumerable
9
+ include Helper
10
+ include ArrowFunction
9
11
  include VectorUpdatable
10
12
  include VectorSelectable
11
- include Helper
12
13
 
13
14
  using RefineArrayLike
14
15
 
15
16
  # Quicker constructor of Vector.
16
17
  #
18
+ # @param arrow_array [Arrow::Array]
19
+ # Arrow::Array object to have in the Vector.
20
+ # @return [Vector]
21
+ # created Vector.
22
+ # @note This method doesn't check argument type.
23
+ #
17
24
  def self.create(arrow_array)
18
25
  instance = allocate
19
26
  instance.instance_variable_set(:@data, arrow_array)
20
27
  instance
21
28
  end
22
29
 
30
+ # Return true if it is an aggregation function.
31
+ #
32
+ # @param function [Symbol]
33
+ # function name to test.
34
+ # @return [true, false]
35
+ # true if function is an aggregation function, otherwise false.
36
+ #
37
+ # @example
38
+ # Vector.aggregate?(:mean) # => true
39
+ #
40
+ # Vector.aggregate?(:round) # => false
41
+ #
42
+ # @since 0.4.0
43
+ #
44
+ def self.aggregate?(function)
45
+ %i[
46
+ all all? any any? approximate_median count count_distinct count_uniq
47
+ max mean median min min_max product quantile sd std stddev sum
48
+ unbiased_variance var variance
49
+ ].include?(function.to_sym)
50
+ end
51
+
23
52
  # Create a Vector.
24
53
  #
54
+ # @param array [Array, Vector, Range, Arrow::Array, #to_arrow_array]
55
+ # array-like.
56
+ # @return [Vector]
57
+ # created Vector.
58
+ #
25
59
  # @note default is headless Vector and '@key == nil'
26
60
  def initialize(*array)
27
61
  @data =
@@ -39,15 +73,99 @@ module RedAmber
39
73
  end
40
74
  end
41
75
 
76
+ # Entity of Vector.
77
+ #
78
+ # @return [Arrow::Array]
79
+ #
42
80
  attr_reader :data
43
81
  alias_method :to_arrow_array, :data
44
82
 
83
+ # Associated key name when self is in a DataFrame.
84
+ #
85
+ # Default Vector is 'head-less' (key-less).
86
+ # @return [Symbol]
87
+ #
45
88
  attr_accessor :key
46
89
 
90
+ # Return other as a Vector which is same data type as self.
91
+ #
92
+ # @param other [Vector, Array, Arrow::Array, Arrow::ChunkedArray]
93
+ # a source array-like which will be converted.
94
+ # @return [Vector]
95
+ # resolved Vector.
96
+ # @example Integer to String
97
+ # Vector.new('A').resolve([1, 2])
98
+ #
99
+ # # =>
100
+ # #<RedAmber::Vector(:string, size=2):0x00000000000037b4>
101
+ # ["1", "2"]
102
+ #
103
+ # @example String to Integer
104
+ # Vector.new(1).resolve(['A'])
105
+ #
106
+ # # =>
107
+ # #<RedAmber::Vector(:uint8, size=1):0x00000000000037dc>
108
+ # [65]
109
+ #
110
+ # @example Upcast to uint16
111
+ # vector = Vector.new(256)
112
+ #
113
+ # # =>
114
+ # #<RedAmber::Vector(:uint16, size=1):0x000000000000c1fc>
115
+ # [256]
116
+ #
117
+ # vector.resolve([1, 2])
118
+ #
119
+ # # =>
120
+ # # Not a uint8 Vector
121
+ # #<RedAmber::Vector(:uint16, size=2):0x000000000000c328>
122
+ # [1, 2]
123
+ #
124
+ # @since 0.4.0
125
+ #
126
+ def resolve(other)
127
+ case other
128
+ when Vector
129
+ Vector.create(data.resolve(other.data))
130
+ when Array, Arrow::Array, Arrow::ChunkedArray
131
+ Vector.create(data.resolve(other))
132
+ else
133
+ raise VectorArgumentError, "invalid argument: #{other}"
134
+ end
135
+ end
136
+
137
+ # String representation of self like an Array.
138
+ #
139
+ # @return [String]
140
+ # returns the same String as Array#inspect on the values of self.
141
+ #
47
142
  def to_s
48
143
  @data.to_a.inspect
49
144
  end
50
145
 
146
+ # String representation of self.
147
+ #
148
+ # According to `ENV['RED_AMBER_OUTPUT_MODE'].upcase`,
149
+ # - If it is 'MINIMUM', returns class and size.
150
+ # - If it is otherwise, returns class, size and preview.
151
+ # Default value of the ENV is 'Table'.
152
+ # @param limit [Integer]
153
+ # max width of the result.
154
+ # @return [String]
155
+ # show information of self as a String.
156
+ # @example Default (ENV ['RED_AMBER_OUTPUT_MODE'] == 'Table')
157
+ # puts vector.inspect
158
+ #
159
+ # # =>
160
+ # #<RedAmber::Vector(:uint8, size=3):0x00000000000037f0>
161
+ # [1, 2, 3]
162
+ #
163
+ # @example In case of ENV ['RED_AMBER_OUTPUT_MODE'] == 'Minimum'
164
+ # puts vector.inspect
165
+ #
166
+ # # =>
167
+ # RedAmber::Vector(:uint8, size=3)
168
+ #
51
169
  def inspect(limit: 80)
52
170
  if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table').casecmp('MINIMUM').zero?
53
171
  # Better performance than `.upcase == 'MINIMUM'`
@@ -70,82 +188,187 @@ module RedAmber
70
188
  end
71
189
  end
72
190
 
191
+ # Convert to an Array.
192
+ #
193
+ # @return [Array]
194
+ # array representation.
195
+ #
73
196
  def to_ary
74
197
  @data.values
75
198
  end
76
-
77
199
  alias_method :to_a, :to_ary
78
200
  alias_method :values, :to_ary
79
201
  alias_method :entries, :to_ary
80
202
 
203
+ # Indices from 0 to size-1 as an Array.
204
+ #
205
+ # @return [Array]
206
+ # indices.
207
+ #
81
208
  def indices
82
209
  (0...size).to_a
83
210
  end
84
-
85
211
  alias_method :indexes, :indices
86
212
  alias_method :indeces, :indices
87
213
 
214
+ # Vector size.
215
+ #
216
+ # @return [Integer]
217
+ # size of self.
218
+ #
88
219
  def size
89
220
  # only defined :length in Arrow?
90
221
  @data.length
91
222
  end
92
-
93
223
  alias_method :length, :size
94
224
  alias_method :n_rows, :size
95
225
  alias_method :nrow, :size
96
226
 
227
+ # Test whether self is empty.
228
+ #
229
+ # @return [true, false]
230
+ # true if self is empty.
231
+ #
97
232
  def empty?
98
233
  size.zero?
99
234
  end
100
235
 
236
+ # Type nickname of self.
237
+ #
238
+ # @return [Symbol]
239
+ # type nickname of values.
240
+ #
101
241
  def type
102
242
  list? ? :list : @data.value_type.nick.to_sym
103
243
  end
104
244
 
245
+ # Type Class of self.
246
+ #
247
+ # @return [type_Class]
248
+ # type class.
249
+ #
250
+ def type_class
251
+ @data.type_class
252
+ end
253
+
254
+ # Test if self is a boolean Vector.
255
+ #
256
+ # @return [true, false]
257
+ # test result.
258
+ #
105
259
  def boolean?
106
260
  @data.boolean?
107
261
  end
108
262
 
263
+ # Test if self is a numeric Vector.
264
+ #
265
+ # @return [true, false]
266
+ # test result.
267
+ #
109
268
  def numeric?
110
269
  @data.numeric?
111
270
  end
112
271
 
272
+ # Test if self is a float Vector.
273
+ #
274
+ # @return [true, false]
275
+ # test result.
276
+ #
113
277
  def float?
114
278
  @data.float?
115
279
  end
116
280
 
281
+ # Test if self is an integer Vector.
282
+ #
283
+ # @return [true, false]
284
+ # test result.
285
+ #
117
286
  def integer?
118
287
  @data.integer?
119
288
  end
120
289
 
290
+ # Test if self is a string Vector.
291
+ #
292
+ # @return [true, false]
293
+ # test result.
294
+ #
121
295
  def string?
122
296
  @data.string?
123
297
  end
124
298
 
299
+ # Test if self is a dictionary Vector.
300
+ #
301
+ # @return [true, false]
302
+ # test result.
303
+ #
125
304
  def dictionary?
126
305
  @data.dictionary?
127
306
  end
128
307
 
308
+ # Test if self is a temporal Vector.
309
+ #
310
+ # @return [true, false]
311
+ # test result.
312
+ #
129
313
  def temporal?
130
314
  @data.temporal?
131
315
  end
132
316
 
317
+ # Test if self is a list Vector.
318
+ #
319
+ # @return [true, false]
320
+ # test result.
321
+ #
133
322
  def list?
134
323
  @data.list?
135
324
  end
136
325
 
137
- def type_class
138
- @data.type_class
139
- end
140
-
326
+ # Iterates over Vector elements or returns an Enumerator.
327
+ #
328
+ # @overload each
329
+ # Returns a new Enumerator if no block given.
330
+ #
331
+ # @return [Enumerator]
332
+ # Enumerator of each element.
333
+ #
334
+ # @overload each
335
+ # When a block given, passes each element in self to the block.
336
+ #
337
+ # @yieldparam element [Object]
338
+ # passes element by a block parameter.
339
+ # @yieldreturn [Object]
340
+ # evaluated result value from the block.
341
+ # @return [self]
342
+ # returns self.
343
+ #
141
344
  def each
142
345
  return enum_for(:each) unless block_given?
143
346
 
144
347
  size.times do |i|
145
348
  yield data[i]
146
349
  end
350
+ self
147
351
  end
148
352
 
353
+ # Returns a Vector from collected objects from the block.
354
+ #
355
+ # @overload map
356
+ # Returns a new Enumerator if no block given.
357
+ #
358
+ # @return [Enumerator]
359
+ # a new Enumerator.
360
+ #
361
+ # @overload map
362
+ # When a block given, calls the block with successive elements.
363
+ # Returns a Vector of the objects returned by the block.
364
+ #
365
+ # @yieldparam element [Object]
366
+ # passes element by a block parameter.
367
+ # @yieldreturn [Object]
368
+ # evaluated result value from the block.
369
+ # @return [Vector]
370
+ # returns the collected values from the block as a Vector.
371
+ #
149
372
  def map(&block)
150
373
  return enum_for(:map) unless block
151
374
 
@@ -153,18 +376,35 @@ module RedAmber
153
376
  end
154
377
  alias_method :collect, :map
155
378
 
156
- # undocumented
379
+ # Tests whether self is chunked or not.
380
+ #
381
+ # @api private
382
+ # @return [true, false]
383
+ # returns true if #data is chunked.
384
+ #
157
385
  def chunked?
158
386
  @data.is_a? Arrow::ChunkedArray
159
387
  end
160
388
 
161
- # undocumented
389
+ # Returns the number of chunks.
390
+ #
391
+ # @api private
392
+ # @return [Integer]
393
+ # the number of chunks. If self is not chunked, returns zero.
394
+ #
162
395
  def n_chunks
163
396
  chunked? ? @data.n_chunks : 0
164
397
  end
165
398
 
166
399
  # def each_chunk() end
167
400
 
401
+ # Returns a hash containing the counts of equal elements.
402
+ #
403
+ # - Each key is an element of self.
404
+ # - Each value is the number of elements equal to the key.
405
+ # @return [Hash]
406
+ # result in a Hash.
407
+ #
168
408
  def tally
169
409
  hash = values.tally
170
410
  if (type_class < Arrow::FloatingPointDataType) && is_nan.any
@@ -180,22 +420,156 @@ module RedAmber
180
420
  hash
181
421
  end
182
422
 
423
+ # @api private
424
+ # Arrow implementation of #tally
183
425
  def value_counts
184
426
  values, counts = Arrow::Function.find(:value_counts).execute([data]).value.fields
185
427
  values.zip(counts).to_h
186
428
  end
187
429
 
430
+ # Count nils in self.
431
+ #
432
+ # @return [Integer]
433
+ # the number of nils.
434
+ #
188
435
  def n_nulls
189
436
  @data.n_nulls
190
437
  end
191
438
  alias_method :n_nils, :n_nulls
192
439
 
440
+ # Count NaNs in self if self is a numeric Vector
441
+ #
442
+ # @return [Integer]
443
+ # the number of Float::NANs. If self is not a numeric Vector,
444
+ # returns 0.
445
+ #
193
446
  def n_nans
194
447
  numeric? ? is_nan.to_a.count(true) : 0
195
448
  end
196
449
 
450
+ # Return true if self has any nil.
451
+ #
452
+ # @return [true, false]
453
+ # true or false.
454
+ #
197
455
  def has_nil?
198
456
  is_nil.any
199
457
  end
458
+
459
+ # Enable to compute with coercion mechanism.
460
+ #
461
+ # @example
462
+ # vector = Vector.new(1,2,3)
463
+ #
464
+ # # =>
465
+ # #<RedAmber::Vector(:uint8, size=3):0x00000000000decc4>
466
+ # [1, 2, 3]
467
+ #
468
+ # # Vector's `#*` method
469
+ # vector * -1
470
+ #
471
+ # # =>
472
+ # #<RedAmber::Vector(:int16, size=3):0x00000000000e3698>
473
+ # [-1, -2, -3]
474
+ #
475
+ # # coerced calculation
476
+ # -1 * vector
477
+ #
478
+ # # =>
479
+ # #<RedAmber::Vector(:int16, size=3):0x00000000000ea4ac>
480
+ # [-1, -2, -3]
481
+ #
482
+ # # `@-` operator
483
+ # -vector
484
+ #
485
+ # # =>
486
+ # #<RedAmber::Vector(:uint8, size=3):0x00000000000ee7b4>
487
+ # [255, 254, 253]
488
+ #
489
+ def coerce(other)
490
+ [Vector.new(Array(other) * size), self]
491
+ end
492
+
493
+ # Spread the return value of an aggregate function as if
494
+ # it is a element-wise function.
495
+ #
496
+ # @overload propagate(function)
497
+ # Returns a Vector of same size as self spreading the value from function.
498
+ #
499
+ # @param function [Symbol] a name of aggregation function for self.
500
+ # Return value of the function must be a scalar.
501
+ # @return [Vector] Returns a Vector that is the same size as self
502
+ # and such that all elements are the same as the result of aggregation `function`.
503
+ # @example propagate by an aggregation function name
504
+ # vec = Vector.new(1, 2, 3, 4)
505
+ # vec.propagate(:mean)
506
+ # # =>
507
+ # #<RedAmber::Vector(:double, size=4):0x000000000001985c>
508
+ # [2.5, 2.5, 2.5, 2.5]
509
+ #
510
+ # @overload propagate
511
+ # Returns a Vector of same size as self spreading the value from block.
512
+ #
513
+ # @yieldparam self [Vector]
514
+ # gives self to the block.
515
+ # @yieldreturn [scalar]
516
+ # a scalar value.
517
+ # @return [Vector]
518
+ # returns a Vector that is the same size as self
519
+ # and such that all elements are the same as the yielded value from the block.
520
+ # @example propagate by a block
521
+ # vec.propagate { |v| v.mean.round }
522
+ # # =>
523
+ # #<RedAmber::Vector(:uint8, size=4):0x000000000000cb98>
524
+ # [3, 3, 3, 3]
525
+ #
526
+ # @since 0.4.0
527
+ #
528
+ def propagate(function = nil, &block)
529
+ value =
530
+ if block
531
+ raise VectorArgumentError, "can't specify both function and block" if function
532
+
533
+ yield self
534
+ else
535
+ function = function&.to_sym
536
+ unless function && respond_to?(function) && Vector.aggregate?(function)
537
+ raise VectorArgumentError, "illegal function: #{function.inspect}"
538
+ end
539
+
540
+ send(function)
541
+ end
542
+ Vector.new([value] * size)
543
+ end
544
+ alias_method :expand, :propagate
545
+
546
+ private # =======
547
+
548
+ def exec_func_unary(function, options)
549
+ options = nil if options.empty?
550
+ find(function).execute([data], options)
551
+ end
552
+
553
+ def exec_func_binary(function, other, options)
554
+ options = nil if options.empty?
555
+ case other
556
+ when Vector
557
+ find(function).execute([data, other.data], options)
558
+ when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar,
559
+ Array, Numeric, String, TrueClass, FalseClass
560
+ find(function).execute([data, other], options)
561
+ end
562
+ end
563
+
564
+ def get_scalar(datum)
565
+ output = datum.value
566
+ case output
567
+ when Arrow::StringScalar then output.to_s
568
+ when Arrow::StructScalar
569
+ output.value.map { |s| s.is_a?(Arrow::StringScalar) ? s.to_s : s.value }
570
+ else
571
+ output.value
572
+ end
573
+ end
200
574
  end
201
575
  end