red_amber 0.2.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -51
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +203 -1
  5. data/Gemfile +2 -1
  6. data/LICENSE +1 -1
  7. data/README.md +61 -45
  8. data/benchmark/basic.yml +11 -4
  9. data/benchmark/combine.yml +3 -4
  10. data/benchmark/dataframe.yml +62 -0
  11. data/benchmark/group.yml +7 -1
  12. data/benchmark/reshape.yml +6 -2
  13. data/benchmark/vector.yml +63 -0
  14. data/doc/DataFrame.md +35 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +295 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +537 -68
  20. data/lib/red_amber/data_frame_combinable.rb +776 -123
  21. data/lib/red_amber/data_frame_displayable.rb +248 -18
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +81 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +216 -21
  25. data/lib/red_amber/data_frame_selectable.rb +781 -120
  26. data/lib/red_amber/data_frame_variable_operation.rb +561 -85
  27. data/lib/red_amber/group.rb +195 -21
  28. data/lib/red_amber/helper.rb +114 -32
  29. data/lib/red_amber/refinements.rb +206 -0
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +435 -58
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +321 -69
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +397 -24
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +15 -1
  39. data/red_amber.gemspec +4 -3
  40. metadata +19 -11
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -294
@@ -5,88 +5,217 @@ module RedAmber
5
5
  # @data : holds Arrow::ChunkedArray
6
6
  class Vector
7
7
  # mix-in
8
- include VectorFunctions
8
+ include Enumerable
9
+ include Helper
10
+ include ArrowFunction
9
11
  include VectorUpdatable
10
12
  include VectorSelectable
11
- include Helper
12
13
 
13
- def initialize(*array)
14
- @key = nil # default is 'headless' Vector
15
- if array.empty? || array.first.nil?
16
- Vector.new([])
17
- else
18
- array.flatten!
19
- @data =
20
- case array
21
- in [Vector => v]
22
- v.data
23
- in [Arrow::Array => a]
24
- a
25
- in [Arrow::ChunkedArray => ca]
26
- ca
27
- in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
28
- arrow_array_like.to_arrow_array
29
- in [Range => r]
30
- Arrow::Array.new(Array(r))
31
- else
32
- begin
33
- Arrow::Array.new(Array(array))
34
- rescue Error
35
- raise VectorArgumentError, "Invalid argument: #{array}"
36
- end
37
- end
38
- end
14
+ using RefineArrayLike
15
+
16
+ # Quicker constructor of Vector.
17
+ #
18
+ # @param arrow_array [Arrow::Array]
19
+ # Arrow::Array object to have in the Vector.
20
+ # @return [Vector]
21
+ # created Vector.
22
+ # @note This method doesn't check argument type.
23
+ #
24
+ def self.create(arrow_array)
25
+ instance = allocate
26
+ instance.instance_variable_set(:@data, arrow_array)
27
+ instance
39
28
  end
40
29
 
41
- attr_reader :data
30
+ # Return true if it is an aggregation function.
31
+ #
32
+ # @param function [Symbol]
33
+ # function name to test.
34
+ # @return [Boolean]
35
+ # true if function is an aggregation function, otherwise false.
36
+ #
37
+ # @example
38
+ # Vector.aggregate?(:mean) # => true
39
+ #
40
+ # Vector.aggregate?(:round) # => false
41
+ #
42
+ # @since 0.4.0
43
+ #
44
+ def self.aggregate?(function)
45
+ %i[
46
+ all all? any any? approximate_median count count_distinct count_uniq
47
+ max mean median min min_max product quantile sd std stddev sum
48
+ unbiased_variance var variance
49
+ ].include?(function.to_sym)
50
+ end
42
51
 
43
- def to_arrow_array
44
- @data
52
+ # Create a Vector.
53
+ #
54
+ # @param array [Array, Vector, Range, Arrow::Array, #to_arrow_array]
55
+ # array-like.
56
+ # @return [Vector]
57
+ # created Vector.
58
+ #
59
+ # @note default is headless Vector and '@key == nil'
60
+ def initialize(*array)
61
+ @data =
62
+ case array
63
+ in [Vector => v]
64
+ v.data
65
+ in [Range => r]
66
+ Arrow::Array.new(Array(r))
67
+ in [Arrow::Array | Arrow::ChunkedArray]
68
+ array[0]
69
+ in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
70
+ arrow_array_like.to_arrow_array
71
+ else
72
+ Arrow::Array.new(array.flatten)
73
+ end
45
74
  end
46
75
 
76
+ # Entity of Vector.
77
+ #
78
+ # @return [Arrow::Array]
79
+ #
80
+ attr_reader :data
81
+ alias_method :to_arrow_array, :data
82
+
83
+ # Associated key name when self is in a DataFrame.
84
+ #
85
+ # Default Vector is 'head-less' (key-less).
86
+ # @return [Symbol]
87
+ #
47
88
  attr_accessor :key
48
89
 
90
+ # Return other as a Vector which is same data type as self.
91
+ #
92
+ # @param other [Vector, Array, Arrow::Array, Arrow::ChunkedArray]
93
+ # a source array-like which will be converted.
94
+ # @return [Vector]
95
+ # resolved Vector.
96
+ # @example Integer to String
97
+ # Vector.new('A').resolve([1, 2])
98
+ #
99
+ # # =>
100
+ # #<RedAmber::Vector(:string, size=2):0x00000000000037b4>
101
+ # ["1", "2"]
102
+ #
103
+ # @example String to Integer
104
+ # Vector.new(1).resolve(['A'])
105
+ #
106
+ # # =>
107
+ # #<RedAmber::Vector(:uint8, size=1):0x00000000000037dc>
108
+ # [65]
109
+ #
110
+ # @example Upcast to uint16
111
+ # vector = Vector.new(256)
112
+ #
113
+ # # =>
114
+ # #<RedAmber::Vector(:uint16, size=1):0x000000000000c1fc>
115
+ # [256]
116
+ #
117
+ # vector.resolve([1, 2])
118
+ #
119
+ # # =>
120
+ # # Not a uint8 Vector
121
+ # #<RedAmber::Vector(:uint16, size=2):0x000000000000c328>
122
+ # [1, 2]
123
+ #
124
+ # @since 0.4.0
125
+ #
126
+ def resolve(other)
127
+ case other
128
+ when Vector
129
+ Vector.create(data.resolve(other.data))
130
+ when Array, Arrow::Array, Arrow::ChunkedArray
131
+ Vector.create(data.resolve(other))
132
+ else
133
+ raise VectorArgumentError, "invalid argument: #{other}"
134
+ end
135
+ end
136
+
137
+ # String representation of self like an Array.
138
+ #
139
+ # @return [String]
140
+ # return self as same as Array's inspect.
141
+ #
49
142
  def to_s
50
143
  @data.to_a.inspect
51
144
  end
52
145
 
146
+ # String representation of self.
147
+ #
148
+ # According to `ENV['RED_AMBER_OUTPUT_MODE'].upcase`,
149
+ # - If it is 'MINIMUM', returns class and size.
150
+ # - If it is otherwise, returns class, size and preview.
151
+ # Default value of the ENV is 'Table'.
152
+ # @param limit [Integer]
153
+ # max width of the result.
154
+ # @return [String]
155
+ # show information of self as a String.
156
+ # @example Default (ENV ['RED_AMBER_OUTPUT_MODE'] == 'Table')
157
+ # puts vector.inspect
158
+ #
159
+ # # =>
160
+ # #<RedAmber::Vector(:uint8, size=3):0x00000000000037f0>
161
+ # [1, 2, 3]
162
+ #
163
+ # @example In case of ENV ['RED_AMBER_OUTPUT_MODE'] == 'Minimum'
164
+ # puts vector.inspect
165
+ #
166
+ # # =>
167
+ # RedAmber::Vector(:uint8, size=3)
168
+ #
53
169
  def inspect(limit: 80)
54
170
  if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table').casecmp('MINIMUM').zero?
55
- # Better performance than `.upcase == 'MINIMUM'``
171
+ # Better performance than `.upcase == 'MINIMUM'`
56
172
  "#{self.class}(:#{type}, size=#{size})"
57
173
  else
58
174
  sio = StringIO.new << '['
59
- to_a.each_with_object(sio).with_index do |(e, s), i|
60
- next_str = "#{s.size > 1 ? ', ' : ''}#{e.inspect}"
61
- if (s.size + next_str.size) < limit
62
- s << next_str
175
+ each.with_index do |e, i|
176
+ next_str = "#{sio.size > 1 ? ', ' : ''}#{e.inspect}"
177
+ if (sio.size + next_str.size) < limit
178
+ sio << next_str
63
179
  else
64
- s << ', ... ' if i < size
180
+ sio << ', ... ' if i < size
65
181
  break
66
182
  end
67
183
  end
68
184
  sio << ']'
69
185
 
70
- format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n", object_id, sio.string
186
+ format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n",
187
+ object_id, sio.string
71
188
  end
72
189
  end
73
190
 
74
- def values
191
+ # Convert to an Array.
192
+ #
193
+ # @return [Array]
194
+ # array representation.
195
+ #
196
+ def to_ary
75
197
  @data.values
76
198
  end
77
- alias_method :to_a, :values
78
- alias_method :entries, :values
79
-
199
+ alias_method :to_a, :to_ary
200
+ alias_method :values, :to_ary
201
+ alias_method :entries, :to_ary
202
+
203
+ # Indices from 0 to size-1 as an Array.
204
+ #
205
+ # @return [Array]
206
+ # indices.
207
+ #
80
208
  def indices
81
209
  (0...size).to_a
82
210
  end
83
211
  alias_method :indexes, :indices
84
212
  alias_method :indeces, :indices
85
213
 
86
- def to_ary
87
- values
88
- end
89
-
214
+ # Vector size.
215
+ #
216
+ # @return [Integer]
217
+ # size of self.
218
+ #
90
219
  def size
91
220
  # only defined :length in Arrow?
92
221
  @data.length
@@ -95,54 +224,151 @@ module RedAmber
95
224
  alias_method :n_rows, :size
96
225
  alias_method :nrow, :size
97
226
 
227
+ # Test whether self is empty.
228
+ #
229
+ # @return [true, false]
230
+ # true if self is empty.
231
+ #
98
232
  def empty?
99
233
  size.zero?
100
234
  end
101
235
 
236
+ # Type nickname of self.
237
+ #
238
+ # @return [Symbol]
239
+ # type nickname of values.
240
+ #
102
241
  def type
103
- @data.value_type.nick.to_sym
242
+ list? ? :list : @data.value_type.nick.to_sym
104
243
  end
105
244
 
245
+ # Type Class of self.
246
+ #
247
+ # @return [type_Class]
248
+ # type class.
249
+ #
250
+ def type_class
251
+ @data.type_class
252
+ end
253
+
254
+ # Test if self is a boolean Vector.
255
+ #
256
+ # @return [true, false]
257
+ # test result.
258
+ #
106
259
  def boolean?
107
- type_class == Arrow::BooleanDataType
260
+ @data.boolean?
108
261
  end
109
262
 
263
+ # Test if self is a numeric Vector.
264
+ #
265
+ # @return [true, false]
266
+ # test result.
267
+ #
110
268
  def numeric?
111
- type_class < Arrow::NumericDataType
269
+ @data.numeric?
112
270
  end
113
271
 
272
+ # Test if self is a float Vector.
273
+ #
274
+ # @return [true, false]
275
+ # test result.
276
+ #
114
277
  def float?
115
- type_class < Arrow::FloatingPointDataType
278
+ @data.float?
116
279
  end
117
280
 
281
+ # Test if self is an integer Vector.
282
+ #
283
+ # @return [true, false]
284
+ # test result.
285
+ #
118
286
  def integer?
119
- type_class < Arrow::IntegerDataType
287
+ @data.integer?
120
288
  end
121
289
 
290
+ # Test if self is a string Vector.
291
+ #
292
+ # @return [true, false]
293
+ # test result.
294
+ #
122
295
  def string?
123
- type_class == Arrow::StringDataType
296
+ @data.string?
124
297
  end
125
298
 
299
+ # Test if self is a dictionary Vector.
300
+ #
301
+ # @return [true, false]
302
+ # test result.
303
+ #
126
304
  def dictionary?
127
- type_class == Arrow::DictionaryDataType
305
+ @data.dictionary?
128
306
  end
129
307
 
308
+ # Test if self is a temporal Vector.
309
+ #
310
+ # @return [true, false]
311
+ # test result.
312
+ #
130
313
  def temporal?
131
- type_class < Arrow::TemporalDataType
314
+ @data.temporal?
132
315
  end
133
316
 
134
- def type_class
135
- @data.value_data_type.class
317
+ # Test if self is a list Vector.
318
+ #
319
+ # @return [true, false]
320
+ # test result.
321
+ #
322
+ def list?
323
+ @data.list?
136
324
  end
137
325
 
326
+ # Iterates over Vector elements or returns an Enumerator.
327
+ #
328
+ # @overload each
329
+ # Returns a new Enumerator if no block given.
330
+ #
331
+ # @return [Enumerator]
332
+ # Enumerator over each element.
333
+ #
334
+ # @overload each
335
+ # When a block given, passes each element in self to the block.
336
+ #
337
+ # @yieldparam element [Object]
338
+ # passes element by a block parameter.
339
+ # @yieldreturn [Object]
340
+ # evaluated result value from the block.
341
+ # @return [self]
342
+ # returns self.
343
+ #
138
344
  def each
139
345
  return enum_for(:each) unless block_given?
140
346
 
141
347
  size.times do |i|
142
348
  yield data[i]
143
349
  end
350
+ self
144
351
  end
145
352
 
353
+ # Returns a Vector from collected objects from the block.
354
+ #
355
+ # @overload map
356
+ # Returns a new Enumerator if no block given.
357
+ #
358
+ # @return [Enumerator]
359
+ # a new Enumerator.
360
+ #
361
+ # @overload map
362
+ # When a block given, calls the block with successive elements.
363
+ # Returns a Vector of the objects returned by the block.
364
+ #
365
+ # @yieldparam element [Object]
366
+ # passes element by a block parameter.
367
+ # @yieldreturn [Object]
368
+ # evaluated result value from the block.
369
+ # @return [Vector]
370
+ # returns the collected values from the block as a Vector.
371
+ #
146
372
  def map(&block)
147
373
  return enum_for(:map) unless block
148
374
 
@@ -150,18 +376,35 @@ module RedAmber
150
376
  end
151
377
  alias_method :collect, :map
152
378
 
153
- # undocumented
379
+ # Tests whether self is chunked or not.
380
+ #
381
+ # @api private
382
+ # @return [true, false]
383
+ # returns true if #data is chunked.
384
+ #
154
385
  def chunked?
155
386
  @data.is_a? Arrow::ChunkedArray
156
387
  end
157
388
 
158
- # undocumented
389
+ # Returns the number of chunks.
390
+ #
391
+ # @api private
392
+ # @return [Integer]
393
+ # the number of chunks. If self is not chunked, returns zero.
394
+ #
159
395
  def n_chunks
160
396
  chunked? ? @data.n_chunks : 0
161
397
  end
162
398
 
163
399
  # def each_chunk() end
164
400
 
401
+ # Returns a hash containing the counts of equal elements.
402
+ #
403
+ # - Each key is an element of self.
404
+ # - Each value is the number of elements equal to the key.
405
+ # @return [Hash]
406
+ # result in a Hash.
407
+ #
165
408
  def tally
166
409
  hash = values.tally
167
410
  if (type_class < Arrow::FloatingPointDataType) && is_nan.any
@@ -177,22 +420,156 @@ module RedAmber
177
420
  hash
178
421
  end
179
422
 
423
+ # @api private
424
+ # Arrow implementation of #tally
180
425
  def value_counts
181
426
  values, counts = Arrow::Function.find(:value_counts).execute([data]).value.fields
182
427
  values.zip(counts).to_h
183
428
  end
184
429
 
430
+ # Count nils in self.
431
+ #
432
+ # @return [Integer]
433
+ # the number of nils.
434
+ #
185
435
  def n_nulls
186
436
  @data.n_nulls
187
437
  end
188
438
  alias_method :n_nils, :n_nulls
189
439
 
440
+ # Count NaNs in self if self is a numeric Vector
441
+ #
442
+ # @return [Integer]
443
+ # the number of Float::NANs. If self is not a numeric Vector,
444
+ # returns 0.
445
+ #
190
446
  def n_nans
191
447
  numeric? ? is_nan.to_a.count(true) : 0
192
448
  end
193
449
 
450
+ # Return true if self has any nil.
451
+ #
452
+ # @return [true, false]
453
+ # true or false.
454
+ #
194
455
  def has_nil?
195
456
  is_nil.any
196
457
  end
458
+
459
+ # Enable to compute with coercion mechanism.
460
+ #
461
+ # @example
462
+ # vector = Vector.new(1,2,3)
463
+ #
464
+ # # =>
465
+ # #<RedAmber::Vector(:uint8, size=3):0x00000000000decc4>
466
+ # [1, 2, 3]
467
+ #
468
+ # # Vector's `#*` method
469
+ # vector * -1
470
+ #
471
+ # # =>
472
+ # #<RedAmber::Vector(:int16, size=3):0x00000000000e3698>
473
+ # [-1, -2, -3]
474
+ #
475
+ # # coerced calculation
476
+ # -1 * vector
477
+ #
478
+ # # =>
479
+ # #<RedAmber::Vector(:int16, size=3):0x00000000000ea4ac>
480
+ # [-1, -2, -3]
481
+ #
482
+ # # `-@` operator
483
+ # -vector
484
+ #
485
+ # # =>
486
+ # #<RedAmber::Vector(:uint8, size=3):0x00000000000ee7b4>
487
+ # [255, 254, 253]
488
+ #
489
+ def coerce(other)
490
+ [Vector.new(Array(other) * size), self]
491
+ end
492
+
493
+ # Spread the return value of an aggregate function as if
494
+ # it is an element-wise function.
495
+ #
496
+ # @overload propagate(function)
497
+ # Returns a Vector of same size as self spreading the value from function.
498
+ #
499
+ # @param function [Symbol] a name of aggregation function for self.
500
+ # Return value of the function must be a scalar.
501
+ # @return [Vector] Returns a Vector that is the same size as self
502
+ # and such that all elements are the same as the result of aggregation `function`.
503
+ # @example propagate by an aggregation function name
504
+ # vec = Vector.new(1, 2, 3, 4)
505
+ # vec.propagate(:mean)
506
+ # # =>
507
+ # #<RedAmber::Vector(:double, size=4):0x000000000001985c>
508
+ # [2.5, 2.5, 2.5, 2.5]
509
+ #
510
+ # @overload propagate
511
+ # Returns a Vector of same size as self spreading the value from block.
512
+ #
513
+ # @yieldparam self [Vector]
514
+ # gives self to the block.
515
+ # @yieldreturn [scalar]
516
+ # a scalar value.
517
+ # @return [Vector]
518
+ # returns a Vector that is the same size as self
519
+ # and such that all elements are the same as the yielded value from the block.
520
+ # @example propagate by a block
521
+ # vec.propagate { |v| v.mean.round }
522
+ # # =>
523
+ # #<RedAmber::Vector(:uint8, size=4):0x000000000000cb98>
524
+ # [3, 3, 3, 3]
525
+ #
526
+ # @since 0.4.0
527
+ #
528
+ def propagate(function = nil, &block)
529
+ value =
530
+ if block
531
+ raise VectorArgumentError, "can't specify both function and block" if function
532
+
533
+ yield self
534
+ else
535
+ function = function&.to_sym
536
+ unless function && respond_to?(function) && Vector.aggregate?(function)
537
+ raise VectorArgumentError, "illegal function: #{function.inspect}"
538
+ end
539
+
540
+ send(function)
541
+ end
542
+ Vector.new([value] * size)
543
+ end
544
+ alias_method :expand, :propagate
545
+
546
+ private # =======
547
+
548
+ def exec_func_unary(function, options)
549
+ options = nil if options.empty?
550
+ find(function).execute([data], options)
551
+ end
552
+
553
+ def exec_func_binary(function, other, options)
554
+ options = nil if options.empty?
555
+ case other
556
+ when Vector
557
+ find(function).execute([data, other.data], options)
558
+ when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar,
559
+ Array, Numeric, String, TrueClass, FalseClass
560
+ find(function).execute([data, other], options)
561
+ end
562
+ end
563
+
564
+ def get_scalar(datum)
565
+ output = datum.value
566
+ case output
567
+ when Arrow::StringScalar then output.to_s
568
+ when Arrow::StructScalar
569
+ output.value.map { |s| s.is_a?(Arrow::StringScalar) ? s.to_s : s.value }
570
+ else
571
+ output.value
572
+ end
573
+ end
197
574
  end
198
575
  end