red_amber 0.2.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -51
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +203 -1
  5. data/Gemfile +2 -1
  6. data/LICENSE +1 -1
  7. data/README.md +61 -45
  8. data/benchmark/basic.yml +11 -4
  9. data/benchmark/combine.yml +3 -4
  10. data/benchmark/dataframe.yml +62 -0
  11. data/benchmark/group.yml +7 -1
  12. data/benchmark/reshape.yml +6 -2
  13. data/benchmark/vector.yml +63 -0
  14. data/doc/DataFrame.md +35 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +295 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +537 -68
  20. data/lib/red_amber/data_frame_combinable.rb +776 -123
  21. data/lib/red_amber/data_frame_displayable.rb +248 -18
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +81 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +216 -21
  25. data/lib/red_amber/data_frame_selectable.rb +781 -120
  26. data/lib/red_amber/data_frame_variable_operation.rb +561 -85
  27. data/lib/red_amber/group.rb +195 -21
  28. data/lib/red_amber/helper.rb +114 -32
  29. data/lib/red_amber/refinements.rb +206 -0
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +435 -58
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +321 -69
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +397 -24
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +15 -1
  39. data/red_amber.gemspec +4 -3
  40. metadata +19 -11
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -294
@@ -5,88 +5,217 @@ module RedAmber
5
5
  # @data : holds Arrow::ChunkedArray
6
6
  class Vector
7
7
  # mix-in
8
- include VectorFunctions
8
+ include Enumerable
9
+ include Helper
10
+ include ArrowFunction
9
11
  include VectorUpdatable
10
12
  include VectorSelectable
11
- include Helper
12
13
 
13
- def initialize(*array)
14
- @key = nil # default is 'headless' Vector
15
- if array.empty? || array.first.nil?
16
- Vector.new([])
17
- else
18
- array.flatten!
19
- @data =
20
- case array
21
- in [Vector => v]
22
- v.data
23
- in [Arrow::Array => a]
24
- a
25
- in [Arrow::ChunkedArray => ca]
26
- ca
27
- in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
28
- arrow_array_like.to_arrow_array
29
- in [Range => r]
30
- Arrow::Array.new(Array(r))
31
- else
32
- begin
33
- Arrow::Array.new(Array(array))
34
- rescue Error
35
- raise VectorArgumentError, "Invalid argument: #{array}"
36
- end
37
- end
38
- end
14
+ using RefineArrayLike
15
+
16
+ # Quicker constructor of Vector.
17
+ #
18
+ # @param arrow_array [Arrow::Array]
19
+ # Arrow::Array object to have in the Vector.
20
+ # @return [Vector]
21
+ # created Vector.
22
+ # @note This method doesn't check argument type.
23
+ #
24
+ def self.create(arrow_array)
25
+ instance = allocate
26
+ instance.instance_variable_set(:@data, arrow_array)
27
+ instance
39
28
  end
40
29
 
41
- attr_reader :data
30
+ # Return true if it is an aggregation function.
31
+ #
32
+ # @param function [Symbol]
33
+ # function name to test.
34
+ # @return [Boolean]
35
+ # true if function is an aggregation function, otherwise false.
36
+ #
37
+ # @example
38
+ # Vector.aggregate?(:mean) # => true
39
+ #
40
+ # Vector.aggregate?(:round) # => false
41
+ #
42
+ # @since 0.4.0
43
+ #
44
+ def self.aggregate?(function)
45
+ %i[
46
+ all all? any any? approximate_median count count_distinct count_uniq
47
+ max mean median min min_max product quantile sd std stddev sum
48
+ unbiased_variance var variance
49
+ ].include?(function.to_sym)
50
+ end
42
51
 
43
- def to_arrow_array
44
- @data
52
+ # Create a Vector.
53
+ #
54
+ # @param array [Array, Vector, Range, Arrow::Array, #to_arrow_array]
55
+ # array-like.
56
+ # @return [Vector]
57
+ # created Vector.
58
+ #
59
+ # @note default is headless Vector and '@key == nil'
60
+ def initialize(*array)
61
+ @data =
62
+ case array
63
+ in [Vector => v]
64
+ v.data
65
+ in [Range => r]
66
+ Arrow::Array.new(Array(r))
67
+ in [Arrow::Array | Arrow::ChunkedArray]
68
+ array[0]
69
+ in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
70
+ arrow_array_like.to_arrow_array
71
+ else
72
+ Arrow::Array.new(array.flatten)
73
+ end
45
74
  end
46
75
 
76
+ # Entity of Vector.
77
+ #
78
+ # @return [Arrow::Array]
79
+ #
80
+ attr_reader :data
81
+ alias_method :to_arrow_array, :data
82
+
83
+ # Associated key name when self is in a DataFrame.
84
+ #
85
+ # Default Vector is 'head-less' (key-less).
86
+ # @return [Symbol]
87
+ #
47
88
  attr_accessor :key
48
89
 
90
+ # Return other as a Vector which is same data type as self.
91
+ #
92
+ # @param other [Vector, Array, Arrow::Array, Arrow::ChunkedArray]
93
+ # a source array-like which will be converted.
94
+ # @return [Vector]
95
+ # resolved Vector.
96
+ # @example Integer to String
97
+ # Vector.new('A').resolve([1, 2])
98
+ #
99
+ # # =>
100
+ # #<RedAmber::Vector(:string, size=2):0x00000000000037b4>
101
+ # ["1", "2"]
102
+ #
103
+ # @example String to Integer
104
+ # Vector.new(1).resolve(['A'])
105
+ #
106
+ # # =>
107
+ # #<RedAmber::Vector(:uint8, size=1):0x00000000000037dc>
108
+ # [65]
109
+ #
110
+ # @example Upcast to uint16
111
+ # vector = Vector.new(256)
112
+ #
113
+ # # =>
114
+ # #<RedAmber::Vector(:uint16, size=1):0x000000000000c1fc>
115
+ # [256]
116
+ #
117
+ # vector.resolve([1, 2])
118
+ #
119
+ # # =>
120
+ # # Not a uint8 Vector
121
+ # #<RedAmber::Vector(:uint16, size=2):0x000000000000c328>
122
+ # [1, 2]
123
+ #
124
+ # @since 0.4.0
125
+ #
126
+ def resolve(other)
127
+ case other
128
+ when Vector
129
+ Vector.create(data.resolve(other.data))
130
+ when Array, Arrow::Array, Arrow::ChunkedArray
131
+ Vector.create(data.resolve(other))
132
+ else
133
+ raise VectorArgumentError, "invalid argument: #{other}"
134
+ end
135
+ end
136
+
137
+ # String representation of self like an Array.
138
+ #
139
+ # @return [String]
140
+ # return self as same as Array's inspect.
141
+ #
49
142
  def to_s
50
143
  @data.to_a.inspect
51
144
  end
52
145
 
146
+ # String representation of self.
147
+ #
148
+ # According to `ENV['RED_AMBER_OUTPUT_MODE'].upcase`,
149
+ # - If it is 'MINIMUM', returns class and size.
150
+ # - If it is otherwise, returns class, size and preview.
151
+ # Default value of the ENV is 'Table'.
152
+ # @param limit [Integer]
153
+ # max width of the result.
154
+ # @return [String]
155
+ # show information of self as a String.
156
+ # @example Default (ENV ['RED_AMBER_OUTPUT_MODE'] == 'Table')
157
+ # puts vector.inspect
158
+ #
159
+ # # =>
160
+ # #<RedAmber::Vector(:uint8, size=3):0x00000000000037f0>
161
+ # [1, 2, 3]
162
+ #
163
+ # @example In case of ENV ['RED_AMBER_OUTPUT_MODE'] == 'Minimum'
164
+ # puts vector.inspect
165
+ #
166
+ # # =>
167
+ # RedAmber::Vector(:uint8, size=3)
168
+ #
53
169
  def inspect(limit: 80)
54
170
  if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table').casecmp('MINIMUM').zero?
55
- # Better performance than `.upcase == 'MINIMUM'``
171
+ # Better performance than `.upcase == 'MINIMUM'`
56
172
  "#{self.class}(:#{type}, size=#{size})"
57
173
  else
58
174
  sio = StringIO.new << '['
59
- to_a.each_with_object(sio).with_index do |(e, s), i|
60
- next_str = "#{s.size > 1 ? ', ' : ''}#{e.inspect}"
61
- if (s.size + next_str.size) < limit
62
- s << next_str
175
+ each.with_index do |e, i|
176
+ next_str = "#{sio.size > 1 ? ', ' : ''}#{e.inspect}"
177
+ if (sio.size + next_str.size) < limit
178
+ sio << next_str
63
179
  else
64
- s << ', ... ' if i < size
180
+ sio << ', ... ' if i < size
65
181
  break
66
182
  end
67
183
  end
68
184
  sio << ']'
69
185
 
70
- format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n", object_id, sio.string
186
+ format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n",
187
+ object_id, sio.string
71
188
  end
72
189
  end
73
190
 
74
- def values
191
+ # Convert to an Array.
192
+ #
193
+ # @return [Array]
194
+ # array representation.
195
+ #
196
+ def to_ary
75
197
  @data.values
76
198
  end
77
- alias_method :to_a, :values
78
- alias_method :entries, :values
79
-
199
+ alias_method :to_a, :to_ary
200
+ alias_method :values, :to_ary
201
+ alias_method :entries, :to_ary
202
+
203
+ # Indices from 0 to size-1 as an Array.
204
+ #
205
+ # @return [Array]
206
+ # indices.
207
+ #
80
208
  def indices
81
209
  (0...size).to_a
82
210
  end
83
211
  alias_method :indexes, :indices
84
212
  alias_method :indeces, :indices
85
213
 
86
- def to_ary
87
- values
88
- end
89
-
214
+ # Vector size.
215
+ #
216
+ # @return [Integer]
217
+ # size of self.
218
+ #
90
219
  def size
91
220
  # only defined :length in Arrow?
92
221
  @data.length
@@ -95,54 +224,151 @@ module RedAmber
95
224
  alias_method :n_rows, :size
96
225
  alias_method :nrow, :size
97
226
 
227
+ # Test whether self is empty.
228
+ #
229
+ # @return [true, false]
230
+ # true if self is empty.
231
+ #
98
232
  def empty?
99
233
  size.zero?
100
234
  end
101
235
 
236
+ # Type nickname of self.
237
+ #
238
+ # @return [Symbol]
239
+ # type nickname of values.
240
+ #
102
241
  def type
103
- @data.value_type.nick.to_sym
242
+ list? ? :list : @data.value_type.nick.to_sym
104
243
  end
105
244
 
245
+ # Type Class of self.
246
+ #
247
+ # @return [type_Class]
248
+ # type class.
249
+ #
250
+ def type_class
251
+ @data.type_class
252
+ end
253
+
254
+ # Test if self is a boolean Vector.
255
+ #
256
+ # @return [true, false]
257
+ # test result.
258
+ #
106
259
  def boolean?
107
- type_class == Arrow::BooleanDataType
260
+ @data.boolean?
108
261
  end
109
262
 
263
+ # Test if self is a numeric Vector.
264
+ #
265
+ # @return [true, false]
266
+ # test result.
267
+ #
110
268
  def numeric?
111
- type_class < Arrow::NumericDataType
269
+ @data.numeric?
112
270
  end
113
271
 
272
+ # Test if self is a float Vector.
273
+ #
274
+ # @return [true, false]
275
+ # test result.
276
+ #
114
277
  def float?
115
- type_class < Arrow::FloatingPointDataType
278
+ @data.float?
116
279
  end
117
280
 
281
+ # Test if self is an integer Vector.
282
+ #
283
+ # @return [true, false]
284
+ # test result.
285
+ #
118
286
  def integer?
119
- type_class < Arrow::IntegerDataType
287
+ @data.integer?
120
288
  end
121
289
 
290
+ # Test if self is a string Vector.
291
+ #
292
+ # @return [true, false]
293
+ # test result.
294
+ #
122
295
  def string?
123
- type_class == Arrow::StringDataType
296
+ @data.string?
124
297
  end
125
298
 
299
+ # Test if self is a dictionary Vector.
300
+ #
301
+ # @return [true, false]
302
+ # test result.
303
+ #
126
304
  def dictionary?
127
- type_class == Arrow::DictionaryDataType
305
+ @data.dictionary?
128
306
  end
129
307
 
308
+ # Test if self is a temporal Vector.
309
+ #
310
+ # @return [true, false]
311
+ # test result.
312
+ #
130
313
  def temporal?
131
- type_class < Arrow::TemporalDataType
314
+ @data.temporal?
132
315
  end
133
316
 
134
- def type_class
135
- @data.value_data_type.class
317
+ # Test if self is a list Vector.
318
+ #
319
+ # @return [true, false]
320
+ # test result.
321
+ #
322
+ def list?
323
+ @data.list?
136
324
  end
137
325
 
326
+ # Iterates over Vector elements or returns an Enumerator.
327
+ #
328
+ # @overload each
329
+ # Returns a new Enumerator if no block given.
330
+ #
331
+ # @return [Enumerator]
332
+ # Enumerator over each element.
333
+ #
334
+ # @overload each
335
+ # When a block given, passes each element in self to the block.
336
+ #
337
+ # @yieldparam element [Object]
338
+ # passes element by a block parameter.
339
+ # @yieldreturn [Object]
340
+ # evaluated result value from the block.
341
+ # @return [self]
342
+ # returns self.
343
+ #
138
344
  def each
139
345
  return enum_for(:each) unless block_given?
140
346
 
141
347
  size.times do |i|
142
348
  yield data[i]
143
349
  end
350
+ self
144
351
  end
145
352
 
353
+ # Returns a Vector from collected objects from the block.
354
+ #
355
+ # @overload map
356
+ # Returns a new Enumerator if no block given.
357
+ #
358
+ # @return [Enumerator]
359
+ # a new Enumerator.
360
+ #
361
+ # @overload map
362
+ # When a block given, calls the block with successive elements.
363
+ # Returns a Vector of the objects returned by the block.
364
+ #
365
+ # @yieldparam element [Object]
366
+ # passes element by a block parameter.
367
+ # @yieldreturn [Object]
368
+ # evaluated result value from the block.
369
+ # @return [Vector]
370
+ # returns the collected values from the block as a Vector.
371
+ #
146
372
  def map(&block)
147
373
  return enum_for(:map) unless block
148
374
 
@@ -150,18 +376,35 @@ module RedAmber
150
376
  end
151
377
  alias_method :collect, :map
152
378
 
153
- # undocumented
379
+ # Tests whether self is chunked or not.
380
+ #
381
+ # @api private
382
+ # @return [true, false]
383
+ # returns true if #data is chunked.
384
+ #
154
385
  def chunked?
155
386
  @data.is_a? Arrow::ChunkedArray
156
387
  end
157
388
 
158
- # undocumented
389
+ # Returns the number of chunks.
390
+ #
391
+ # @api private
392
+ # @return [Integer]
393
+ # the number of chunks. If self is not chunked, returns zero.
394
+ #
159
395
  def n_chunks
160
396
  chunked? ? @data.n_chunks : 0
161
397
  end
162
398
 
163
399
  # def each_chunk() end
164
400
 
401
+ # Returns a hash containing the counts of equal elements.
402
+ #
403
+ # - Each key is an element of self.
404
+ # - Each value is the number of elements equal to the key.
405
+ # @return [Hash]
406
+ # result in a Hash.
407
+ #
165
408
  def tally
166
409
  hash = values.tally
167
410
  if (type_class < Arrow::FloatingPointDataType) && is_nan.any
@@ -177,22 +420,156 @@ module RedAmber
177
420
  hash
178
421
  end
179
422
 
423
+ # @api private
424
+ # Arrow implementation of #tally
180
425
  def value_counts
181
426
  values, counts = Arrow::Function.find(:value_counts).execute([data]).value.fields
182
427
  values.zip(counts).to_h
183
428
  end
184
429
 
430
+ # Count nils in self.
431
+ #
432
+ # @return [Integer]
433
+ # the number of nils.
434
+ #
185
435
  def n_nulls
186
436
  @data.n_nulls
187
437
  end
188
438
  alias_method :n_nils, :n_nulls
189
439
 
440
+ # Count NaNs in self if self is a numeric Vector
441
+ #
442
+ # @return [Integer]
443
+ # the number of Float::NANs. If self is not a numeric Vector,
444
+ # returns 0.
445
+ #
190
446
  def n_nans
191
447
  numeric? ? is_nan.to_a.count(true) : 0
192
448
  end
193
449
 
450
+ # Return true if self has any nil.
451
+ #
452
+ # @return [true, false]
453
+ # true or false.
454
+ #
194
455
  def has_nil?
195
456
  is_nil.any
196
457
  end
458
+
459
+ # Enable to compute with coercion mechanism.
460
+ #
461
+ # @example
462
+ # vector = Vector.new(1,2,3)
463
+ #
464
+ # # =>
465
+ # #<RedAmber::Vector(:uint8, size=3):0x00000000000decc4>
466
+ # [1, 2, 3]
467
+ #
468
+ # # Vector's `#*` method
469
+ # vector * -1
470
+ #
471
+ # # =>
472
+ # #<RedAmber::Vector(:int16, size=3):0x00000000000e3698>
473
+ # [-1, -2, -3]
474
+ #
475
+ # # coerced calculation
476
+ # -1 * vector
477
+ #
478
+ # # =>
479
+ # #<RedAmber::Vector(:int16, size=3):0x00000000000ea4ac>
480
+ # [-1, -2, -3]
481
+ #
482
+ # # `@-` operator
483
+ # -vector
484
+ #
485
+ # # =>
486
+ # #<RedAmber::Vector(:uint8, size=3):0x00000000000ee7b4>
487
+ # [255, 254, 253]
488
+ #
489
+ def coerce(other)
490
+ [Vector.new(Array(other) * size), self]
491
+ end
492
+
493
+ # Spread the return value of an aggregate function as if
494
+ # it is a element-wise function.
495
+ #
496
+ # @overload propagate(function)
497
+ # Returns a Vector of same size as self spreading the value from function.
498
+ #
499
+ # @param function [Symbol] a name of aggregation function for self.
500
+ # Return value of the function must be a scalar.
501
+ # @return [Vector] Returns a Vector that is the same size as self
502
+ # and such that all elements are the same as the result of aggregation `function`.
503
+ # @example propagate by an aggregation function name
504
+ # vec = Vector.new(1, 2, 3, 4)
505
+ # vec.propagate(:mean)
506
+ # # =>
507
+ # #<RedAmber::Vector(:double, size=4):0x000000000001985c>
508
+ # [2.5, 2.5, 2.5, 2.5]
509
+ #
510
+ # @overload propagate
511
+ # Returns a Vector of same size as self spreading the value from block.
512
+ #
513
+ # @yieldparam self [Vector]
514
+ # gives self to the block.
515
+ # @yieldreturn [scalar]
516
+ # a scalar value.
517
+ # @return [Vector]
518
+ # returns a Vector that is the same size as self
519
+ # and such that all elements are the same as the yielded value from the block.
520
+ # @example propagate by a block
521
+ # vec.propagate { |v| v.mean.round }
522
+ # # =>
523
+ # #<RedAmber::Vector(:uint8, size=4):0x000000000000cb98>
524
+ # [3, 3, 3, 3]
525
+ #
526
+ # @since 0.4.0
527
+ #
528
+ def propagate(function = nil, &block)
529
+ value =
530
+ if block
531
+ raise VectorArgumentError, "can't specify both function and block" if function
532
+
533
+ yield self
534
+ else
535
+ function = function&.to_sym
536
+ unless function && respond_to?(function) && Vector.aggregate?(function)
537
+ raise VectorArgumentError, "illegal function: #{function.inspect}"
538
+ end
539
+
540
+ send(function)
541
+ end
542
+ Vector.new([value] * size)
543
+ end
544
+ alias_method :expand, :propagate
545
+
546
+ private # =======
547
+
548
+ def exec_func_unary(function, options)
549
+ options = nil if options.empty?
550
+ find(function).execute([data], options)
551
+ end
552
+
553
+ def exec_func_binary(function, other, options)
554
+ options = nil if options.empty?
555
+ case other
556
+ when Vector
557
+ find(function).execute([data, other.data], options)
558
+ when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar,
559
+ Array, Numeric, String, TrueClass, FalseClass
560
+ find(function).execute([data, other], options)
561
+ end
562
+ end
563
+
564
+ def get_scalar(datum)
565
+ output = datum.value
566
+ case output
567
+ when Arrow::StringScalar then output.to_s
568
+ when Arrow::StructScalar
569
+ output.value.map { |s| s.is_a?(Arrow::StringScalar) ? s.to_s : s.value }
570
+ else
571
+ output.value
572
+ end
573
+ end
197
574
  end
198
575
  end