red_amber 0.2.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -51
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +203 -1
  5. data/Gemfile +2 -1
  6. data/LICENSE +1 -1
  7. data/README.md +61 -45
  8. data/benchmark/basic.yml +11 -4
  9. data/benchmark/combine.yml +3 -4
  10. data/benchmark/dataframe.yml +62 -0
  11. data/benchmark/group.yml +7 -1
  12. data/benchmark/reshape.yml +6 -2
  13. data/benchmark/vector.yml +63 -0
  14. data/doc/DataFrame.md +35 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +295 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +537 -68
  20. data/lib/red_amber/data_frame_combinable.rb +776 -123
  21. data/lib/red_amber/data_frame_displayable.rb +248 -18
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +81 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +216 -21
  25. data/lib/red_amber/data_frame_selectable.rb +781 -120
  26. data/lib/red_amber/data_frame_variable_operation.rb +561 -85
  27. data/lib/red_amber/group.rb +195 -21
  28. data/lib/red_amber/helper.rb +114 -32
  29. data/lib/red_amber/refinements.rb +206 -0
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +435 -58
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +321 -69
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +397 -24
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +15 -1
  39. data/red_amber.gemspec +4 -3
  40. metadata +19 -11
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -294
@@ -4,91 +4,131 @@
4
4
  # reference: https://arrow.apache.org/docs/cpp/compute.html
5
5
 
6
6
  module RedAmber
7
- # mix-ins for class Vector
8
- # Functions to select some data.
7
+ # Mix-in for class Vector
8
+ # Functions to select some data.
9
9
  module VectorSelectable
10
- def drop_nil
11
- datum = find(:drop_null).execute([data])
12
- Vector.new(datum.value)
13
- end
10
+ using RefineArray
11
+ using RefineArrayLike
12
+
13
+ # Select elements in the self by indices.
14
+ #
15
+ # @param indices [Array<Numeric>, Vector]
16
+ # an array-like of indices.
17
+ # @yieldreturn [Array<Numeric>, Vector]
18
+ # an array-like of indices from the block.
19
+ # @return [Vector]
20
+ # vector by selected elements.
21
+ #
22
+ # TODO: support for the option `boundscheck: true`
23
+ def take(*indices, &block)
24
+ if block
25
+ unless indices.empty?
26
+ raise VectorArgumentError, 'Must not specify both arguments and block.'
27
+ end
14
28
 
15
- # vector calculation version of selection by indices
16
- # TODO: support for option {boundscheck: true}
17
- def take(*indices)
18
- indices.flatten!
19
- return Vector.new([]) if indices.empty?
29
+ indices = [yield]
30
+ end
31
+
32
+ vector =
33
+ case indices
34
+ in [Vector => v] if v.numeric?
35
+ return Vector.create(take_by_vector(v))
36
+ in []
37
+ return Vector.new
38
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
39
+ Vector.create(aa)
40
+ else
41
+ Vector.new(indices.flatten)
42
+ end
20
43
 
21
- indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
22
- indices = Vector.new(indices) unless indices.is_a?(Vector)
44
+ unless vector.numeric?
45
+ raise VectorArgumentError, "argument must be a integers: #{indices}"
46
+ end
23
47
 
24
- take_by_vector(indices) # returns sub Vector
48
+ Vector.create(take_by_vector(vector))
25
49
  end
26
50
 
27
- # TODO: support for option {null_selection_behavior: :drop}
51
+ # Select elements in the self by booleans.
52
+ #
53
+ # @param booleans [Array<true, false, nil>, Vector]
54
+ # an array-like of booleans.
55
+ # @yieldreturn [Array<true, false, nil>, Vector]
56
+ # an array-like of booleans from the block.
57
+ # @return [Vector]
58
+ # vector by selected elements.
59
+ #
60
+ # TODO: support for the option `null_selection_behavior: :drop`
28
61
  def filter(*booleans, &block)
29
62
  if block
30
- raise VectorArgumentError, 'Must not specify both arguments and block.' unless booleans.empty?
63
+ unless booleans.empty?
64
+ raise VectorArgumentError, 'Must not specify both arguments and block.'
65
+ end
31
66
 
32
67
  booleans = [yield]
33
68
  end
34
69
 
35
- booleans.flatten!
36
- return Vector.new([]) if booleans.empty?
70
+ case booleans
71
+ in [Vector => v]
72
+ raise VectorTypeError, 'Argument is not a boolean.' unless v.boolean?
37
73
 
38
- b = booleans[0]
39
- boolean_array =
40
- case b
41
- when Vector
42
- raise VectorTypeError, 'Argument is not a boolean.' unless b.boolean?
43
-
44
- b.data
45
- when Arrow::BooleanArray
46
- b
74
+ Vector.create(filter_by_array(v.data))
75
+ in [Arrow::BooleanArray => ba]
76
+ Vector.create(filter_by_array(ba))
77
+ in []
78
+ Vector.new
79
+ else
80
+ booleans.flatten!
81
+ a = Arrow::Array.new(booleans)
82
+ if a.boolean?
83
+ Vector.create(filter_by_array(a))
84
+ elsif booleans.compact.empty? # [nil, nil] becomes string array
85
+ Vector.new
47
86
  else
48
- raise VectorTypeError, 'Argument is not a boolean.' unless booleans?(booleans)
49
-
50
- Arrow::BooleanArray.new(booleans)
87
+ raise VectorTypeError, "Argument is not a boolean: #{booleans}"
51
88
  end
52
-
53
- filter_by_array(boolean_array) # returns sub Vector
89
+ end
54
90
  end
55
91
  alias_method :select, :filter
56
92
  alias_method :find_all, :filter
57
93
 
58
- # @param indices
59
- # @param booleans
94
+ # Select elements in the self by indices or booleans.
95
+ #
96
+ # @param args [Array<Numeric, true, false, nil>, Vector]
97
+ # specifier. Indices or booleans.
98
+ # @yieldparam [Array<Numeric, true, false, nil>, Vector]
99
+ # specifier. Indices or booleans.
100
+ # @return [scalar, Array]
101
+ # returns scalar or array.
102
+ #
60
103
  def [](*args)
61
- args.flatten!
62
- return Vector.new([]) if args.empty?
63
-
64
- arg = args[0]
65
- case arg
66
- when Vector
67
- return take_by_vector(arg) if arg.numeric?
68
- return filter_by_array(arg.data) if arg.boolean?
69
-
70
- raise VectorTypeError, "Argument must be numeric or boolean: #{arg}"
71
- when Arrow::BooleanArray
72
- return filter_by_array(arg)
73
- when Arrow::Array
74
- array = arg
75
- when Range
76
- array = normalize_element(arg)
77
- else
78
- unless arg.is_a?(Numeric) || booleans?([arg])
79
- raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
104
+ array =
105
+ case args
106
+ in [Vector => v]
107
+ return scalar_or_array(take_by_vector(v)) if v.numeric?
108
+ return scalar_or_array(filter_by_array(v.data)) if v.boolean?
109
+
110
+ raise VectorTypeError, "Argument must be numeric or boolean: #{args}"
111
+ in [Arrow::BooleanArray => ba]
112
+ return scalar_or_array(filter_by_array(ba))
113
+ in []
114
+ return nil
115
+ in [Arrow::Array => arrow_array]
116
+ arrow_array
117
+ in [Range => r]
118
+ Arrow::Array.new(parse_range(r, size))
119
+ else
120
+ Arrow::Array.new(args.flatten)
80
121
  end
81
- end
82
- array ||= Arrow::Array.new(args)
83
- return filter_by_array(array) if array.is_a?(Arrow::BooleanArray)
122
+
123
+ return scalar_or_array(filter_by_array(array)) if array.boolean?
84
124
 
85
125
  vector = Vector.new(array)
86
- return take_by_vector(vector) if vector.numeric?
126
+ return scalar_or_array(take_by_vector(vector)) if vector.numeric?
87
127
 
88
128
  raise VectorArgumentError, "Invalid argument: #{args}"
89
129
  end
90
130
 
91
- # @param values [Array, Arrow::Array, Vector]
131
+ # @param values [Array, Arrow::Array, Vector]
92
132
  def is_in(*values)
93
133
  self_data = chunked? ? data.pack : data
94
134
 
@@ -100,7 +140,7 @@ module RedAmber
100
140
  Array(values).flatten
101
141
  end
102
142
 
103
- Vector.new(self_data.is_in(array))
143
+ Vector.create(self_data.is_in(array))
104
144
  end
105
145
 
106
146
  # Arrow's support required
@@ -108,28 +148,240 @@ module RedAmber
108
148
  to_a.index(element)
109
149
  end
110
150
 
151
+ # Drops nils in self and returns a new Vector as a result.
152
+ #
153
+ # @return [Vector]
154
+ # a Vector without nils.
155
+ #
156
+ def drop_nil
157
+ datum = find(:drop_null).execute([data])
158
+ Vector.create(datum.value)
159
+ end
160
+
161
+ # Arrange values in Vector.
162
+ #
163
+ # @param order [Symbol]
164
+ # sort order.
165
+ # - `:+`, `:ascending` or without argument will sort in increasing order.
166
+ # - `:-` or `:descending` will sort in decreasing order.
167
+ # @return [Vector]
168
+ # sorted Vector.
169
+ # @example Sort in increasing order (default)
170
+ # Vector.new(%w[B D A E C]).sort
171
+ # # same as #sort(:+)
172
+ # # same as #sort(:ascending)
173
+ #
174
+ # # =>
175
+ # #<RedAmber::Vector(:string, size=5):0x000000000000c134>
176
+ # ["A", "B", "C", "D", "E"]
177
+ #
178
+ # @example Sort in decreasing order
179
+ # Vector.new(%w[B D A E C]).sort(:-)
180
+ # # same as #sort(:descending)
181
+ #
182
+ # # =>
183
+ # #<RedAmber::Vector(:string, size=5):0x000000000000c148>
184
+ # ["E", "D", "C", "B", "A"]
185
+ #
186
+ # @since 0.4.0
187
+ #
188
+ def sort(order = :ascending)
189
+ order =
190
+ case order.to_sym
191
+ when :+, :ascending, :increasing
192
+ :ascending
193
+ when :-, :descending, :decreasing
194
+ :descending
195
+ else
196
+ raise VectorArgumentError, "illegal order option: #{order}"
197
+ end
198
+ take(sort_indices(order: order))
199
+ end
200
+
201
+ # Returns numerical rank of self.
202
+ # - Nil values are considered greater than any value.
203
+ # - NaN values are considered greater than any value but smaller than nil values.
204
+ # - Tiebreakers are ranked in order of appearance.
205
+ # - `RankOptions` in C++ function is not implemented in C GLib yet.
206
+ # This method is currently fixed to the default behavior.
207
+ #
208
+ # @return [Vector]
209
+ # 0-based rank of self (0...size in range).
210
+ # @example Rank of float Vector
211
+ # fv = Vector.new(0.1, nil, Float::NAN, 0.2, 0.1); fv
212
+ #
213
+ # # =>
214
+ # #<RedAmber::Vector(:double, size=5):0x000000000000c65c>
215
+ # [0.1, nil, NaN, 0.2, 0.1]
216
+ #
217
+ # fv.rank
218
+ #
219
+ # # =>
220
+ # #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
221
+ # [0, 4, 3, 2, 1]
222
+ #
223
+ # @example Rank of string Vector
224
+ # sv = Vector.new("A", "B", nil, "A", "C"); sv
225
+ #
226
+ # # =>
227
+ # #<RedAmber::Vector(:string, size=5):0x0000000000003854>
228
+ # ["A", "B", nil, "A", "C"]
229
+ #
230
+ # sv.rank
231
+ #
232
+ # # =>
233
+ # #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
234
+ # [0, 2, 4, 1, 3]
235
+ #
236
+ # @since 0.4.0
237
+ #
238
+ def rank
239
+ datum = Arrow::Function.find(:rank).execute([data])
240
+ Vector.create(datum.value) - 1
241
+ end
242
+
243
+ # Pick up elements at random.
244
+ #
245
+ # @overload sample()
246
+ # Return a randomly selected element.
247
+ # This is one of the aggregation functions.
248
+ #
249
+ # @return [scalar]
250
+ # one of the elements in self.
251
+ # @example Sample an element
252
+ # v = Vector.new('A'..'H'); v
253
+ #
254
+ # # =>
255
+ # #<RedAmber::Vector(:string, size=8):0x0000000000011b20>
256
+ # ["A", "B", "C", "D", "E", "F", "G", "H"]
257
+ #
258
+ # v.sample
259
+ #
260
+ # # =>
261
+ # "C"
262
+ #
263
+ # @overload sample(n)
264
+ # Pick up n elements at random.
265
+ #
266
+ # @param n [Integer]
267
+ # positive number of elements to pick.
268
+ # If n is smaller than or equal to `size`, elements are picked without repetition.
269
+ # If n is greater than `size`, elements are picked repeatedly.
270
+ # @return [Vector]
271
+ # sampled elements.
272
+ # If n == 1 (in case of `sample(1)`), it returns a Vector of size == 1
273
+ # not a scalar.
274
+ # @example Sample Vector in size 1
275
+ # v.sample(1)
276
+ #
277
+ # # =>
278
+ # #<RedAmber::Vector(:string, size=1):0x000000000001a3b0>
279
+ # ["H"]
280
+ #
281
+ # @example Sample same size of self: every element is picked in random order
282
+ # v.sample(8)
283
+ #
284
+ # # =>
285
+ # #<RedAmber::Vector(:string, size=8):0x000000000001bda0>
286
+ # ["H", "D", "B", "F", "E", "A", "G", "C"]
287
+ #
288
+ # @example Over sampling: "E" and "A" are sampled repeatedly
289
+ # v.sample(9)
290
+ #
291
+ # # =>
292
+ # #<RedAmber::Vector(:string, size=9):0x000000000001d790>
293
+ # ["E", "E", "A", "D", "H", "C", "A", "F", "H"]
294
+ #
295
+ # @overload sample(prop)
296
+ # Pick up elements by proportion `prop` at random.
297
+ #
298
+ # @param prop [Float]
299
+ # positive proportion of elements to pick.
300
+ # Absolute number of elements to pick: `prop*size` is rounded (by `half: :up`).
301
+ # If prop is smaller than or equal to 1.0, elements are picked without repetition.
302
+ # If prop is greater than 1.0, some elements are picked repeatedly.
303
+ # @return [Vector]
304
+ # sampled elements.
305
+ # If only one element is picked, it returns a Vector of size == 1
306
+ # not a scalar.
307
+ # @example Sample same size of self: every element is picked in random order
308
+ # v.sample(1.0)
309
+ #
310
+ # # =>
311
+ # #<RedAmber::Vector(:string, size=8):0x000000000001bda0>
312
+ # ["D", "H", "F", "C", "A", "B", "E", "G"]
313
+ #
314
+ # @example 2 times over sampling
315
+ # v.sample(2.0)
316
+ #
317
+ # # =>
318
+ # #<RedAmber::Vector(:string, size=16):0x00000000000233e8>
319
+ # ["H", "B", "C", "B", "C", "A", "F", "A", "E", "C", "H", "F", "F", "A", ... ]
320
+ #
321
+ # @since 0.4.0
322
+ #
323
+ def sample(n_or_prop = nil)
324
+ require 'arrow-numo-narray'
325
+
326
+ return nil if size == 0
327
+
328
+ n_sample =
329
+ case n_or_prop
330
+ in Integer
331
+ n_or_prop
332
+ in Float
333
+ (n_or_prop * size).round
334
+ in nil
335
+ return to_a.sample
336
+ else
337
+ raise VectorArgumentError, "must specify Integer or Float but #{n_or_prop}"
338
+ end
339
+ if n_or_prop < 0
340
+ raise VectorArgumentError, '#sample does not accept negative number.'
341
+ end
342
+ return Vector.new([]) if n_sample == 0
343
+
344
+ over_sample = [8 * size, n_sample].max
345
+ over_size = n_sample > size ? n_sample / size * size * 2 : size
346
+ over_vector =
347
+ Vector.create(Numo::UInt32.new(over_size).rand(over_sample).to_arrow_array)
348
+ indices = over_vector.rank.take(*0...n_sample)
349
+ take(indices - ((indices / size) * size))
350
+ end
351
+
111
352
  private
112
353
 
113
354
  # Accepts indices by numeric Vector
114
355
  def take_by_vector(indices)
115
- raise VectorTypeError, "Indices must be numeric Vector: #{indices}" unless indices.numeric?
116
- raise VectorArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
356
+ indices = (indices < 0).if_else(indices + size, indices) if (indices < 0).any?
117
357
 
118
- normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
119
- raise VectorArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
358
+ min, max = indices.min_max
359
+ raise VectorArgumentError, "Index out of range: #{min}" if min < 0
360
+ raise VectorArgumentError, "Index out of range: #{max}" if max >= size
120
361
 
121
- index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
362
+ index_array =
363
+ if indices.float?
364
+ Arrow::UInt64ArrayBuilder.build(indices.data)
365
+ else
366
+ indices.data
367
+ end
122
368
 
123
- datum = find(:take).execute([data, index_array]) # :array_take will fail with ChunkedArray
124
- Vector.new(datum.value)
369
+ # :array_take will fail with ChunkedArray
370
+ find(:take).execute([data, index_array]).value
125
371
  end
126
372
 
127
373
  # Accepts booleans by Arrow::BooleanArray
128
374
  def filter_by_array(boolean_array)
129
- raise VectorArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
375
+ unless boolean_array.length == size
376
+ raise VectorArgumentError, 'Booleans must be same size as self.'
377
+ end
378
+
379
+ find(:array_filter).execute([data, boolean_array]).value
380
+ end
130
381
 
131
- datum = find(:array_filter).execute([data, boolean_array])
132
- Vector.new(datum.value)
382
+ def scalar_or_array(arrow_array)
383
+ a = arrow_array.to_a
384
+ a.size > 1 ? a : a[0]
133
385
  end
134
386
  end
135
387
  end