red_amber 0.2.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -51
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +203 -1
  5. data/Gemfile +2 -1
  6. data/LICENSE +1 -1
  7. data/README.md +61 -45
  8. data/benchmark/basic.yml +11 -4
  9. data/benchmark/combine.yml +3 -4
  10. data/benchmark/dataframe.yml +62 -0
  11. data/benchmark/group.yml +7 -1
  12. data/benchmark/reshape.yml +6 -2
  13. data/benchmark/vector.yml +63 -0
  14. data/doc/DataFrame.md +35 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +295 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +537 -68
  20. data/lib/red_amber/data_frame_combinable.rb +776 -123
  21. data/lib/red_amber/data_frame_displayable.rb +248 -18
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +81 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +216 -21
  25. data/lib/red_amber/data_frame_selectable.rb +781 -120
  26. data/lib/red_amber/data_frame_variable_operation.rb +561 -85
  27. data/lib/red_amber/group.rb +195 -21
  28. data/lib/red_amber/helper.rb +114 -32
  29. data/lib/red_amber/refinements.rb +206 -0
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +435 -58
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +321 -69
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +397 -24
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +15 -1
  39. data/red_amber.gemspec +4 -3
  40. metadata +19 -11
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -294
@@ -4,91 +4,131 @@
4
4
  # reference: https://arrow.apache.org/docs/cpp/compute.html
5
5
 
6
6
  module RedAmber
7
- # mix-ins for class Vector
8
- # Functions to select some data.
7
+ # Mix-in for class Vector
8
+ # Functions to select some data.
9
9
  module VectorSelectable
10
- def drop_nil
11
- datum = find(:drop_null).execute([data])
12
- Vector.new(datum.value)
13
- end
10
+ using RefineArray
11
+ using RefineArrayLike
12
+
13
+ # Select elements in the self by indices.
14
+ #
15
+ # @param indices [Array<Numeric>, Vector]
16
+ # an array-like of indices.
17
+ # @yieldreturn [Array<Numeric>, Vector]
18
+ # an array-like of indices from the block.
19
+ # @return [Vector]
20
+ # vector by selected elements.
21
+ #
22
+ # TODO: support for the option `boundscheck: true`
23
+ def take(*indices, &block)
24
+ if block
25
+ unless indices.empty?
26
+ raise VectorArgumentError, 'Must not specify both arguments and block.'
27
+ end
14
28
 
15
- # vector calculation version of selection by indices
16
- # TODO: support for option {boundscheck: true}
17
- def take(*indices)
18
- indices.flatten!
19
- return Vector.new([]) if indices.empty?
29
+ indices = [yield]
30
+ end
31
+
32
+ vector =
33
+ case indices
34
+ in [Vector => v] if v.numeric?
35
+ return Vector.create(take_by_vector(v))
36
+ in []
37
+ return Vector.new
38
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
39
+ Vector.create(aa)
40
+ else
41
+ Vector.new(indices.flatten)
42
+ end
20
43
 
21
- indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
22
- indices = Vector.new(indices) unless indices.is_a?(Vector)
44
+ unless vector.numeric?
45
+ raise VectorArgumentError, "argument must be a integers: #{indices}"
46
+ end
23
47
 
24
- take_by_vector(indices) # returns sub Vector
48
+ Vector.create(take_by_vector(vector))
25
49
  end
26
50
 
27
- # TODO: support for option {null_selection_behavior: :drop}
51
+ # Select elements in the self by booleans.
52
+ #
53
+ # @param booleans [Array<true, false, nil>, Vector]
54
+ # an array-like of booleans.
55
+ # @yieldreturn [Array<true, false, nil>, Vector]
56
+ # an array-like of booleans from the block.
57
+ # @return [Vector]
58
+ # vector by selected elements.
59
+ #
60
+ # TODO: support for the option `null_selection_behavior: :drop`
28
61
  def filter(*booleans, &block)
29
62
  if block
30
- raise VectorArgumentError, 'Must not specify both arguments and block.' unless booleans.empty?
63
+ unless booleans.empty?
64
+ raise VectorArgumentError, 'Must not specify both arguments and block.'
65
+ end
31
66
 
32
67
  booleans = [yield]
33
68
  end
34
69
 
35
- booleans.flatten!
36
- return Vector.new([]) if booleans.empty?
70
+ case booleans
71
+ in [Vector => v]
72
+ raise VectorTypeError, 'Argument is not a boolean.' unless v.boolean?
37
73
 
38
- b = booleans[0]
39
- boolean_array =
40
- case b
41
- when Vector
42
- raise VectorTypeError, 'Argument is not a boolean.' unless b.boolean?
43
-
44
- b.data
45
- when Arrow::BooleanArray
46
- b
74
+ Vector.create(filter_by_array(v.data))
75
+ in [Arrow::BooleanArray => ba]
76
+ Vector.create(filter_by_array(ba))
77
+ in []
78
+ Vector.new
79
+ else
80
+ booleans.flatten!
81
+ a = Arrow::Array.new(booleans)
82
+ if a.boolean?
83
+ Vector.create(filter_by_array(a))
84
+ elsif booleans.compact.empty? # [nil, nil] becomes string array
85
+ Vector.new
47
86
  else
48
- raise VectorTypeError, 'Argument is not a boolean.' unless booleans?(booleans)
49
-
50
- Arrow::BooleanArray.new(booleans)
87
+ raise VectorTypeError, "Argument is not a boolean: #{booleans}"
51
88
  end
52
-
53
- filter_by_array(boolean_array) # returns sub Vector
89
+ end
54
90
  end
55
91
  alias_method :select, :filter
56
92
  alias_method :find_all, :filter
57
93
 
58
- # @param indices
59
- # @param booleans
94
+ # Select elements in the self by indices or booleans.
95
+ #
96
+ # @param args [Array<Numeric, true, false, nil>, Vector]
97
+ # specifier. Indices or booleans.
98
+ # @yieldparam [Array<Numeric, true, false, nil>, Vector]
99
+ # specifier. Indices or booleans.
100
+ # @return [scalar, Array]
101
+ # returns scalar or array.
102
+ #
60
103
  def [](*args)
61
- args.flatten!
62
- return Vector.new([]) if args.empty?
63
-
64
- arg = args[0]
65
- case arg
66
- when Vector
67
- return take_by_vector(arg) if arg.numeric?
68
- return filter_by_array(arg.data) if arg.boolean?
69
-
70
- raise VectorTypeError, "Argument must be numeric or boolean: #{arg}"
71
- when Arrow::BooleanArray
72
- return filter_by_array(arg)
73
- when Arrow::Array
74
- array = arg
75
- when Range
76
- array = normalize_element(arg)
77
- else
78
- unless arg.is_a?(Numeric) || booleans?([arg])
79
- raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
104
+ array =
105
+ case args
106
+ in [Vector => v]
107
+ return scalar_or_array(take_by_vector(v)) if v.numeric?
108
+ return scalar_or_array(filter_by_array(v.data)) if v.boolean?
109
+
110
+ raise VectorTypeError, "Argument must be numeric or boolean: #{args}"
111
+ in [Arrow::BooleanArray => ba]
112
+ return scalar_or_array(filter_by_array(ba))
113
+ in []
114
+ return nil
115
+ in [Arrow::Array => arrow_array]
116
+ arrow_array
117
+ in [Range => r]
118
+ Arrow::Array.new(parse_range(r, size))
119
+ else
120
+ Arrow::Array.new(args.flatten)
80
121
  end
81
- end
82
- array ||= Arrow::Array.new(args)
83
- return filter_by_array(array) if array.is_a?(Arrow::BooleanArray)
122
+
123
+ return scalar_or_array(filter_by_array(array)) if array.boolean?
84
124
 
85
125
  vector = Vector.new(array)
86
- return take_by_vector(vector) if vector.numeric?
126
+ return scalar_or_array(take_by_vector(vector)) if vector.numeric?
87
127
 
88
128
  raise VectorArgumentError, "Invalid argument: #{args}"
89
129
  end
90
130
 
91
- # @param values [Array, Arrow::Array, Vector]
131
+ # @param values [Array, Arrow::Array, Vector]
92
132
  def is_in(*values)
93
133
  self_data = chunked? ? data.pack : data
94
134
 
@@ -100,7 +140,7 @@ module RedAmber
100
140
  Array(values).flatten
101
141
  end
102
142
 
103
- Vector.new(self_data.is_in(array))
143
+ Vector.create(self_data.is_in(array))
104
144
  end
105
145
 
106
146
  # Arrow's support required
@@ -108,28 +148,240 @@ module RedAmber
108
148
  to_a.index(element)
109
149
  end
110
150
 
151
+ # Drop nil in self and returns a new Vector as a result.
152
+ #
153
+ # @return [Vector]
154
+ # a Vector without nils.
155
+ #
156
+ def drop_nil
157
+ datum = find(:drop_null).execute([data])
158
+ Vector.create(datum.value)
159
+ end
160
+
161
+ # Arrange values in Vector.
162
+ #
163
+ # @param order [Symbol]
164
+ # sort order.
165
+ # - `:+`, `:ascending` or without argument will sort in increasing order.
166
+ # - `:-` or `:descending` will sort in decreasing order.
167
+ # @return [Vector]
168
+ # sorted Vector.
169
+ # @example Sort in increasing order (default)
170
+ # Vector.new(%w[B D A E C]).sort
171
+ # # same as #sort(:+)
172
+ # # same as #sort(:ascending)
173
+ #
174
+ # # =>
175
+ # #<RedAmber::Vector(:string, size=5):0x000000000000c134>
176
+ # ["A", "B", "C", "D", "E"]
177
+ #
178
+ # @example Sort in decreasing order
179
+ # Vector.new(%w[B D A E C]).sort(:-)
180
+ # # same as #sort(:descending)
181
+ #
182
+ # # =>
183
+ # #<RedAmber::Vector(:string, size=5):0x000000000000c148>
184
+ # ["E", "D", "C", "B", "A"]
185
+ #
186
+ # @since 0.4.0
187
+ #
188
+ def sort(order = :ascending)
189
+ order =
190
+ case order.to_sym
191
+ when :+, :ascending, :increasing
192
+ :ascending
193
+ when :-, :descending, :decreasing
194
+ :descending
195
+ else
196
+ raise VectorArgumentError, "illegal order option: #{order}"
197
+ end
198
+ take(sort_indices(order: order))
199
+ end
200
+
201
+ # Returns numerical rank of self.
202
+ # - Nil values are considered greater than any value.
203
+ # - NaN values are considered greater than any value but smaller than nil values.
204
+ # - Tiebreakers are ranked in order of appearance.
205
+ # - `RankOptions` in C++ function is not implemented in C GLib yet.
206
+ # This method is currently fixed to the default behavior.
207
+ #
208
+ # @return [Vector]
209
+ # 0-based rank of self (0...size in range).
210
+ # @example Rank of float Vector
211
+ # fv = Vector.new(0.1, nil, Float::NAN, 0.2, 0.1); fv
212
+ #
213
+ # # =>
214
+ # #<RedAmber::Vector(:double, size=5):0x000000000000c65c>
215
+ # [0.1, nil, NaN, 0.2, 0.1]
216
+ #
217
+ # fv.rank
218
+ #
219
+ # # =>
220
+ # #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
221
+ # [0, 4, 3, 2, 1]
222
+ #
223
+ # @example Rank of string Vector
224
+ # sv = Vector.new("A", "B", nil, "A", "C"); sv
225
+ #
226
+ # # =>
227
+ # #<RedAmber::Vector(:string, size=5):0x0000000000003854>
228
+ # ["A", "B", nil, "A", "C"]
229
+ #
230
+ # sv.rank
231
+ #
232
+ # # =>
233
+ # #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
234
+ # [0, 2, 4, 1, 3]
235
+ #
236
+ # @since 0.4.0
237
+ #
238
+ def rank
239
+ datum = Arrow::Function.find(:rank).execute([data])
240
+ Vector.create(datum.value) - 1
241
+ end
242
+
243
+ # Pick up elements at random.
244
+ #
245
+ # @overload sample()
246
+ # Return a randomly selected element.
247
+ # This is one of an aggregation function.
248
+ #
249
+ # @return [scalar]
250
+ # one of an element in self.
251
+ # @example Sample a element
252
+ # v = Vector.new('A'..'H'); v
253
+ #
254
+ # # =>
255
+ # #<RedAmber::Vector(:string, size=8):0x0000000000011b20>
256
+ # ["A", "B", "C", "D", "E", "F", "G", "H"]
257
+ #
258
+ # v.sample
259
+ #
260
+ # # =>
261
+ # "C"
262
+ #
263
+ # @overload sample(n)
264
+ # Pick up n elements at random.
265
+ #
266
+ # @param n [Integer]
267
+ # positive number of elements to pick.
268
+ # If n is smaller or equal to size, elements are picked by non-repeating.
269
+ # If n is greater than `size`, elements are picked repeatedly.
270
+ # @return [Vector]
271
+ # sampled elements.
272
+ # If n == 1 (in case of `sample(1)`), it returns a Vector of size == 1
273
+ # not a scalar.
274
+ # @example Sample Vector in size 1
275
+ # v.sample(1)
276
+ #
277
+ # # =>
278
+ # #<RedAmber::Vector(:string, size=1):0x000000000001a3b0>
279
+ # ["H"]
280
+ #
281
+ # @example Sample same size of self: every element is picked in random order
282
+ # v.sample(8)
283
+ #
284
+ # # =>
285
+ # #<RedAmber::Vector(:string, size=8):0x000000000001bda0>
286
+ # ["H", "D", "B", "F", "E", "A", "G", "C"]
287
+ #
288
+ # @example Over sampling: "E" and "A" are sampled repeatedly
289
+ # v.sample(9)
290
+ #
291
+ # # =>
292
+ # #<RedAmber::Vector(:string, size=9):0x000000000001d790>
293
+ # ["E", "E", "A", "D", "H", "C", "A", "F", "H"]
294
+ #
295
+ # @overload sample(prop)
296
+ # Pick up elements by proportion `prop` at random.
297
+ #
298
+ # @param prop [Float]
299
+ # positive proportion of elements to pick.
300
+ # Absolute number of elements to pick:`prop*size` is rounded (by `half: :up``).
301
+ # If prop is smaller or equal to 1.0, elements are picked by non-repeating.
302
+ # If prop is greater than 1.0, some elements are picked repeatedly.
303
+ # @return [Vector]
304
+ # sampled elements.
305
+ # If picked element is only one, it returns a Vector of size == 1
306
+ # not a scalar.
307
+ # @example Sample same size of self: every element is picked in random order
308
+ # v.sample(1.0)
309
+ #
310
+ # # =>
311
+ # #<RedAmber::Vector(:string, size=8):0x000000000001bda0>
312
+ # ["D", "H", "F", "C", "A", "B", "E", "G"]
313
+ #
314
+ # @example 2 times over sampling
315
+ # v.sample(2.0)
316
+ #
317
+ # # =>
318
+ # #<RedAmber::Vector(:string, size=16):0x00000000000233e8>
319
+ # ["H", "B", "C", "B", "C", "A", "F", "A", "E", "C", "H", "F", "F", "A", ... ]
320
+ #
321
+ # @since 0.4.0
322
+ #
323
+ def sample(n_or_prop = nil)
324
+ require 'arrow-numo-narray'
325
+
326
+ return nil if size == 0
327
+
328
+ n_sample =
329
+ case n_or_prop
330
+ in Integer
331
+ n_or_prop
332
+ in Float
333
+ (n_or_prop * size).round
334
+ in nil
335
+ return to_a.sample
336
+ else
337
+ raise VectorArgumentError, "must specify Integer or Float but #{n_or_prop}"
338
+ end
339
+ if n_or_prop < 0
340
+ raise VectorArgumentError, '#sample does not accept negative number.'
341
+ end
342
+ return Vector.new([]) if n_sample == 0
343
+
344
+ over_sample = [8 * size, n_sample].max
345
+ over_size = n_sample > size ? n_sample / size * size * 2 : size
346
+ over_vector =
347
+ Vector.create(Numo::UInt32.new(over_size).rand(over_sample).to_arrow_array)
348
+ indices = over_vector.rank.take(*0...n_sample)
349
+ take(indices - ((indices / size) * size))
350
+ end
351
+
111
352
  private
112
353
 
113
354
  # Accepts indices by numeric Vector
114
355
  def take_by_vector(indices)
115
- raise VectorTypeError, "Indices must be numeric Vector: #{indices}" unless indices.numeric?
116
- raise VectorArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
356
+ indices = (indices < 0).if_else(indices + size, indices) if (indices < 0).any?
117
357
 
118
- normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
119
- raise VectorArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
358
+ min, max = indices.min_max
359
+ raise VectorArgumentError, "Index out of range: #{min}" if min < 0
360
+ raise VectorArgumentError, "Index out of range: #{max}" if max >= size
120
361
 
121
- index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
362
+ index_array =
363
+ if indices.float?
364
+ Arrow::UInt64ArrayBuilder.build(indices.data)
365
+ else
366
+ indices.data
367
+ end
122
368
 
123
- datum = find(:take).execute([data, index_array]) # :array_take will fail with ChunkedArray
124
- Vector.new(datum.value)
369
+ # :array_take will fail with ChunkedArray
370
+ find(:take).execute([data, index_array]).value
125
371
  end
126
372
 
127
373
  # Accepts booleans by Arrow::BooleanArray
128
374
  def filter_by_array(boolean_array)
129
- raise VectorArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
375
+ unless boolean_array.length == size
376
+ raise VectorArgumentError, 'Booleans must be same size as self.'
377
+ end
378
+
379
+ find(:array_filter).execute([data, boolean_array]).value
380
+ end
130
381
 
131
- datum = find(:array_filter).execute([data, boolean_array])
132
- Vector.new(datum.value)
382
+ def scalar_or_array(arrow_array)
383
+ a = arrow_array.to_a
384
+ a.size > 1 ? a : a[0]
133
385
  end
134
386
  end
135
387
  end