red_amber 0.2.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +133 -51
- data/.yardopts +2 -0
- data/CHANGELOG.md +203 -1
- data/Gemfile +2 -1
- data/LICENSE +1 -1
- data/README.md +61 -45
- data/benchmark/basic.yml +11 -4
- data/benchmark/combine.yml +3 -4
- data/benchmark/dataframe.yml +62 -0
- data/benchmark/group.yml +7 -1
- data/benchmark/reshape.yml +6 -2
- data/benchmark/vector.yml +63 -0
- data/doc/DataFrame.md +35 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +295 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +537 -68
- data/lib/red_amber/data_frame_combinable.rb +776 -123
- data/lib/red_amber/data_frame_displayable.rb +248 -18
- data/lib/red_amber/data_frame_indexable.rb +122 -19
- data/lib/red_amber/data_frame_loadsave.rb +81 -10
- data/lib/red_amber/data_frame_reshaping.rb +216 -21
- data/lib/red_amber/data_frame_selectable.rb +781 -120
- data/lib/red_amber/data_frame_variable_operation.rb +561 -85
- data/lib/red_amber/group.rb +195 -21
- data/lib/red_amber/helper.rb +114 -32
- data/lib/red_amber/refinements.rb +206 -0
- data/lib/red_amber/subframes.rb +1066 -0
- data/lib/red_amber/vector.rb +435 -58
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +387 -0
- data/lib/red_amber/vector_selectable.rb +321 -69
- data/lib/red_amber/vector_unary_element_wise.rb +436 -0
- data/lib/red_amber/vector_updatable.rb +397 -24
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +15 -1
- data/red_amber.gemspec +4 -3
- metadata +19 -11
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -294
@@ -4,91 +4,131 @@
|
|
4
4
|
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
5
5
|
|
6
6
|
module RedAmber
|
7
|
-
#
|
8
|
-
#
|
7
|
+
# Mix-in for class Vector
|
8
|
+
# Functions to select some data.
|
9
9
|
module VectorSelectable
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
using RefineArray
|
11
|
+
using RefineArrayLike
|
12
|
+
|
13
|
+
# Select elements in the self by indices.
|
14
|
+
#
|
15
|
+
# @param indices [Array<Numeric>, Vector]
|
16
|
+
# an array-like of indices.
|
17
|
+
# @yieldreturn [Array<Numeric>, Vector]
|
18
|
+
# an array-like of indices from the block.
|
19
|
+
# @return [Vector]
|
20
|
+
# vector by selected elements.
|
21
|
+
#
|
22
|
+
# TODO: support for the option `boundscheck: true`
|
23
|
+
def take(*indices, &block)
|
24
|
+
if block
|
25
|
+
unless indices.empty?
|
26
|
+
raise VectorArgumentError, 'Must not specify both arguments and block.'
|
27
|
+
end
|
14
28
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
29
|
+
indices = [yield]
|
30
|
+
end
|
31
|
+
|
32
|
+
vector =
|
33
|
+
case indices
|
34
|
+
in [Vector => v] if v.numeric?
|
35
|
+
return Vector.create(take_by_vector(v))
|
36
|
+
in []
|
37
|
+
return Vector.new
|
38
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
39
|
+
Vector.create(aa)
|
40
|
+
else
|
41
|
+
Vector.new(indices.flatten)
|
42
|
+
end
|
20
43
|
|
21
|
-
|
22
|
-
|
44
|
+
unless vector.numeric?
|
45
|
+
raise VectorArgumentError, "argument must be a integers: #{indices}"
|
46
|
+
end
|
23
47
|
|
24
|
-
take_by_vector(
|
48
|
+
Vector.create(take_by_vector(vector))
|
25
49
|
end
|
26
50
|
|
27
|
-
#
|
51
|
+
# Select elements in the self by booleans.
|
52
|
+
#
|
53
|
+
# @param booleans [Array<true, false, nil>, Vector]
|
54
|
+
# an array-like of booleans.
|
55
|
+
# @yieldreturn [Array<true, false, nil>, Vector]
|
56
|
+
# an array-like of booleans from the block.
|
57
|
+
# @return [Vector]
|
58
|
+
# vector by selected elements.
|
59
|
+
#
|
60
|
+
# TODO: support for the option `null_selection_behavior: :drop`
|
28
61
|
def filter(*booleans, &block)
|
29
62
|
if block
|
30
|
-
|
63
|
+
unless booleans.empty?
|
64
|
+
raise VectorArgumentError, 'Must not specify both arguments and block.'
|
65
|
+
end
|
31
66
|
|
32
67
|
booleans = [yield]
|
33
68
|
end
|
34
69
|
|
35
|
-
booleans
|
36
|
-
|
70
|
+
case booleans
|
71
|
+
in [Vector => v]
|
72
|
+
raise VectorTypeError, 'Argument is not a boolean.' unless v.boolean?
|
37
73
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
74
|
+
Vector.create(filter_by_array(v.data))
|
75
|
+
in [Arrow::BooleanArray => ba]
|
76
|
+
Vector.create(filter_by_array(ba))
|
77
|
+
in []
|
78
|
+
Vector.new
|
79
|
+
else
|
80
|
+
booleans.flatten!
|
81
|
+
a = Arrow::Array.new(booleans)
|
82
|
+
if a.boolean?
|
83
|
+
Vector.create(filter_by_array(a))
|
84
|
+
elsif booleans.compact.empty? # [nil, nil] becomes string array
|
85
|
+
Vector.new
|
47
86
|
else
|
48
|
-
raise VectorTypeError,
|
49
|
-
|
50
|
-
Arrow::BooleanArray.new(booleans)
|
87
|
+
raise VectorTypeError, "Argument is not a boolean: #{booleans}"
|
51
88
|
end
|
52
|
-
|
53
|
-
filter_by_array(boolean_array) # returns sub Vector
|
89
|
+
end
|
54
90
|
end
|
55
91
|
alias_method :select, :filter
|
56
92
|
alias_method :find_all, :filter
|
57
93
|
|
58
|
-
#
|
59
|
-
#
|
94
|
+
# Select elements in the self by indices or booleans.
|
95
|
+
#
|
96
|
+
# @param args [Array<Numeric, true, false, nil>, Vector]
|
97
|
+
# specifier. Indices or booleans.
|
98
|
+
# @yieldparam [Array<Numeric, true, false, nil>, Vector]
|
99
|
+
# specifier. Indices or booleans.
|
100
|
+
# @return [scalar, Array]
|
101
|
+
# returns scalar or array.
|
102
|
+
#
|
60
103
|
def [](*args)
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
unless arg.is_a?(Numeric) || booleans?([arg])
|
79
|
-
raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
|
104
|
+
array =
|
105
|
+
case args
|
106
|
+
in [Vector => v]
|
107
|
+
return scalar_or_array(take_by_vector(v)) if v.numeric?
|
108
|
+
return scalar_or_array(filter_by_array(v.data)) if v.boolean?
|
109
|
+
|
110
|
+
raise VectorTypeError, "Argument must be numeric or boolean: #{args}"
|
111
|
+
in [Arrow::BooleanArray => ba]
|
112
|
+
return scalar_or_array(filter_by_array(ba))
|
113
|
+
in []
|
114
|
+
return nil
|
115
|
+
in [Arrow::Array => arrow_array]
|
116
|
+
arrow_array
|
117
|
+
in [Range => r]
|
118
|
+
Arrow::Array.new(parse_range(r, size))
|
119
|
+
else
|
120
|
+
Arrow::Array.new(args.flatten)
|
80
121
|
end
|
81
|
-
|
82
|
-
array
|
83
|
-
return filter_by_array(array) if array.is_a?(Arrow::BooleanArray)
|
122
|
+
|
123
|
+
return scalar_or_array(filter_by_array(array)) if array.boolean?
|
84
124
|
|
85
125
|
vector = Vector.new(array)
|
86
|
-
return take_by_vector(vector) if vector.numeric?
|
126
|
+
return scalar_or_array(take_by_vector(vector)) if vector.numeric?
|
87
127
|
|
88
128
|
raise VectorArgumentError, "Invalid argument: #{args}"
|
89
129
|
end
|
90
130
|
|
91
|
-
#
|
131
|
+
# @param values [Array, Arrow::Array, Vector]
|
92
132
|
def is_in(*values)
|
93
133
|
self_data = chunked? ? data.pack : data
|
94
134
|
|
@@ -100,7 +140,7 @@ module RedAmber
|
|
100
140
|
Array(values).flatten
|
101
141
|
end
|
102
142
|
|
103
|
-
Vector.
|
143
|
+
Vector.create(self_data.is_in(array))
|
104
144
|
end
|
105
145
|
|
106
146
|
# Arrow's support required
|
@@ -108,28 +148,240 @@ module RedAmber
|
|
108
148
|
to_a.index(element)
|
109
149
|
end
|
110
150
|
|
151
|
+
# Drops nils in self and returns a new Vector as a result.
|
152
|
+
#
|
153
|
+
# @return [Vector]
|
154
|
+
# a Vector without nils.
|
155
|
+
#
|
156
|
+
def drop_nil
|
157
|
+
datum = find(:drop_null).execute([data])
|
158
|
+
Vector.create(datum.value)
|
159
|
+
end
|
160
|
+
|
161
|
+
# Arrange values in Vector.
|
162
|
+
#
|
163
|
+
# @param order [Symbol]
|
164
|
+
# sort order.
|
165
|
+
# - `:+`, `:ascending` or without argument will sort in increasing order.
|
166
|
+
# - `:-` or `:descending` will sort in decreasing order.
|
167
|
+
# @return [Vector]
|
168
|
+
# sorted Vector.
|
169
|
+
# @example Sort in increasing order (default)
|
170
|
+
# Vector.new(%w[B D A E C]).sort
|
171
|
+
# # same as #sort(:+)
|
172
|
+
# # same as #sort(:ascending)
|
173
|
+
#
|
174
|
+
# # =>
|
175
|
+
# #<RedAmber::Vector(:string, size=5):0x000000000000c134>
|
176
|
+
# ["A", "B", "C", "D", "E"]
|
177
|
+
#
|
178
|
+
# @example Sort in decreasing order
|
179
|
+
# Vector.new(%w[B D A E C]).sort(:-)
|
180
|
+
# # same as #sort(:descending)
|
181
|
+
#
|
182
|
+
# # =>
|
183
|
+
# #<RedAmber::Vector(:string, size=5):0x000000000000c148>
|
184
|
+
# ["E", "D", "C", "B", "A"]
|
185
|
+
#
|
186
|
+
# @since 0.4.0
|
187
|
+
#
|
188
|
+
def sort(order = :ascending)
|
189
|
+
order =
|
190
|
+
case order.to_sym
|
191
|
+
when :+, :ascending, :increasing
|
192
|
+
:ascending
|
193
|
+
when :-, :descending, :decreasing
|
194
|
+
:descending
|
195
|
+
else
|
196
|
+
raise VectorArgumentError, "illegal order option: #{order}"
|
197
|
+
end
|
198
|
+
take(sort_indices(order: order))
|
199
|
+
end
|
200
|
+
|
201
|
+
# Returns numerical rank of self.
|
202
|
+
# - Nil values are considered greater than any value.
|
203
|
+
# - NaN values are considered greater than any value but smaller than nil values.
|
204
|
+
# - Tiebreakers are ranked in order of appearance.
|
205
|
+
# - `RankOptions` in C++ function is not implemented in C GLib yet.
|
206
|
+
# This method is currently fixed to the default behavior.
|
207
|
+
#
|
208
|
+
# @return [Vector]
|
209
|
+
# 0-based rank of self (0...size in range).
|
210
|
+
# @example Rank of float Vector
|
211
|
+
# fv = Vector.new(0.1, nil, Float::NAN, 0.2, 0.1); fv
|
212
|
+
#
|
213
|
+
# # =>
|
214
|
+
# #<RedAmber::Vector(:double, size=5):0x000000000000c65c>
|
215
|
+
# [0.1, nil, NaN, 0.2, 0.1]
|
216
|
+
#
|
217
|
+
# fv.rank
|
218
|
+
#
|
219
|
+
# # =>
|
220
|
+
# #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
|
221
|
+
# [0, 4, 3, 2, 1]
|
222
|
+
#
|
223
|
+
# @example Rank of string Vector
|
224
|
+
# sv = Vector.new("A", "B", nil, "A", "C"); sv
|
225
|
+
#
|
226
|
+
# # =>
|
227
|
+
# #<RedAmber::Vector(:string, size=5):0x0000000000003854>
|
228
|
+
# ["A", "B", nil, "A", "C"]
|
229
|
+
#
|
230
|
+
# sv.rank
|
231
|
+
#
|
232
|
+
# # =>
|
233
|
+
# #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
|
234
|
+
# [0, 2, 4, 1, 3]
|
235
|
+
#
|
236
|
+
# @since 0.4.0
|
237
|
+
#
|
238
|
+
def rank
|
239
|
+
datum = Arrow::Function.find(:rank).execute([data])
|
240
|
+
Vector.create(datum.value) - 1
|
241
|
+
end
|
242
|
+
|
243
|
+
# Pick up elements at random.
|
244
|
+
#
|
245
|
+
# @overload sample()
|
246
|
+
# Return a randomly selected element.
|
247
|
+
# This is one of the aggregation functions.
|
248
|
+
#
|
249
|
+
# @return [scalar]
|
250
|
+
# one of the elements in self.
|
251
|
+
# @example Sample an element
|
252
|
+
# v = Vector.new('A'..'H'); v
|
253
|
+
#
|
254
|
+
# # =>
|
255
|
+
# #<RedAmber::Vector(:string, size=8):0x0000000000011b20>
|
256
|
+
# ["A", "B", "C", "D", "E", "F", "G", "H"]
|
257
|
+
#
|
258
|
+
# v.sample
|
259
|
+
#
|
260
|
+
# # =>
|
261
|
+
# "C"
|
262
|
+
#
|
263
|
+
# @overload sample(n)
|
264
|
+
# Pick up n elements at random.
|
265
|
+
#
|
266
|
+
# @param n [Integer]
|
267
|
+
# positive number of elements to pick.
|
268
|
+
# If n is smaller than or equal to `size`, elements are picked without repetition.
|
269
|
+
# If n is greater than `size`, elements are picked repeatedly.
|
270
|
+
# @return [Vector]
|
271
|
+
# sampled elements.
|
272
|
+
# If n == 1 (in case of `sample(1)`), it returns a Vector of size == 1
|
273
|
+
# not a scalar.
|
274
|
+
# @example Sample Vector in size 1
|
275
|
+
# v.sample(1)
|
276
|
+
#
|
277
|
+
# # =>
|
278
|
+
# #<RedAmber::Vector(:string, size=1):0x000000000001a3b0>
|
279
|
+
# ["H"]
|
280
|
+
#
|
281
|
+
# @example Sample same size of self: every element is picked in random order
|
282
|
+
# v.sample(8)
|
283
|
+
#
|
284
|
+
# # =>
|
285
|
+
# #<RedAmber::Vector(:string, size=8):0x000000000001bda0>
|
286
|
+
# ["H", "D", "B", "F", "E", "A", "G", "C"]
|
287
|
+
#
|
288
|
+
# @example Over sampling: "E" and "A" are sampled repeatedly
|
289
|
+
# v.sample(9)
|
290
|
+
#
|
291
|
+
# # =>
|
292
|
+
# #<RedAmber::Vector(:string, size=9):0x000000000001d790>
|
293
|
+
# ["E", "E", "A", "D", "H", "C", "A", "F", "H"]
|
294
|
+
#
|
295
|
+
# @overload sample(prop)
|
296
|
+
# Pick up elements by proportion `prop` at random.
|
297
|
+
#
|
298
|
+
# @param prop [Float]
|
299
|
+
# positive proportion of elements to pick.
|
300
|
+
# Absolute number of elements to pick: `prop * size` is rounded (by `half: :up`).
|
301
|
+
# If prop is smaller than or equal to 1.0, elements are picked without repetition.
|
302
|
+
# If prop is greater than 1.0, some elements are picked repeatedly.
|
303
|
+
# @return [Vector]
|
304
|
+
# sampled elements.
|
305
|
+
# If picked element is only one, it returns a Vector of size == 1
|
306
|
+
# not a scalar.
|
307
|
+
# @example Sample same size of self: every element is picked in random order
|
308
|
+
# v.sample(1.0)
|
309
|
+
#
|
310
|
+
# # =>
|
311
|
+
# #<RedAmber::Vector(:string, size=8):0x000000000001bda0>
|
312
|
+
# ["D", "H", "F", "C", "A", "B", "E", "G"]
|
313
|
+
#
|
314
|
+
# @example 2 times over sampling
|
315
|
+
# v.sample(2.0)
|
316
|
+
#
|
317
|
+
# # =>
|
318
|
+
# #<RedAmber::Vector(:string, size=16):0x00000000000233e8>
|
319
|
+
# ["H", "B", "C", "B", "C", "A", "F", "A", "E", "C", "H", "F", "F", "A", ... ]
|
320
|
+
#
|
321
|
+
# @since 0.4.0
|
322
|
+
#
|
323
|
+
def sample(n_or_prop = nil)
|
324
|
+
require 'arrow-numo-narray'
|
325
|
+
|
326
|
+
return nil if size == 0
|
327
|
+
|
328
|
+
n_sample =
|
329
|
+
case n_or_prop
|
330
|
+
in Integer
|
331
|
+
n_or_prop
|
332
|
+
in Float
|
333
|
+
(n_or_prop * size).round
|
334
|
+
in nil
|
335
|
+
return to_a.sample
|
336
|
+
else
|
337
|
+
raise VectorArgumentError, "must specify Integer or Float but #{n_or_prop}"
|
338
|
+
end
|
339
|
+
if n_or_prop < 0
|
340
|
+
raise VectorArgumentError, '#sample does not accept negative number.'
|
341
|
+
end
|
342
|
+
return Vector.new([]) if n_sample == 0
|
343
|
+
|
344
|
+
over_sample = [8 * size, n_sample].max
|
345
|
+
over_size = n_sample > size ? n_sample / size * size * 2 : size
|
346
|
+
over_vector =
|
347
|
+
Vector.create(Numo::UInt32.new(over_size).rand(over_sample).to_arrow_array)
|
348
|
+
indices = over_vector.rank.take(*0...n_sample)
|
349
|
+
take(indices - ((indices / size) * size))
|
350
|
+
end
|
351
|
+
|
111
352
|
private
|
112
353
|
|
113
354
|
# Accepts indices by numeric Vector
|
114
355
|
def take_by_vector(indices)
|
115
|
-
|
116
|
-
raise VectorArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
|
356
|
+
indices = (indices < 0).if_else(indices + size, indices) if (indices < 0).any?
|
117
357
|
|
118
|
-
|
119
|
-
raise VectorArgumentError, "Index out of range: #{
|
358
|
+
min, max = indices.min_max
|
359
|
+
raise VectorArgumentError, "Index out of range: #{min}" if min < 0
|
360
|
+
raise VectorArgumentError, "Index out of range: #{max}" if max >= size
|
120
361
|
|
121
|
-
index_array =
|
362
|
+
index_array =
|
363
|
+
if indices.float?
|
364
|
+
Arrow::UInt64ArrayBuilder.build(indices.data)
|
365
|
+
else
|
366
|
+
indices.data
|
367
|
+
end
|
122
368
|
|
123
|
-
|
124
|
-
|
369
|
+
# :array_take will fail with ChunkedArray
|
370
|
+
find(:take).execute([data, index_array]).value
|
125
371
|
end
|
126
372
|
|
127
373
|
# Accepts booleans by Arrow::BooleanArray
|
128
374
|
def filter_by_array(boolean_array)
|
129
|
-
|
375
|
+
unless boolean_array.length == size
|
376
|
+
raise VectorArgumentError, 'Booleans must be same size as self.'
|
377
|
+
end
|
378
|
+
|
379
|
+
find(:array_filter).execute([data, boolean_array]).value
|
380
|
+
end
|
130
381
|
|
131
|
-
|
132
|
-
|
382
|
+
def scalar_or_array(arrow_array)
|
383
|
+
a = arrow_array.to_a
|
384
|
+
a.size > 1 ? a : a[0]
|
133
385
|
end
|
134
386
|
end
|
135
387
|
end
|