red_amber 0.2.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +133 -51
- data/.yardopts +2 -0
- data/CHANGELOG.md +203 -1
- data/Gemfile +2 -1
- data/LICENSE +1 -1
- data/README.md +61 -45
- data/benchmark/basic.yml +11 -4
- data/benchmark/combine.yml +3 -4
- data/benchmark/dataframe.yml +62 -0
- data/benchmark/group.yml +7 -1
- data/benchmark/reshape.yml +6 -2
- data/benchmark/vector.yml +63 -0
- data/doc/DataFrame.md +35 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +295 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +537 -68
- data/lib/red_amber/data_frame_combinable.rb +776 -123
- data/lib/red_amber/data_frame_displayable.rb +248 -18
- data/lib/red_amber/data_frame_indexable.rb +122 -19
- data/lib/red_amber/data_frame_loadsave.rb +81 -10
- data/lib/red_amber/data_frame_reshaping.rb +216 -21
- data/lib/red_amber/data_frame_selectable.rb +781 -120
- data/lib/red_amber/data_frame_variable_operation.rb +561 -85
- data/lib/red_amber/group.rb +195 -21
- data/lib/red_amber/helper.rb +114 -32
- data/lib/red_amber/refinements.rb +206 -0
- data/lib/red_amber/subframes.rb +1066 -0
- data/lib/red_amber/vector.rb +435 -58
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +387 -0
- data/lib/red_amber/vector_selectable.rb +321 -69
- data/lib/red_amber/vector_unary_element_wise.rb +436 -0
- data/lib/red_amber/vector_updatable.rb +397 -24
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +15 -1
- data/red_amber.gemspec +4 -3
- metadata +19 -11
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -294
@@ -4,91 +4,131 @@
|
|
4
4
|
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
5
5
|
|
6
6
|
module RedAmber
|
7
|
-
#
|
8
|
-
#
|
7
|
+
# Mix-in for class Vector
|
8
|
+
# Functions to select some data.
|
9
9
|
module VectorSelectable
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
using RefineArray
|
11
|
+
using RefineArrayLike
|
12
|
+
|
13
|
+
# Select elements in the self by indices.
|
14
|
+
#
|
15
|
+
# @param indices [Array<Numeric>, Vector]
|
16
|
+
# an array-like of indices.
|
17
|
+
# @yieldreturn [Array<Numeric>, Vector]
|
18
|
+
# an array-like of indices from the block.
|
19
|
+
# @return [Vector]
|
20
|
+
# vector by selected elements.
|
21
|
+
#
|
22
|
+
# TODO: support for the option `boundscheck: true`
|
23
|
+
def take(*indices, &block)
  # Indices may come from the arguments or from the block, never from both.
  if block && !indices.empty?
    raise VectorArgumentError, 'Must not specify both arguments and block.'
  end

  indices = [yield] if block
  return Vector.new if indices.empty?

  # A lone argument may already be a Vector or an Arrow array.
  sole = indices.size == 1 ? indices.first : nil
  if sole.is_a?(Vector) && sole.numeric?
    return Vector.create(take_by_vector(sole))
  end

  index_vector =
    if sole.is_a?(Arrow::Array) || sole.is_a?(Arrow::ChunkedArray)
      Vector.create(sole)
    else
      # Plain Ruby values (possibly nested arrays) are flattened into a Vector.
      Vector.new(indices.flatten)
    end

  unless index_vector.numeric?
    raise VectorArgumentError, "argument must be a integers: #{indices}"
  end

  Vector.create(take_by_vector(index_vector))
end
|
26
50
|
|
27
|
-
#
|
51
|
+
# Select elements in the self by booleans.
|
52
|
+
#
|
53
|
+
# @param booleans [Array<true, false, nil>, Vector]
|
54
|
+
# an array-like of booleans.
|
55
|
+
# @yieldreturn [Array<true, false, nil>, Vector]
|
56
|
+
# an array-like of booleans from the block.
|
57
|
+
# @return [Vector]
|
58
|
+
# vector by selected elements.
|
59
|
+
#
|
60
|
+
# TODO: support for the option `null_selection_behavior: :drop`
|
28
61
|
def filter(*booleans, &block)
  # Booleans may come from the arguments or from the block, never from both.
  if block && !booleans.empty?
    raise VectorArgumentError, 'Must not specify both arguments and block.'
  end

  booleans = [yield] if block
  return Vector.new if booleans.empty?

  # A lone argument may already be a Vector or an Arrow boolean array.
  sole = booleans.size == 1 ? booleans.first : nil
  if sole.is_a?(Vector)
    raise VectorTypeError, 'Argument is not a boolean.' unless sole.boolean?

    return Vector.create(filter_by_array(sole.data))
  end
  return Vector.create(filter_by_array(sole)) if sole.is_a?(Arrow::BooleanArray)

  # Plain Ruby values: flatten in place and let Arrow infer the array type.
  booleans.flatten!
  mask = Arrow::Array.new(booleans)
  return Vector.create(filter_by_array(mask)) if mask.boolean?

  # [nil, nil] becomes string array
  return Vector.new if booleans.compact.empty?

  raise VectorTypeError, "Argument is not a boolean: #{booleans}"
end
|
55
91
|
alias_method :select, :filter
|
56
92
|
alias_method :find_all, :filter
|
57
93
|
|
58
|
-
#
|
59
|
-
#
|
94
|
+
# Select elements in the self by indices or booleans.
|
95
|
+
#
|
96
|
+
# @param args [Array<Numeric, true, false, nil>, Vector]
|
97
|
+
# specifier. Indices or booleans.
|
98
|
+
# @yieldparam [Array<Numeric, true, false, nil>, Vector]
|
99
|
+
# specifier. Indices or booleans.
|
100
|
+
# @return [scalar, Array]
|
101
|
+
# returns scalar or array.
|
102
|
+
#
|
60
103
|
def [](*args)
  return nil if args.empty?

  # Vectors and Arrow boolean arrays passed as the only argument are
  # dispatched directly, without building an intermediate array.
  sole = args.size == 1 ? args.first : nil
  case sole
  when Vector
    return scalar_or_array(take_by_vector(sole)) if sole.numeric?
    return scalar_or_array(filter_by_array(sole.data)) if sole.boolean?

    raise VectorTypeError, "Argument must be numeric or boolean: #{args}"
  when Arrow::BooleanArray
    return scalar_or_array(filter_by_array(sole))
  end

  # Everything else is normalized to an Arrow array first.
  selector =
    case sole
    when Arrow::Array then sole
    when Range then Arrow::Array.new(parse_range(sole, size))
    else Arrow::Array.new(args.flatten)
    end

  # Boolean selectors filter; numeric selectors are treated as indices.
  return scalar_or_array(filter_by_array(selector)) if selector.boolean?

  index_vector = Vector.new(selector)
  return scalar_or_array(take_by_vector(index_vector)) if index_vector.numeric?

  raise VectorArgumentError, "Invalid argument: #{args}"
end
|
90
130
|
|
91
|
-
#
|
131
|
+
# @param values [Array, Arrow::Array, Vector]
|
92
132
|
def is_in(*values)
|
93
133
|
self_data = chunked? ? data.pack : data
|
94
134
|
|
@@ -100,7 +140,7 @@ module RedAmber
|
|
100
140
|
Array(values).flatten
|
101
141
|
end
|
102
142
|
|
103
|
-
Vector.
|
143
|
+
Vector.create(self_data.is_in(array))
|
104
144
|
end
|
105
145
|
|
106
146
|
# Arrow's support required
|
@@ -108,28 +148,240 @@ module RedAmber
|
|
108
148
|
to_a.index(element)
|
109
149
|
end
|
110
150
|
|
151
|
+
# Drop nil in self and returns a new Vector as a result.
|
152
|
+
#
|
153
|
+
# @return [Vector]
|
154
|
+
# a Vector without nils.
|
155
|
+
#
|
156
|
+
def drop_nil
  # Run the :drop_null function on the underlying data and wrap the
  # resulting value in a new Vector.
  Vector.create(find(:drop_null).execute([data]).value)
end
|
160
|
+
|
161
|
+
# Arrange values in Vector.
|
162
|
+
#
|
163
|
+
# @param order [Symbol]
|
164
|
+
# sort order.
|
165
|
+
# - `:+`, `:ascending` or without argument will sort in increasing order.
|
166
|
+
# - `:-` or `:descending` will sort in decreasing order.
|
167
|
+
# @return [Vector]
|
168
|
+
# sorted Vector.
|
169
|
+
# @example Sort in increasing order (default)
|
170
|
+
# Vector.new(%w[B D A E C]).sort
|
171
|
+
# # same as #sort(:+)
|
172
|
+
# # same as #sort(:ascending)
|
173
|
+
#
|
174
|
+
# # =>
|
175
|
+
# #<RedAmber::Vector(:string, size=5):0x000000000000c134>
|
176
|
+
# ["A", "B", "C", "D", "E"]
|
177
|
+
#
|
178
|
+
# @example Sort in decreasing order
|
179
|
+
# Vector.new(%w[B D A E C]).sort(:-)
|
180
|
+
# # same as #sort(:descending)
|
181
|
+
#
|
182
|
+
# # =>
|
183
|
+
# #<RedAmber::Vector(:string, size=5):0x000000000000c148>
|
184
|
+
# ["E", "D", "C", "B", "A"]
|
185
|
+
#
|
186
|
+
# @since 0.4.0
|
187
|
+
#
|
188
|
+
def sort(order = :ascending)
  # Every accepted spelling maps onto one of the two canonical directions.
  direction = {
    :+ => :ascending,
    :ascending => :ascending,
    :increasing => :ascending,
    :- => :descending,
    :descending => :descending,
    :decreasing => :descending,
  }[order.to_sym]
  raise VectorArgumentError, "illegal order option: #{order}" unless direction

  take(sort_indices(order: direction))
end
|
200
|
+
|
201
|
+
# Returns numerical rank of self.
|
202
|
+
# - Nil values are considered greater than any value.
|
203
|
+
# - NaN values are considered greater than any value but smaller than nil values.
|
204
|
+
# - Tiebreakers are ranked in order of appearance.
|
205
|
+
# - `RankOptions` in C++ function is not implemented in C GLib yet.
|
206
|
+
# This method is currently fixed to the default behavior.
|
207
|
+
#
|
208
|
+
# @return [Vector]
|
209
|
+
# 0-based rank of self (0...size in range).
|
210
|
+
# @example Rank of float Vector
|
211
|
+
# fv = Vector.new(0.1, nil, Float::NAN, 0.2, 0.1); fv
|
212
|
+
#
|
213
|
+
# # =>
|
214
|
+
# #<RedAmber::Vector(:double, size=5):0x000000000000c65c>
|
215
|
+
# [0.1, nil, NaN, 0.2, 0.1]
|
216
|
+
#
|
217
|
+
# fv.rank
|
218
|
+
#
|
219
|
+
# # =>
|
220
|
+
# #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
|
221
|
+
# [0, 4, 3, 2, 1]
|
222
|
+
#
|
223
|
+
# @example Rank of string Vector
|
224
|
+
# sv = Vector.new("A", "B", nil, "A", "C"); sv
|
225
|
+
#
|
226
|
+
# # =>
|
227
|
+
# #<RedAmber::Vector(:string, size=5):0x0000000000003854>
|
228
|
+
# ["A", "B", nil, "A", "C"]
|
229
|
+
#
|
230
|
+
# sv.rank
|
231
|
+
#
|
232
|
+
# # =>
|
233
|
+
# #<RedAmber::Vector(:uint64, size=5):0x0000000000003868>
|
234
|
+
# [0, 2, 4, 1, 3]
|
235
|
+
#
|
236
|
+
# @since 0.4.0
|
237
|
+
#
|
238
|
+
def rank
  ranked = Arrow::Function.find(:rank).execute([data]).value
  # Shift by one to get the 0-based rank this method documents
  # (Arrow's output is presumably 1-based - TODO confirm).
  Vector.create(ranked) - 1
end
|
242
|
+
|
243
|
+
# Pick up elements at random.
|
244
|
+
#
|
245
|
+
# @overload sample()
|
246
|
+
# Return a randomly selected element.
|
247
|
+
# This is one of the aggregation functions.
|
248
|
+
#
|
249
|
+
# @return [scalar]
|
250
|
+
# one of the elements in self.
|
251
|
+
# @example Sample an element
|
252
|
+
# v = Vector.new('A'..'H'); v
|
253
|
+
#
|
254
|
+
# # =>
|
255
|
+
# #<RedAmber::Vector(:string, size=8):0x0000000000011b20>
|
256
|
+
# ["A", "B", "C", "D", "E", "F", "G", "H"]
|
257
|
+
#
|
258
|
+
# v.sample
|
259
|
+
#
|
260
|
+
# # =>
|
261
|
+
# "C"
|
262
|
+
#
|
263
|
+
# @overload sample(n)
|
264
|
+
# Pick up n elements at random.
|
265
|
+
#
|
266
|
+
# @param n [Integer]
|
267
|
+
# positive number of elements to pick.
|
268
|
+
# If n is smaller than or equal to `size`, elements are picked without repetition.
|
269
|
+
# If n is greater than `size`, elements are picked repeatedly.
|
270
|
+
# @return [Vector]
|
271
|
+
# sampled elements.
|
272
|
+
# If n == 1 (in case of `sample(1)`), it returns a Vector of size == 1
|
273
|
+
# not a scalar.
|
274
|
+
# @example Sample Vector in size 1
|
275
|
+
# v.sample(1)
|
276
|
+
#
|
277
|
+
# # =>
|
278
|
+
# #<RedAmber::Vector(:string, size=1):0x000000000001a3b0>
|
279
|
+
# ["H"]
|
280
|
+
#
|
281
|
+
# @example Sample same size of self: every element is picked in random order
|
282
|
+
# v.sample(8)
|
283
|
+
#
|
284
|
+
# # =>
|
285
|
+
# #<RedAmber::Vector(:string, size=8):0x000000000001bda0>
|
286
|
+
# ["H", "D", "B", "F", "E", "A", "G", "C"]
|
287
|
+
#
|
288
|
+
# @example Over sampling: "E" and "A" are sampled repeatedly
|
289
|
+
# v.sample(9)
|
290
|
+
#
|
291
|
+
# # =>
|
292
|
+
# #<RedAmber::Vector(:string, size=9):0x000000000001d790>
|
293
|
+
# ["E", "E", "A", "D", "H", "C", "A", "F", "H"]
|
294
|
+
#
|
295
|
+
# @overload sample(prop)
|
296
|
+
# Pick up elements by proportion `prop` at random.
|
297
|
+
#
|
298
|
+
# @param prop [Float]
|
299
|
+
# positive proportion of elements to pick.
|
300
|
+
# Absolute number of elements to pick: `prop * size` is rounded (by `half: :up`).
|
301
|
+
# If prop is smaller than or equal to 1.0, elements are picked without repetition.
|
302
|
+
# If prop is greater than 1.0, some elements are picked repeatedly.
|
303
|
+
# @return [Vector]
|
304
|
+
# sampled elements.
|
305
|
+
# If picked element is only one, it returns a Vector of size == 1
|
306
|
+
# not a scalar.
|
307
|
+
# @example Sample same size of self: every element is picked in random order
|
308
|
+
# v.sample(1.0)
|
309
|
+
#
|
310
|
+
# # =>
|
311
|
+
# #<RedAmber::Vector(:string, size=8):0x000000000001bda0>
|
312
|
+
# ["D", "H", "F", "C", "A", "B", "E", "G"]
|
313
|
+
#
|
314
|
+
# @example 2 times over sampling
|
315
|
+
# v.sample(2.0)
|
316
|
+
#
|
317
|
+
# # =>
|
318
|
+
# #<RedAmber::Vector(:string, size=16):0x00000000000233e8>
|
319
|
+
# ["H", "B", "C", "B", "C", "A", "F", "A", "E", "C", "H", "F", "F", "A", ... ]
|
320
|
+
#
|
321
|
+
# @since 0.4.0
|
322
|
+
#
|
323
|
+
def sample(n_or_prop = nil)
  # NOTE(review): required at call time rather than at file load,
  # presumably to keep this dependency lazy - confirm.
  require 'arrow-numo-narray'

  return nil if size == 0
  # Without an argument a single element (a scalar) is returned.
  return to_a.sample if n_or_prop.nil?

  count =
    case n_or_prop
    when Integer then n_or_prop
    when Float then (n_or_prop * size).round
    else raise VectorArgumentError, "must specify Integer or Float but #{n_or_prop}"
    end
  raise VectorArgumentError, '#sample does not accept negative number.' if n_or_prop < 0
  return Vector.new([]) if count.zero?

  # Draw random keys, rank them and keep the first `count` ranks.
  # When more elements than `size` are requested, the pool is enlarged to a
  # multiple of `size` so that ranks can wrap around and repeat elements.
  random_ceiling = [8 * size, count].max
  pool_size = count > size ? count / size * size * 2 : size
  random_keys = Numo::UInt32.new(pool_size).rand(random_ceiling).to_arrow_array
  picked = Vector.create(random_keys).rank.take(*0...count)

  # Fold the ranks back into 0...size (rank modulo size).
  take(picked - ((picked / size) * size))
end
|
351
|
+
|
111
352
|
private
|
112
353
|
|
113
354
|
# Accepts indices by numeric Vector
|
114
355
|
def take_by_vector(indices)
  # Negative indices are shifted by `size`, i.e. counted from the end.
  negative = indices < 0
  indices = negative.if_else(indices + size, indices) if negative.any?

  # After shifting, every index must lie within 0...size.
  lowest, highest = indices.min_max
  raise VectorArgumentError, "Index out of range: #{lowest}" if lowest < 0
  raise VectorArgumentError, "Index out of range: #{highest}" if highest >= size

  # Float indices are rebuilt as an unsigned 64-bit integer array.
  positions =
    indices.float? ? Arrow::UInt64ArrayBuilder.build(indices.data) : indices.data

  # :array_take will fail with ChunkedArray
  find(:take).execute([data, positions]).value
end
|
126
372
|
|
127
373
|
# Accepts booleans by Arrow::BooleanArray
|
128
374
|
def filter_by_array(boolean_array)
  # The mask must have exactly one entry per element of self.
  if boolean_array.length != size
    raise VectorArgumentError, 'Booleans must be same size as self.'
  end

  filtered = find(:array_filter).execute([data, boolean_array])
  filtered.value
end
|
130
381
|
|
131
|
-
|
132
|
-
|
382
|
+
# Converts the argument with `to_a`; returns the whole array when it has
# more than one element, otherwise its first element (nil when empty).
def scalar_or_array(arrow_array)
  elements = arrow_array.to_a
  return elements if elements.size > 1

  elements[0]
end
|
134
386
|
end
|
135
387
|
end
|