red_amber 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +114 -39
- data/CHANGELOG.md +203 -31
- data/Gemfile +5 -2
- data/README.md +62 -29
- data/benchmark/basic.yml +86 -0
- data/benchmark/combine.yml +62 -0
- data/benchmark/dataframe.yml +62 -0
- data/benchmark/drop_nil.yml +15 -3
- data/benchmark/group.yml +39 -0
- data/benchmark/reshape.yml +31 -0
- data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
- data/benchmark/rover/flights.yml +23 -0
- data/benchmark/rover/penguins.yml +23 -0
- data/benchmark/rover/planes.yml +23 -0
- data/benchmark/rover/weather.yml +23 -0
- data/benchmark/vector.yml +60 -0
- data/doc/DataFrame.md +335 -53
- data/doc/Vector.md +91 -0
- data/doc/image/dataframe/join.png +0 -0
- data/doc/image/dataframe/set_and_bind.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/lib/red_amber/data_frame.rb +167 -51
- data/lib/red_amber/data_frame_combinable.rb +486 -0
- data/lib/red_amber/data_frame_displayable.rb +6 -4
- data/lib/red_amber/data_frame_indexable.rb +2 -2
- data/lib/red_amber/data_frame_loadsave.rb +4 -1
- data/lib/red_amber/data_frame_reshaping.rb +35 -10
- data/lib/red_amber/data_frame_selectable.rb +221 -116
- data/lib/red_amber/data_frame_variable_operation.rb +146 -82
- data/lib/red_amber/group.rb +108 -18
- data/lib/red_amber/helper.rb +53 -43
- data/lib/red_amber/refinements.rb +199 -0
- data/lib/red_amber/vector.rb +56 -46
- data/lib/red_amber/vector_functions.rb +23 -83
- data/lib/red_amber/vector_selectable.rb +116 -69
- data/lib/red_amber/vector_updatable.rb +189 -65
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +3 -0
- data/red_amber.gemspec +4 -3
- metadata +24 -10
@@ -3,53 +3,149 @@
|
|
3
3
|
module RedAmber
|
4
4
|
# mix-in for the class DataFrame
|
5
5
|
module DataFrameSelectable
|
6
|
-
#
|
7
|
-
|
6
|
+
# Array, Arrow::Array and Arrow::ChunkedArray are refined
|
7
|
+
using RefineArray
|
8
|
+
using RefineArrayLike
|
9
|
+
|
10
|
+
# Select variables or records.
|
11
|
+
#
|
12
|
+
# @overload [](key)
|
13
|
+
# select a single variable and return it as a Vector.
|
14
|
+
#
|
15
|
+
# @param key [Symbol, String] key name to select.
|
16
|
+
# @return [Vector] selected variable as a Vector.
|
17
|
+
# @note DataFrame.v(key) is faster to create Vector from a variable.
|
18
|
+
#
|
19
|
+
# @overload [](keys)
|
20
|
+
# select variables and return a DataFrame.
|
21
|
+
#
|
22
|
+
# @param keys [<Symbol, String>] key names to select.
|
23
|
+
# @return [DataFrame] selected variables as a DataFrame.
|
24
|
+
#
|
25
|
+
# @overload [](index)
|
26
|
+
# select records and return a DataFrame.
|
27
|
+
#
|
28
|
+
# @param index [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
29
|
+
# index of a row to select.
|
30
|
+
# @return [DataFrame] selected records as a DataFrame.
|
31
|
+
#
|
32
|
+
# @overload [](indices)
|
33
|
+
# select records and return a DataFrame.
|
34
|
+
#
|
35
|
+
# @param indices [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
|
36
|
+
# indices of rows to select.
|
37
|
+
# @return [DataFrame] selected records as a DataFrame.
|
38
|
+
#
|
8
39
|
def [](*args)
|
9
|
-
|
10
|
-
|
11
|
-
|
40
|
+
raise DataFrameArgumentError, 'self is an empty dataframe' if empty?
|
41
|
+
|
42
|
+
case args
|
43
|
+
in [] | [nil]
|
44
|
+
return remove_all_values
|
45
|
+
in [(Symbol | String) => k] if key? k
|
46
|
+
return variables[k.to_sym]
|
47
|
+
in [Integer => i]
|
48
|
+
return take([i.negative? ? i + size : i])
|
49
|
+
in [Vector => v]
|
50
|
+
arrow_array = v.data
|
51
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
52
|
+
arrow_array = aa
|
53
|
+
else
|
54
|
+
a = parse_args(args, size)
|
55
|
+
return select_variables_by_keys(a) if a.symbols?
|
56
|
+
return take(normalize_indices(Arrow::Array.new(a))) if a.integers?
|
57
|
+
return remove_all_values if a.compact.empty?
|
58
|
+
return filter_by_array(Arrow::BooleanArray.new(a)) if a.booleans?
|
59
|
+
|
60
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
61
|
+
end
|
62
|
+
|
63
|
+
return take(normalize_indices(arrow_array)) if arrow_array.numeric?
|
64
|
+
return filter_by_array(arrow_array) if arrow_array.boolean?
|
12
65
|
|
13
|
-
|
14
|
-
if
|
15
|
-
return filter_by_vector(vector.data) if vector.size == size
|
66
|
+
a = arrow_array.to_a
|
67
|
+
return select_variables_by_keys(a) if a.symbols_or_strings?
|
16
68
|
|
17
|
-
|
69
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
70
|
+
end
|
71
|
+
|
72
|
+
# Select a variable by a key in String or Symbol
|
73
|
+
def v(key)
|
74
|
+
unless key.is_a?(Symbol) || key.is_a?(String)
|
75
|
+
raise DataFrameArgumentError, "Key is not a Symbol or a String: [#{key}]"
|
18
76
|
end
|
19
|
-
|
20
|
-
return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.type == :dictionary
|
77
|
+
raise DataFrameArgumentError, "Key does not exist: [#{key}]" unless key? key
|
21
78
|
|
22
|
-
|
79
|
+
variables[key.to_sym]
|
23
80
|
end
|
24
81
|
|
25
|
-
#
|
82
|
+
# Select records to create a DataFrame.
|
83
|
+
#
|
84
|
+
# @overload slice(row)
|
85
|
+
# select a record and return a DataFrame.
|
86
|
+
#
|
87
|
+
# @param row [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
88
|
+
# a row index to select.
|
89
|
+
# @yield [self] gives self to the block.
|
90
|
+
# @note The block is evaluated within the context of self.
|
91
|
+
# It has access to self's instance variables and private methods.
|
92
|
+
# @yieldreturn [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
93
|
+
# a row index to select.
|
94
|
+
# @return [DataFrame] selected records as a DataFrame.
|
95
|
+
#
|
96
|
+
# @overload slice(rows)
|
97
|
+
# select records and return a DataFrame.
|
98
|
+
# - Duplicated selection is acceptable. The same record will be returned.
|
99
|
+
# - The order of records will be the same as specified indices.
|
100
|
+
#
|
101
|
+
# @param rows [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
102
|
+
# row indices to select.
|
103
|
+
# @yield [self] gives self to the block.
|
104
|
+
# @note The block is evaluated within the context of self.
|
105
|
+
# It has access to self's instance variables and private methods.
|
106
|
+
# @yieldreturn [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
|
107
|
+
# row indices to select.
|
108
|
+
# @return [DataFrame] selected records as a DataFrame.
|
109
|
+
#
|
26
110
|
def slice(*args, &block)
|
27
|
-
|
111
|
+
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
112
|
+
|
28
113
|
if block
|
29
|
-
|
114
|
+
unless args.empty?
|
115
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.'
|
116
|
+
end
|
30
117
|
|
31
|
-
|
118
|
+
args = [instance_eval(&block)]
|
32
119
|
end
|
33
|
-
slicer.flatten!
|
34
120
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
121
|
+
arrow_array =
|
122
|
+
case args
|
123
|
+
in [] | [[]]
|
124
|
+
return remove_all_values
|
125
|
+
in [Vector => v]
|
126
|
+
v.data
|
127
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
128
|
+
aa
|
129
|
+
else
|
130
|
+
Arrow::Array.new(parse_args(args, size))
|
131
|
+
end
|
41
132
|
|
42
|
-
|
133
|
+
if arrow_array.numeric?
|
134
|
+
take(normalize_indices(arrow_array))
|
135
|
+
elsif arrow_array.boolean?
|
136
|
+
filter_by_array(arrow_array)
|
137
|
+
elsif arrow_array.to_a.compact.empty?
|
138
|
+
# Ruby 3.0.4 does not accept Arrow::Array#compact here. 2.7.6 and 3.1.2 are OK.
|
139
|
+
remove_all_values
|
140
|
+
else
|
141
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
43
142
|
end
|
44
|
-
return take_by_array(vector) if vector.numeric?
|
45
|
-
|
46
|
-
raise DataFrameArgumentError, "Invalid argument #{slicer}"
|
47
143
|
end
|
48
144
|
|
49
145
|
def slice_by(key, keep_key: false, &block)
|
50
146
|
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
51
147
|
raise DataFrameArgumentError, 'No block given' unless block
|
52
|
-
raise DataFrameArgumentError, "#{key} is
|
148
|
+
raise DataFrameArgumentError, "#{key} is not a key of self" unless key?(key)
|
53
149
|
return self if key.nil?
|
54
150
|
|
55
151
|
slicer = instance_eval(&block)
|
@@ -83,69 +179,82 @@ module RedAmber
|
|
83
179
|
slicer = slicer.map { |x| x.is_a?(String) ? self[key].index(x) : x }
|
84
180
|
end
|
85
181
|
|
86
|
-
|
87
|
-
|
88
|
-
else
|
89
|
-
take(slicer).drop(key)
|
90
|
-
end
|
182
|
+
taken = take(normalize_indices(Arrow::Array.new(slicer)))
|
183
|
+
keep_key ? taken : taken.drop(key)
|
91
184
|
end
|
92
185
|
|
93
|
-
# remove
|
186
|
+
# Select records and remove them to create a DataFrame of the remaining records.
|
187
|
+
#
|
188
|
+
# @overload remove(row)
|
189
|
+
# select a record and remove it to create a DataFrame of the remaining records.
|
190
|
+
# - The order of records in self will be preserved.
|
191
|
+
#
|
192
|
+
# @param row [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
193
|
+
# a row index to remove.
|
194
|
+
# @yield [self] gives self to the block.
|
195
|
+
# @note The block is evaluated within the context of self.
|
196
|
+
# It has access to self's instance variables and private methods.
|
197
|
+
# @yieldreturn [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
198
|
+
# a row index to remove.
|
199
|
+
# @return [DataFrame] remaining records as a DataFrame.
|
200
|
+
#
|
201
|
+
# @overload remove(rows)
|
202
|
+
# select records and remove them to create a DataFrame of the remaining records.
|
203
|
+
# - The order of records in self will be preserved.
|
204
|
+
#
|
205
|
+
# @param rows [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
206
|
+
# row indices to remove.
|
207
|
+
# @yield [self] gives self to the block.
|
208
|
+
# @note The block is evaluated within the context of self.
|
209
|
+
# It has access to self's instance variables and private methods.
|
210
|
+
# @yieldreturn [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
|
211
|
+
# row indices to remove.
|
212
|
+
# @return [DataFrame] remaining records as a DataFrame.
|
213
|
+
#
|
94
214
|
def remove(*args, &block)
|
95
|
-
|
96
|
-
if block
|
97
|
-
raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
|
98
|
-
|
99
|
-
remover = [instance_eval(&block)]
|
100
|
-
end
|
101
|
-
remover.flatten!
|
102
|
-
|
103
|
-
raise DataFrameArgumentError, 'Empty dataframe' if empty?
|
104
|
-
return self if remover.empty? || remover[0].nil?
|
215
|
+
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
105
216
|
|
106
|
-
|
107
|
-
|
108
|
-
|
217
|
+
if block
|
218
|
+
unless args.empty?
|
219
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.'
|
220
|
+
end
|
109
221
|
|
110
|
-
|
222
|
+
args = [instance_eval(&block)]
|
111
223
|
end
|
112
|
-
if vector.numeric?
|
113
|
-
raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
|
114
224
|
|
115
|
-
|
116
|
-
|
117
|
-
|
225
|
+
arrow_array =
|
226
|
+
case args
|
227
|
+
in [] | [[]] | [nil]
|
228
|
+
return self
|
229
|
+
in [Vector => v]
|
230
|
+
v.data
|
231
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
232
|
+
aa
|
233
|
+
else
|
234
|
+
Arrow::Array.new(parse_args(args, size))
|
118
235
|
end
|
119
236
|
|
120
|
-
|
121
|
-
|
122
|
-
|
237
|
+
if arrow_array.boolean?
|
238
|
+
filter_by_array(arrow_array.primitive_invert)
|
239
|
+
elsif arrow_array.numeric?
|
240
|
+
remover = normalize_indices(arrow_array).to_a
|
241
|
+
return self if remover.empty?
|
123
242
|
|
124
|
-
|
243
|
+
slicer = indices.to_a - remover.map(&:to_i)
|
244
|
+
return remove_all_values if slicer.empty?
|
125
245
|
|
126
|
-
|
127
|
-
|
246
|
+
take(slicer)
|
247
|
+
else
|
248
|
+
raise DataFrameArgumentError, "Invalid argument #{args}"
|
128
249
|
end
|
129
|
-
|
130
|
-
raise DataFrameArgumentError, "Invalid argument #{remover}"
|
131
250
|
end
|
132
251
|
|
133
252
|
def remove_nil
|
134
253
|
func = Arrow::Function.find(:drop_null)
|
135
|
-
DataFrame.
|
254
|
+
DataFrame.create(func.execute([table]).value)
|
136
255
|
end
|
137
256
|
alias_method :drop_nil, :remove_nil
|
138
257
|
|
139
|
-
# Select a variable by a key in String or Symbol
|
140
|
-
def v(key)
|
141
|
-
unless key.is_a?(Symbol) || key.is_a?(String)
|
142
|
-
raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
|
143
|
-
end
|
144
|
-
raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
|
145
|
-
|
146
|
-
variables[key.to_sym]
|
147
|
-
end
|
148
|
-
|
149
258
|
def head(n_obs = 5)
|
150
259
|
raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
|
151
260
|
|
@@ -166,77 +275,73 @@ module RedAmber
|
|
166
275
|
tail(n_obs)
|
167
276
|
end
|
168
277
|
|
169
|
-
#
|
170
|
-
#
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
|
176
|
-
indices = Vector.new(indices) unless indices.is_a?(Vector)
|
177
|
-
|
178
|
-
take_by_array(indices)
|
278
|
+
# @api private
|
279
|
+
# TODO: support for option `boundscheck: true`
|
280
|
+
# Supports indices in an Arrow::UInt{8, 16, 32, 64} or an Array
|
281
|
+
# Negative index is not supported.
|
282
|
+
def take(index_array)
|
283
|
+
DataFrame.create(@table.take(index_array))
|
179
284
|
end
|
180
285
|
|
181
|
-
#
|
182
|
-
#
|
286
|
+
# @api private
|
287
|
+
# TODO: support for option `null_selection_behavior: :drop`
|
183
288
|
def filter(*booleans)
|
184
289
|
booleans.flatten!
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
|
191
|
-
|
192
|
-
filter_by_vector(b.data)
|
193
|
-
when Arrow::BooleanArray
|
194
|
-
filter_by_vector(b)
|
290
|
+
case booleans
|
291
|
+
in []
|
292
|
+
return remove_all_values
|
293
|
+
in [Arrow::BooleanArray => b]
|
294
|
+
filter_by_array(b)
|
195
295
|
else
|
196
|
-
|
296
|
+
unless booleans.booleans?
|
297
|
+
raise DataFrameArgumentError, 'Argument is not a boolean.'
|
298
|
+
end
|
197
299
|
|
198
|
-
|
300
|
+
filter_by_array(Arrow::BooleanArray.new(booleans))
|
199
301
|
end
|
200
302
|
end
|
201
303
|
|
202
304
|
private
|
203
305
|
|
204
|
-
def
|
306
|
+
def select_variables_by_keys(keys)
|
205
307
|
if keys.one?
|
206
308
|
key = keys[0].to_sym
|
207
|
-
raise DataFrameArgumentError, "Key does not exist #{
|
309
|
+
raise DataFrameArgumentError, "Key does not exist: #{key}" unless key? key
|
208
310
|
|
209
311
|
variables[key]
|
312
|
+
# Vector.new(@table.find_column(*key).data)
|
210
313
|
else
|
211
|
-
|
314
|
+
check_duplicate_keys(keys)
|
315
|
+
DataFrame.create(@table.select_columns(*keys))
|
212
316
|
end
|
213
317
|
end
|
214
318
|
|
215
|
-
# Accepts indices by numeric
|
216
|
-
def
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
DataFrame.new(datum.value)
|
319
|
+
# Accepts indices by numeric arrow array and returns positive indices.
|
320
|
+
def normalize_indices(arrow_array)
|
321
|
+
b = Arrow::Function.find(:less).execute([arrow_array, 0])
|
322
|
+
a = Arrow::Function.find(:add).execute([arrow_array, size])
|
323
|
+
r = Arrow::Function.find(:if_else).execute([b, a, arrow_array]).value
|
324
|
+
if r.float?
|
325
|
+
r = Arrow::Function.find(:floor).execute([r]).value
|
326
|
+
Arrow::UInt64ArrayBuilder.build(r)
|
327
|
+
else
|
328
|
+
r
|
329
|
+
end
|
227
330
|
end
|
228
331
|
|
229
|
-
# Accepts booleans by Arrow::BooleanArray
|
230
|
-
def
|
231
|
-
|
332
|
+
# Accepts booleans by an Arrow::BooleanArray or an Array
|
333
|
+
def filter_by_array(boolean_array)
|
334
|
+
unless boolean_array.length == size
|
335
|
+
raise DataFrameArgumentError, 'Booleans must be same size as self.'
|
336
|
+
end
|
232
337
|
|
233
338
|
datum = Arrow::Function.find(:filter).execute([table, boolean_array])
|
234
|
-
DataFrame.
|
339
|
+
DataFrame.create(datum.value)
|
235
340
|
end
|
236
341
|
|
237
342
|
# return a DataFrame with same keys as self without values
|
238
343
|
def remove_all_values
|
239
|
-
|
344
|
+
filter_by_array(Arrow::BooleanArray.new([false] * size))
|
240
345
|
end
|
241
346
|
end
|
242
347
|
end
|