red_amber 0.2.3 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +111 -48
- data/CHANGELOG.md +90 -1
- data/Gemfile +1 -0
- data/README.md +42 -25
- data/benchmark/basic.yml +11 -4
- data/benchmark/combine.yml +3 -4
- data/benchmark/dataframe.yml +62 -0
- data/benchmark/group.yml +7 -1
- data/benchmark/reshape.yml +6 -2
- data/benchmark/vector.yml +60 -0
- data/doc/DataFrame.md +3 -0
- data/doc/Vector.md +88 -0
- data/lib/red_amber/data_frame.rb +161 -46
- data/lib/red_amber/data_frame_combinable.rb +304 -101
- data/lib/red_amber/data_frame_displayable.rb +4 -4
- data/lib/red_amber/data_frame_indexable.rb +2 -2
- data/lib/red_amber/data_frame_loadsave.rb +4 -1
- data/lib/red_amber/data_frame_reshaping.rb +35 -10
- data/lib/red_amber/data_frame_selectable.rb +221 -116
- data/lib/red_amber/data_frame_variable_operation.rb +146 -82
- data/lib/red_amber/group.rb +16 -7
- data/lib/red_amber/helper.rb +53 -31
- data/lib/red_amber/refinements.rb +199 -0
- data/lib/red_amber/vector.rb +55 -52
- data/lib/red_amber/vector_functions.rb +23 -75
- data/lib/red_amber/vector_selectable.rb +116 -69
- data/lib/red_amber/vector_updatable.rb +136 -7
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +2 -0
- data/red_amber.gemspec +3 -2
- metadata +11 -8
@@ -8,11 +8,14 @@ module RedAmber
|
|
8
8
|
# @param key [Symbol] key of the index column
|
9
9
|
# to transpose into keys.
|
10
10
|
# If it is not specified, keys[0] is used.
|
11
|
-
# @param
|
12
|
-
# If it is not specified, :NAME is used.
|
11
|
+
# @param name [Symbol] key name of transposed index column.
|
12
|
+
# If it is not specified, :NAME is used.
|
13
|
+
# If it already exists, :NAME1 or :NAME1.succ is used.
|
13
14
|
# @return [DataFrame] transposed DataFrame
|
14
15
|
def transpose(key: keys.first, name: :NAME)
|
15
|
-
|
16
|
+
unless keys.include?(key)
|
17
|
+
raise DataFrameArgumentError, "Self does not include: #{key}"
|
18
|
+
end
|
16
19
|
|
17
20
|
# Find unused name
|
18
21
|
new_keys = self[key].to_a.map { |e| e.to_s.to_sym }
|
@@ -35,14 +38,24 @@ module RedAmber
|
|
35
38
|
# @param value [Symbol, String] key of the column which comes **from values**.
|
36
39
|
# @return [DataFrame] long DataFrame.
|
37
40
|
def to_long(*keep_keys, name: :NAME, value: :VALUE)
|
41
|
+
warn('[Info] No key to keep is specified.') if keep_keys.empty?
|
42
|
+
|
38
43
|
not_included = keep_keys - keys
|
39
|
-
|
44
|
+
unless not_included.empty?
|
45
|
+
raise DataFrameArgumentError, "Not have keys #{not_included}"
|
46
|
+
end
|
40
47
|
|
41
48
|
name = name.to_sym
|
42
|
-
|
49
|
+
if keep_keys.include?(name)
|
50
|
+
raise DataFrameArgumentError,
|
51
|
+
"Can't specify the key: #{name} for the column from keys."
|
52
|
+
end
|
43
53
|
|
44
54
|
value = value.to_sym
|
45
|
-
|
55
|
+
if keep_keys.include?(value)
|
56
|
+
raise DataFrameArgumentError,
|
57
|
+
"Can't specify the key: #{value} for the column from values."
|
58
|
+
end
|
46
59
|
|
47
60
|
hash = Hash.new { |h, k| h[k] = [] }
|
48
61
|
l = keys.size - keep_keys.size
|
@@ -62,15 +75,27 @@ module RedAmber
|
|
62
75
|
|
63
76
|
# Reshape long DataFrame to a wide DataFrame.
|
64
77
|
#
|
65
|
-
# @param name [Symbol, String]
|
66
|
-
#
|
78
|
+
# @param name [Symbol, String]
|
79
|
+
# key of the column which will be expanded **to key names**.
|
80
|
+
# @param value [Symbol, String]
|
81
|
+
# key of the column which will be expanded **to values**.
|
67
82
|
# @return [DataFrame] wide DataFrame.
|
68
83
|
def to_wide(name: :NAME, value: :VALUE)
|
69
84
|
name = name.to_sym
|
70
|
-
|
85
|
+
unless keys.include?(name)
|
86
|
+
raise DataFrameArgumentError,
|
87
|
+
"You are going to keep the key: #{name}. " \
|
88
|
+
'You may need to specify the column name ' \
|
89
|
+
'that gives the new keys by `:name` option.'
|
90
|
+
end
|
71
91
|
|
72
92
|
value = value.to_sym
|
73
|
-
|
93
|
+
unless keys.include?(value)
|
94
|
+
raise DataFrameArgumentError,
|
95
|
+
"You are going to keep the key: #{value}. " \
|
96
|
+
'You may need to specify the column name ' \
|
97
|
+
'that gives the new values by `:value` option.'
|
98
|
+
end
|
74
99
|
|
75
100
|
hash = Hash.new { |h, k| h[k] = {} }
|
76
101
|
keep_keys = keys - [name, value]
|
@@ -3,53 +3,149 @@
|
|
3
3
|
module RedAmber
|
4
4
|
# mix-in for the class DataFrame
|
5
5
|
module DataFrameSelectable
|
6
|
-
#
|
7
|
-
|
6
|
+
# Array, Arrow::Array and Arrow::ChunkedArray are refined
|
7
|
+
using RefineArray
|
8
|
+
using RefineArrayLike
|
9
|
+
|
10
|
+
# Select variables or records.
|
11
|
+
#
|
12
|
+
# @overload [](key)
|
13
|
+
# select a single variable and return it as a Vector.
|
14
|
+
#
|
15
|
+
# @param key [Symbol, String] key name to select.
|
16
|
+
# @return [Vector] selected variable as a Vector.
|
17
|
+
# @note DataFrame#v(key) is a faster way to create a Vector from a variable.
|
18
|
+
#
|
19
|
+
# @overload [](keys)
|
20
|
+
# select variables and return a DataFrame.
|
21
|
+
#
|
22
|
+
# @param keys [<Symbol, String>] key names to select.
|
23
|
+
# @return [DataFrame] selected variables as a DataFrame.
|
24
|
+
#
|
25
|
+
# @overload [](index)
|
26
|
+
# select records and return a DataFrame.
|
27
|
+
#
|
28
|
+
# @param index [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
29
|
+
# index of a row to select.
|
30
|
+
# @return [DataFrame] selected records as a DataFrame.
|
31
|
+
#
|
32
|
+
# @overload [](indices)
|
33
|
+
# select records and return a DataFrame.
|
34
|
+
#
|
35
|
+
# @param indices [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
|
36
|
+
# indices of rows to select.
|
37
|
+
# @return [DataFrame] selected records as a DataFrame.
|
38
|
+
#
|
8
39
|
def [](*args)
|
9
|
-
|
10
|
-
|
11
|
-
|
40
|
+
raise DataFrameArgumentError, 'self is an empty dataframe' if empty?
|
41
|
+
|
42
|
+
case args
|
43
|
+
in [] | [nil]
|
44
|
+
return remove_all_values
|
45
|
+
in [(Symbol | String) => k] if key? k
|
46
|
+
return variables[k.to_sym]
|
47
|
+
in [Integer => i]
|
48
|
+
return take([i.negative? ? i + size : i])
|
49
|
+
in [Vector => v]
|
50
|
+
arrow_array = v.data
|
51
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
52
|
+
arrow_array = aa
|
53
|
+
else
|
54
|
+
a = parse_args(args, size)
|
55
|
+
return select_variables_by_keys(a) if a.symbols?
|
56
|
+
return take(normalize_indices(Arrow::Array.new(a))) if a.integers?
|
57
|
+
return remove_all_values if a.compact.empty?
|
58
|
+
return filter_by_array(Arrow::BooleanArray.new(a)) if a.booleans?
|
59
|
+
|
60
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
61
|
+
end
|
62
|
+
|
63
|
+
return take(normalize_indices(arrow_array)) if arrow_array.numeric?
|
64
|
+
return filter_by_array(arrow_array) if arrow_array.boolean?
|
12
65
|
|
13
|
-
|
14
|
-
if
|
15
|
-
return filter_by_vector(vector.data) if vector.size == size
|
66
|
+
a = arrow_array.to_a
|
67
|
+
return select_variables_by_keys(a) if a.symbols_or_strings?
|
16
68
|
|
17
|
-
|
69
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
70
|
+
end
|
71
|
+
|
72
|
+
# Select a variable by a key in String or Symbol
|
73
|
+
def v(key)
|
74
|
+
unless key.is_a?(Symbol) || key.is_a?(String)
|
75
|
+
raise DataFrameArgumentError, "Key is not a Symbol or a String: [#{key}]"
|
18
76
|
end
|
19
|
-
|
20
|
-
return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.dictionary?
|
77
|
+
raise DataFrameArgumentError, "Key does not exist: [#{key}]" unless key? key
|
21
78
|
|
22
|
-
|
79
|
+
variables[key.to_sym]
|
23
80
|
end
|
24
81
|
|
25
|
-
#
|
82
|
+
# Select records to create a DataFrame.
|
83
|
+
#
|
84
|
+
# @overload slice(row)
|
85
|
+
# select a record and return a DataFrame.
|
86
|
+
#
|
87
|
+
# @param row [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
88
|
+
# a row index to select.
|
89
|
+
# @yield [self] gives self to the block.
|
90
|
+
# @note The block is evaluated within the context of self.
|
91
|
+
# It has access to self's instance variables and private methods.
|
92
|
+
# @yieldreturn [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
93
|
+
# a row index to select.
|
94
|
+
# @return [DataFrame] selected records as a DataFrame.
|
95
|
+
#
|
96
|
+
# @overload slice(rows)
|
97
|
+
# select records and return a DataFrame.
|
98
|
+
# - Duplicated selection is acceptable. The same record will be returned.
|
99
|
+
# - The order of records will be the same as specified indices.
|
100
|
+
#
|
101
|
+
# @param rows [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
102
|
+
# row indices to select.
|
103
|
+
# @yield [self] gives self to the block.
|
104
|
+
# @note The block is evaluated within the context of self.
|
105
|
+
# It has access to self's instance variables and private methods.
|
106
|
+
# @yieldreturn [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
|
107
|
+
# row indices to select.
|
108
|
+
# @return [DataFrame] selected records as a DataFrame.
|
109
|
+
#
|
26
110
|
def slice(*args, &block)
|
27
|
-
|
111
|
+
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
112
|
+
|
28
113
|
if block
|
29
|
-
|
114
|
+
unless args.empty?
|
115
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.'
|
116
|
+
end
|
30
117
|
|
31
|
-
|
118
|
+
args = [instance_eval(&block)]
|
32
119
|
end
|
33
|
-
slicer.flatten!
|
34
120
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
121
|
+
arrow_array =
|
122
|
+
case args
|
123
|
+
in [] | [[]]
|
124
|
+
return remove_all_values
|
125
|
+
in [Vector => v]
|
126
|
+
v.data
|
127
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
128
|
+
aa
|
129
|
+
else
|
130
|
+
Arrow::Array.new(parse_args(args, size))
|
131
|
+
end
|
41
132
|
|
42
|
-
|
133
|
+
if arrow_array.numeric?
|
134
|
+
take(normalize_indices(arrow_array))
|
135
|
+
elsif arrow_array.boolean?
|
136
|
+
filter_by_array(arrow_array)
|
137
|
+
elsif arrow_array.to_a.compact.empty?
|
138
|
+
# Ruby 3.0.4 does not accept Arrow::Array#compact here. 2.7.6 and 3.1.2 are OK.
|
139
|
+
remove_all_values
|
140
|
+
else
|
141
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
43
142
|
end
|
44
|
-
return take_by_array(vector) if vector.numeric?
|
45
|
-
|
46
|
-
raise DataFrameArgumentError, "Invalid argument #{slicer}"
|
47
143
|
end
|
48
144
|
|
49
145
|
def slice_by(key, keep_key: false, &block)
|
50
146
|
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
51
147
|
raise DataFrameArgumentError, 'No block given' unless block
|
52
|
-
raise DataFrameArgumentError, "#{key} is
|
148
|
+
raise DataFrameArgumentError, "#{key} is not a key of self" unless key?(key)
|
53
149
|
return self if key.nil?
|
54
150
|
|
55
151
|
slicer = instance_eval(&block)
|
@@ -83,69 +179,82 @@ module RedAmber
|
|
83
179
|
slicer = slicer.map { |x| x.is_a?(String) ? self[key].index(x) : x }
|
84
180
|
end
|
85
181
|
|
86
|
-
|
87
|
-
|
88
|
-
else
|
89
|
-
take(slicer).drop(key)
|
90
|
-
end
|
182
|
+
taken = take(normalize_indices(Arrow::Array.new(slicer)))
|
183
|
+
keep_key ? taken : taken.drop(key)
|
91
184
|
end
|
92
185
|
|
93
|
-
# remove
|
186
|
+
# Select records and remove them to create a remainder DataFrame.
|
187
|
+
#
|
188
|
+
# @overload remove(row)
|
189
|
+
# select a record and remove it to create a remainder DataFrame.
|
190
|
+
# - The order of records in self will be preserved.
|
191
|
+
#
|
192
|
+
# @param row [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
193
|
+
# a row index to remove.
|
194
|
+
# @yield [self] gives self to the block.
|
195
|
+
# @note The block is evaluated within the context of self.
|
196
|
+
# It has access to self's instance variables and private methods.
|
197
|
+
# @yieldreturn [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
198
|
+
# a row index to remove.
|
199
|
+
# @return [DataFrame] remaining records as a DataFrame.
|
200
|
+
#
|
201
|
+
# @overload remove(rows)
|
202
|
+
# select records and remove them to create a remainder DataFrame.
|
203
|
+
# - The order of records in self will be preserved.
|
204
|
+
#
|
205
|
+
# @param rows [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
206
|
+
# row indices to remove.
|
207
|
+
# @yield [self] gives self to the block.
|
208
|
+
# @note The block is evaluated within the context of self.
|
209
|
+
# It has access to self's instance variables and private methods.
|
210
|
+
# @yieldreturn [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
|
211
|
+
# row indices to remove.
|
212
|
+
# @return [DataFrame] remaining records as a DataFrame.
|
213
|
+
#
|
94
214
|
def remove(*args, &block)
|
95
|
-
|
96
|
-
if block
|
97
|
-
raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
|
98
|
-
|
99
|
-
remover = [instance_eval(&block)]
|
100
|
-
end
|
101
|
-
remover.flatten!
|
102
|
-
|
103
|
-
raise DataFrameArgumentError, 'Empty dataframe' if empty?
|
104
|
-
return self if remover.empty? || remover[0].nil?
|
215
|
+
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
105
216
|
|
106
|
-
|
107
|
-
|
108
|
-
|
217
|
+
if block
|
218
|
+
unless args.empty?
|
219
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.'
|
220
|
+
end
|
109
221
|
|
110
|
-
|
222
|
+
args = [instance_eval(&block)]
|
111
223
|
end
|
112
|
-
if vector.numeric?
|
113
|
-
raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
|
114
224
|
|
115
|
-
|
116
|
-
|
117
|
-
|
225
|
+
arrow_array =
|
226
|
+
case args
|
227
|
+
in [] | [[]] | [nil]
|
228
|
+
return self
|
229
|
+
in [Vector => v]
|
230
|
+
v.data
|
231
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
232
|
+
aa
|
233
|
+
else
|
234
|
+
Arrow::Array.new(parse_args(args, size))
|
118
235
|
end
|
119
236
|
|
120
|
-
|
121
|
-
|
122
|
-
|
237
|
+
if arrow_array.boolean?
|
238
|
+
filter_by_array(arrow_array.primitive_invert)
|
239
|
+
elsif arrow_array.numeric?
|
240
|
+
remover = normalize_indices(arrow_array).to_a
|
241
|
+
return self if remover.empty?
|
123
242
|
|
124
|
-
|
243
|
+
slicer = indices.to_a - remover.map(&:to_i)
|
244
|
+
return remove_all_values if slicer.empty?
|
125
245
|
|
126
|
-
|
127
|
-
|
246
|
+
take(slicer)
|
247
|
+
else
|
248
|
+
raise DataFrameArgumentError, "Invalid argument #{args}"
|
128
249
|
end
|
129
|
-
|
130
|
-
raise DataFrameArgumentError, "Invalid argument #{remover}"
|
131
250
|
end
|
132
251
|
|
133
252
|
def remove_nil
|
134
253
|
func = Arrow::Function.find(:drop_null)
|
135
|
-
DataFrame.
|
254
|
+
DataFrame.create(func.execute([table]).value)
|
136
255
|
end
|
137
256
|
alias_method :drop_nil, :remove_nil
|
138
257
|
|
139
|
-
# Select a variable by a key in String or Symbol
|
140
|
-
def v(key)
|
141
|
-
unless key.is_a?(Symbol) || key.is_a?(String)
|
142
|
-
raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
|
143
|
-
end
|
144
|
-
raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
|
145
|
-
|
146
|
-
variables[key.to_sym]
|
147
|
-
end
|
148
|
-
|
149
258
|
def head(n_obs = 5)
|
150
259
|
raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
|
151
260
|
|
@@ -166,77 +275,73 @@ module RedAmber
|
|
166
275
|
tail(n_obs)
|
167
276
|
end
|
168
277
|
|
169
|
-
#
|
170
|
-
#
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
arg_indices = arg_indices[0] if arg_indices.one? && !arg_indices[0].is_a?(Numeric)
|
176
|
-
arg_indices = Vector.new(arg_indices) unless arg_indices.is_a?(Vector)
|
177
|
-
|
178
|
-
take_by_array(arg_indices)
|
278
|
+
# @api private
|
279
|
+
# TODO: support for option `boundscheck: true`
|
280
|
+
# Supports indices in an Arrow::UInt{8, 16, 32, 64} or an Array
|
281
|
+
# Negative index is not supported.
|
282
|
+
def take(index_array)
|
283
|
+
DataFrame.create(@table.take(index_array))
|
179
284
|
end
|
180
285
|
|
181
|
-
#
|
182
|
-
#
|
286
|
+
# @api private
|
287
|
+
# TODO: support for option `null_selection_behavior: :drop`
|
183
288
|
def filter(*booleans)
|
184
289
|
booleans.flatten!
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
|
191
|
-
|
192
|
-
filter_by_vector(b.data)
|
193
|
-
when Arrow::BooleanArray
|
194
|
-
filter_by_vector(b)
|
290
|
+
case booleans
|
291
|
+
in []
|
292
|
+
return remove_all_values
|
293
|
+
in [Arrow::BooleanArray => b]
|
294
|
+
filter_by_array(b)
|
195
295
|
else
|
196
|
-
|
296
|
+
unless booleans.booleans?
|
297
|
+
raise DataFrameArgumentError, 'Argument is not a boolean.'
|
298
|
+
end
|
197
299
|
|
198
|
-
|
300
|
+
filter_by_array(Arrow::BooleanArray.new(booleans))
|
199
301
|
end
|
200
302
|
end
|
201
303
|
|
202
304
|
private
|
203
305
|
|
204
|
-
def
|
306
|
+
def select_variables_by_keys(keys)
|
205
307
|
if keys.one?
|
206
308
|
key = keys[0].to_sym
|
207
|
-
raise DataFrameArgumentError, "Key does not exist #{
|
309
|
+
raise DataFrameArgumentError, "Key does not exist: #{key}" unless key? key
|
208
310
|
|
209
311
|
variables[key]
|
312
|
+
# Vector.new(@table.find_column(*key).data)
|
210
313
|
else
|
211
|
-
|
314
|
+
check_duplicate_keys(keys)
|
315
|
+
DataFrame.create(@table.select_columns(*keys))
|
212
316
|
end
|
213
317
|
end
|
214
318
|
|
215
|
-
# Accepts indices by numeric
|
216
|
-
def
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
DataFrame.new(datum.value)
|
319
|
+
# Accepts indices by numeric arrow array and returns positive indices.
|
320
|
+
def normalize_indices(arrow_array)
|
321
|
+
b = Arrow::Function.find(:less).execute([arrow_array, 0])
|
322
|
+
a = Arrow::Function.find(:add).execute([arrow_array, size])
|
323
|
+
r = Arrow::Function.find(:if_else).execute([b, a, arrow_array]).value
|
324
|
+
if r.float?
|
325
|
+
r = Arrow::Function.find(:floor).execute([r]).value
|
326
|
+
Arrow::UInt64ArrayBuilder.build(r)
|
327
|
+
else
|
328
|
+
r
|
329
|
+
end
|
227
330
|
end
|
228
331
|
|
229
|
-
# Accepts booleans by Arrow::BooleanArray
|
230
|
-
def
|
231
|
-
|
332
|
+
# Accepts booleans by an Arrow::BooleanArray or an Array
|
333
|
+
def filter_by_array(boolean_array)
|
334
|
+
unless boolean_array.length == size
|
335
|
+
raise DataFrameArgumentError, 'Booleans must be same size as self.'
|
336
|
+
end
|
232
337
|
|
233
338
|
datum = Arrow::Function.find(:filter).execute([table, boolean_array])
|
234
|
-
DataFrame.
|
339
|
+
DataFrame.create(datum.value)
|
235
340
|
end
|
236
341
|
|
237
342
|
# return a DataFrame with same keys as self without values
|
238
343
|
def remove_all_values
|
239
|
-
|
344
|
+
filter_by_array(Arrow::BooleanArray.new([false] * size))
|
240
345
|
end
|
241
346
|
end
|
242
347
|
end
|