red_amber 0.2.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +133 -51
- data/.yardopts +2 -0
- data/CHANGELOG.md +203 -1
- data/Gemfile +2 -1
- data/LICENSE +1 -1
- data/README.md +61 -45
- data/benchmark/basic.yml +11 -4
- data/benchmark/combine.yml +3 -4
- data/benchmark/dataframe.yml +62 -0
- data/benchmark/group.yml +7 -1
- data/benchmark/reshape.yml +6 -2
- data/benchmark/vector.yml +63 -0
- data/doc/DataFrame.md +35 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +295 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +537 -68
- data/lib/red_amber/data_frame_combinable.rb +776 -123
- data/lib/red_amber/data_frame_displayable.rb +248 -18
- data/lib/red_amber/data_frame_indexable.rb +122 -19
- data/lib/red_amber/data_frame_loadsave.rb +81 -10
- data/lib/red_amber/data_frame_reshaping.rb +216 -21
- data/lib/red_amber/data_frame_selectable.rb +781 -120
- data/lib/red_amber/data_frame_variable_operation.rb +561 -85
- data/lib/red_amber/group.rb +195 -21
- data/lib/red_amber/helper.rb +114 -32
- data/lib/red_amber/refinements.rb +206 -0
- data/lib/red_amber/subframes.rb +1066 -0
- data/lib/red_amber/vector.rb +435 -58
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +387 -0
- data/lib/red_amber/vector_selectable.rb +321 -69
- data/lib/red_amber/vector_unary_element_wise.rb +436 -0
- data/lib/red_amber/vector_updatable.rb +397 -24
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +15 -1
- data/red_amber.gemspec +4 -3
- metadata +19 -11
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -294
@@ -1,55 +1,462 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Mix-in for the class DataFrame
|
5
5
|
module DataFrameSelectable
|
6
|
-
#
|
7
|
-
|
6
|
+
# Array, Arrow::Array and Arrow::ChunkedArray are refined
|
7
|
+
using RefineArray
|
8
|
+
using RefineArrayLike
|
9
|
+
|
10
|
+
# rubocop:disable Layout/LineLength
|
11
|
+
|
12
|
+
# Select variables or records.
|
13
|
+
#
|
14
|
+
# @overload [](key)
|
15
|
+
# Select single variable (column) and return as a Vector.
|
16
|
+
#
|
17
|
+
# @param key [Symbol, String]
|
18
|
+
# key name to select.
|
19
|
+
# @return [Vector]
|
20
|
+
# selected variable as a Vector.
|
21
|
+
# @note DataFrame.v(key) is faster to create Vector from a variable.
|
22
|
+
# @example Select a column and return Vector
|
23
|
+
# penguins
|
24
|
+
#
|
25
|
+
# # =>
|
26
|
+
# #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
|
27
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
28
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
29
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
30
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
31
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
32
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
33
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
34
|
+
# : : : : : : ... :
|
35
|
+
# 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
36
|
+
# 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
37
|
+
# 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
38
|
+
#
|
39
|
+
# penguins[:bill_length_mm]
|
40
|
+
#
|
41
|
+
# # =>
|
42
|
+
# #<RedAmber::Vector(:double, size=344):0x00000000000104dc>
|
43
|
+
# [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
|
44
|
+
#
|
45
|
+
# @overload [](keys)
|
46
|
+
# Select variables and return a DataFrame.
|
47
|
+
#
|
48
|
+
# @param keys [<Symbol, String>] key names to select.
|
49
|
+
# @return [DataFrame]
|
50
|
+
# selected variables as a DataFrame.
|
51
|
+
# @example Select columns
|
52
|
+
# penguins[:island, :bill_length_mm]
|
53
|
+
#
|
54
|
+
# # =>
|
55
|
+
# #<RedAmber::DataFrame : 344 x 2 Vectors, 0x00000000000104f0>
|
56
|
+
# island bill_length_mm
|
57
|
+
# <string> <double>
|
58
|
+
# 0 Torgersen 39.1
|
59
|
+
# 1 Torgersen 39.5
|
60
|
+
# 2 Torgersen 40.3
|
61
|
+
# 3 Torgersen (nil)
|
62
|
+
# 4 Torgersen 36.7
|
63
|
+
# : : :
|
64
|
+
# 341 Biscoe 50.4
|
65
|
+
# 342 Biscoe 45.2
|
66
|
+
# 343 Biscoe 49.9
|
67
|
+
#
|
68
|
+
# @overload [](index)
|
69
|
+
# Select a record and return a DataFrame.
|
70
|
+
#
|
71
|
+
# @param index [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
72
|
+
# index of a row to select.
|
73
|
+
# @return [DataFrame]
|
74
|
+
# selected variables as a DataFrame.
|
75
|
+
# @example Select a row
|
76
|
+
# penguins[0]
|
77
|
+
#
|
78
|
+
# # =>
|
79
|
+
# #<RedAmber::DataFrame : 1 x 8 Vectors, 0x0000000000010504>
|
80
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
81
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
82
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
83
|
+
#
|
84
|
+
# @overload [](indices)
|
85
|
+
# Select records by indices and return a DataFrame.
|
86
|
+
#
|
87
|
+
# @param indices [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
|
88
|
+
# indices of rows to select.
|
89
|
+
# @return [DataFrame]
|
90
|
+
# selected variables as a DataFrame.
|
91
|
+
# @example Select rows by indices
|
92
|
+
# penguins[0..100]
|
93
|
+
#
|
94
|
+
# # =>
|
95
|
+
# #<RedAmber::DataFrame : 101 x 8 Vectors, 0x00000000000105e0>
|
96
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
97
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
98
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
99
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
100
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
101
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
102
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
103
|
+
# : : : : : : ... :
|
104
|
+
# 98 Adelie Dream 33.1 16.1 178 ... 2008
|
105
|
+
# 99 Adelie Dream 43.2 18.5 192 ... 2008
|
106
|
+
# 100 Adelie Biscoe 35.0 17.9 192 ... 2009
|
107
|
+
#
|
108
|
+
# @overload [](booleans)
|
109
|
+
# Select records by booleans and return a DataFrame.
|
110
|
+
#
|
111
|
+
# @param booleans [Array<true, false, nil>, Vector, Arrow::Array]
|
112
|
+
# booleans of rows to select.
|
113
|
+
# @return [DataFrame]
|
114
|
+
# selected variables as a DataFrame.
|
115
|
+
# @example Select rows by booleans
|
116
|
+
# penguins[penguins.species == 'Adelie']
|
117
|
+
#
|
118
|
+
# # =>
|
119
|
+
# #<RedAmber::DataFrame : 152 x 8 Vectors, 0x0000000000010658>
|
120
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
121
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
122
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
123
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
124
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
125
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
126
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
127
|
+
# : : : : : : ... :
|
128
|
+
# 149 Adelie Dream 37.8 18.1 193 ... 2009
|
129
|
+
# 150 Adelie Dream 36.0 17.1 187 ... 2009
|
130
|
+
# 151 Adelie Dream 41.5 18.5 201 ... 2009
|
131
|
+
#
|
8
132
|
def [](*args)
|
9
|
-
|
10
|
-
|
11
|
-
|
133
|
+
raise DataFrameArgumentError, 'self is an empty dataframe' if empty?
|
134
|
+
|
135
|
+
case args
|
136
|
+
in [] | [nil]
|
137
|
+
return remove_all_values
|
138
|
+
in [(Symbol | String) => k] if key? k
|
139
|
+
return variables[k.to_sym]
|
140
|
+
in [Integer => i]
|
141
|
+
return take([i.negative? ? i + size : i])
|
142
|
+
in [Vector => v]
|
143
|
+
arrow_array = v.data
|
144
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
145
|
+
arrow_array = aa
|
146
|
+
else
|
147
|
+
a = parse_args(args, size)
|
148
|
+
return select_variables_by_keys(a) if a.symbol?
|
149
|
+
return take(normalize_indices(Arrow::Array.new(a))) if a.integer?
|
150
|
+
return remove_all_values if a.compact.empty?
|
151
|
+
return filter_by_array(Arrow::BooleanArray.new(a)) if a.boolean?
|
152
|
+
|
153
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
154
|
+
end
|
155
|
+
|
156
|
+
return take(normalize_indices(arrow_array)) if arrow_array.numeric?
|
157
|
+
return filter_by_array(arrow_array) if arrow_array.boolean?
|
12
158
|
|
13
|
-
|
14
|
-
if
|
15
|
-
return filter_by_vector(vector.data) if vector.size == size
|
159
|
+
a = arrow_array.to_a
|
160
|
+
return select_variables_by_keys(a) if a.symbol_or_string?
|
16
161
|
|
17
|
-
|
162
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
163
|
+
end
|
164
|
+
|
165
|
+
# Select a variable by String or Symbol and return as a Vector.
|
166
|
+
#
|
167
|
+
# @param key [Symbol, String]
|
168
|
+
# key name to select.
|
169
|
+
# @return [Vector]
|
170
|
+
# selected variable as a Vector.
|
171
|
+
# @note #v(key) is faster than #[](key).
|
172
|
+
# @example Select a column and return Vector
|
173
|
+
# penguins.v(:bill_length_mm)
|
174
|
+
#
|
175
|
+
# # =>
|
176
|
+
# #<RedAmber::Vector(:double, size=344):0x000000000000f140>
|
177
|
+
# [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
|
178
|
+
#
|
179
|
+
def v(key)
|
180
|
+
unless key.is_a?(Symbol) || key.is_a?(String)
|
181
|
+
raise DataFrameArgumentError, "Key is not a Symbol or a String: [#{key}]"
|
18
182
|
end
|
19
|
-
|
20
|
-
return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.dictionary?
|
183
|
+
raise DataFrameArgumentError, "Key does not exist: [#{key}]" unless key? key
|
21
184
|
|
22
|
-
|
185
|
+
variables[key.to_sym]
|
23
186
|
end
|
24
187
|
|
25
|
-
#
|
188
|
+
# Select records to create a DataFrame.
|
189
|
+
#
|
190
|
+
# @overload slice(row)
|
191
|
+
# Select a record and return a DataFrame.
|
192
|
+
#
|
193
|
+
# @param row [Integer, Float]
|
194
|
+
# a row index to select.
|
195
|
+
# @return [DataFrame]
|
196
|
+
# selected records as a DataFrame.
|
197
|
+
# @example Select a row
|
198
|
+
# penguins
|
199
|
+
#
|
200
|
+
# # =>
|
201
|
+
# #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
|
202
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
203
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
204
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
205
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
206
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
207
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
208
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
209
|
+
# : : : : : : ... :
|
210
|
+
# 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
211
|
+
# 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
212
|
+
# 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
213
|
+
# penguins.slice(2)
|
214
|
+
#
|
215
|
+
# # =>
|
216
|
+
# #<RedAmber::DataFrame : 1 x 8 Vectors, 0x00000000000039d0>
|
217
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
218
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
219
|
+
# 0 Adelie Torgersen 40.3 18.0 195 ... 2007
|
220
|
+
#
|
221
|
+
# @overload slice(rows)
|
222
|
+
# Select records and return a DataFrame.
|
223
|
+
# - Duplicated selection is acceptable. The same record will be returned.
|
224
|
+
# - The order of records will be the same as specified indices.
|
225
|
+
#
|
226
|
+
# @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
|
227
|
+
# row indices to select.
|
228
|
+
# @return [DataFrame]
|
229
|
+
# selected records as a DataFrame.
|
230
|
+
# @example Select rows
|
231
|
+
# penguins.slice(300..-1)
|
232
|
+
#
|
233
|
+
# # =>
|
234
|
+
# #<RedAmber::DataFrame : 44 x 8 Vectors, 0x000000000000fb54>
|
235
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
236
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
237
|
+
# 0 Gentoo Biscoe 49.1 14.5 212 ... 2009
|
238
|
+
# 1 Gentoo Biscoe 52.5 15.6 221 ... 2009
|
239
|
+
# 2 Gentoo Biscoe 47.4 14.6 212 ... 2009
|
240
|
+
# 3 Gentoo Biscoe 50.0 15.9 224 ... 2009
|
241
|
+
# 4 Gentoo Biscoe 44.9 13.8 212 ... 2009
|
242
|
+
# : : : : : : ... :
|
243
|
+
# 41 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
244
|
+
# 42 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
245
|
+
# 43 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
246
|
+
#
|
247
|
+
# @overload slice(enumerator)
|
248
|
+
# Select records and return a DataFrame.
|
249
|
+
# - Duplicated selection is acceptable. The same record will be returned.
|
250
|
+
# - The order of records will be the same as specified indices.
|
251
|
+
#
|
252
|
+
# @param enumerator [Enumerator]
|
253
|
+
# an enumerator which returns row indices to select.
|
254
|
+
# @return [DataFrame]
|
255
|
+
# selected records as a DataFrame.
|
256
|
+
# @example Select rows by Enumerator.
|
257
|
+
# penguins.assign_left(index: penguins.indices) # 0.2.0 feature
|
258
|
+
# .slice(0.step(by: 10, to: 340))
|
259
|
+
#
|
260
|
+
# # =>
|
261
|
+
# #<RedAmber::DataFrame : 35 x 9 Vectors, 0x000000000000f2e4>
|
262
|
+
# index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
263
|
+
# <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
|
264
|
+
# 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
265
|
+
# 1 10 Adelie Torgersen 37.8 17.1 186 ... 2007
|
266
|
+
# 2 20 Adelie Biscoe 37.8 18.3 174 ... 2007
|
267
|
+
# 3 30 Adelie Dream 39.5 16.7 178 ... 2007
|
268
|
+
# 4 40 Adelie Dream 36.5 18.0 182 ... 2007
|
269
|
+
# : : : : : : : ... :
|
270
|
+
# 32 320 Gentoo Biscoe 48.5 15.0 219 ... 2009
|
271
|
+
# 33 330 Gentoo Biscoe 50.5 15.2 216 ... 2009
|
272
|
+
# 34 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
|
273
|
+
#
|
274
|
+
# @overload slice
|
275
|
+
# Select records by indices with block and return a DataFrame.
|
276
|
+
# - Duplicated selection is acceptable. The same record will be returned.
|
277
|
+
# - The order of records will be the same as specified indices.
|
278
|
+
#
|
279
|
+
# @yieldparam self [DataFrame]
|
280
|
+
# gives self to the block.
|
281
|
+
# The block is evaluated within the context of self.
|
282
|
+
# @yieldreturn [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array, Enumerator]
|
283
|
+
# row indices to select.
|
284
|
+
# @return [DataFrame]
|
285
|
+
# selected records as a DataFrame.
|
286
|
+
# @example Select rows by block
|
287
|
+
# penguins.assign_left(index: penguins.indices) # 0.2.0 feature
|
288
|
+
# .slice { 0.step(by: 100, to: 300).map { |i| i..(i+1) } }
|
289
|
+
#
|
290
|
+
# # =>
|
291
|
+
# #<RedAmber::DataFrame : 8 x 9 Vectors, 0x000000000000f3ac>
|
292
|
+
# index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
293
|
+
# <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
|
294
|
+
# 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
295
|
+
# 1 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
296
|
+
# 2 100 Adelie Biscoe 35.0 17.9 192 ... 2009
|
297
|
+
# 3 101 Adelie Biscoe 41.0 20.0 203 ... 2009
|
298
|
+
# 4 200 Chinstrap Dream 51.5 18.7 187 ... 2009
|
299
|
+
# 5 201 Chinstrap Dream 49.8 17.3 198 ... 2009
|
300
|
+
# 6 300 Gentoo Biscoe 49.1 14.5 212 ... 2009
|
301
|
+
# 7 301 Gentoo Biscoe 52.5 15.6 221 ... 2009
|
302
|
+
#
|
303
|
+
# @overload slice(booleans)
|
304
|
+
# Select records by filtering with booleans and return a DataFrame.
|
305
|
+
#
|
306
|
+
# @param booleans [<Boolean, nil>, Vector, Arrow::Array]
|
307
|
+
# a boolean filter.
|
308
|
+
# @return [DataFrame]
|
309
|
+
# filtered records as a DataFrame.
|
310
|
+
# @example Select rows by boolean filter
|
311
|
+
# penguins.slice(penguins[:bill_length_mm] > 50)
|
312
|
+
#
|
313
|
+
# # =>
|
314
|
+
# #<RedAmber::DataFrame : 52 x 8 Vectors, 0x000000000000fd98>
|
315
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
316
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
317
|
+
# 0 Chinstrap Dream 51.3 19.2 193 ... 2007
|
318
|
+
# 1 Chinstrap Dream 52.7 19.8 197 ... 2007
|
319
|
+
# 2 Chinstrap Dream 51.3 18.2 197 ... 2007
|
320
|
+
# 3 Chinstrap Dream 51.3 19.9 198 ... 2007
|
321
|
+
# 4 Chinstrap Dream 51.7 20.3 194 ... 2007
|
322
|
+
# : : : : : : ... :
|
323
|
+
# 49 Gentoo Biscoe 51.5 16.3 230 ... 2009
|
324
|
+
# 50 Gentoo Biscoe 55.1 16.0 230 ... 2009
|
325
|
+
# 51 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
326
|
+
#
|
327
|
+
# @overload slice
|
328
|
+
# Select records by filtering with block and return a DataFrame.
|
329
|
+
#
|
330
|
+
# @yieldparam self [DataFrame]
|
331
|
+
# gives self to the block.
|
332
|
+
# The block is evaluated within the context of self.
|
333
|
+
# @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
|
334
|
+
# a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
|
335
|
+
# @return [DataFrame]
|
336
|
+
# filtered records as a DataFrame.
|
337
|
+
# @example Select rows by booleans from block
|
338
|
+
# penguins.slice { indices.map(&:even?) }
|
339
|
+
#
|
340
|
+
# # =>
|
341
|
+
# #<RedAmber::DataFrame : 172 x 8 Vectors, 0x000000000000ff78>
|
342
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
343
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
344
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
345
|
+
# 1 Adelie Torgersen 40.3 18.0 195 ... 2007
|
346
|
+
# 2 Adelie Torgersen 36.7 19.3 193 ... 2007
|
347
|
+
# 3 Adelie Torgersen 38.9 17.8 181 ... 2007
|
348
|
+
# 4 Adelie Torgersen 34.1 18.1 193 ... 2007
|
349
|
+
# : : : : : : ... :
|
350
|
+
# 169 Gentoo Biscoe 47.2 13.7 214 ... 2009
|
351
|
+
# 170 Gentoo Biscoe 46.8 14.3 215 ... 2009
|
352
|
+
# 171 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
353
|
+
#
|
26
354
|
def slice(*args, &block)
|
27
|
-
|
355
|
+
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
356
|
+
|
28
357
|
if block
|
29
|
-
|
358
|
+
unless args.empty?
|
359
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.'
|
360
|
+
end
|
30
361
|
|
31
|
-
|
362
|
+
args = [instance_eval(&block)]
|
32
363
|
end
|
33
|
-
slicer.flatten!
|
34
364
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
365
|
+
arrow_array =
|
366
|
+
case args
|
367
|
+
in [] | [[]]
|
368
|
+
return remove_all_values
|
369
|
+
in [Vector => v]
|
370
|
+
v.data
|
371
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
372
|
+
aa
|
373
|
+
else
|
374
|
+
Arrow::Array.new(parse_args(args, size))
|
375
|
+
end
|
41
376
|
|
42
|
-
|
377
|
+
if arrow_array.numeric?
|
378
|
+
take(normalize_indices(arrow_array))
|
379
|
+
elsif arrow_array.boolean?
|
380
|
+
filter_by_array(arrow_array)
|
381
|
+
elsif arrow_array.to_a.compact.empty?
|
382
|
+
# Ruby 3.0.4 does not accept Arrow::Array#compact here. 2.7.6 and 3.1.2 are OK.
|
383
|
+
remove_all_values
|
384
|
+
else
|
385
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
43
386
|
end
|
44
|
-
return take_by_array(vector) if vector.numeric?
|
45
|
-
|
46
|
-
raise DataFrameArgumentError, "Invalid argument #{slicer}"
|
47
387
|
end
|
48
388
|
|
389
|
+
# Select records by a column specified by a key
|
390
|
+
# and corresponding record with a block.
|
391
|
+
#
|
392
|
+
# @overload slice_by(key)
|
393
|
+
# Select records by elements.
|
394
|
+
#
|
395
|
+
# @param key [Symbol, String]
|
396
|
+
# a key to select column.
|
397
|
+
# @param keep_key [true, false]
|
398
|
+
# preserve column specified by key in the result if true.
|
399
|
+
# @yieldparam self [DataFrame]
|
400
|
+
# gives self to the block.
|
401
|
+
# The block is evaluated within the context of self.
|
402
|
+
# @yieldreturn [<elements>]
|
403
|
+
# array of elements to select.
|
404
|
+
# @return [DataFrame]
|
405
|
+
# selected records as a DataFrame.
|
406
|
+
# @example Select records by elements
|
407
|
+
# df
|
408
|
+
#
|
409
|
+
# # =>
|
410
|
+
# #<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000069e60>
|
411
|
+
# index float string
|
412
|
+
# <uint8> <double> <string>
|
413
|
+
# 0 0 0.0 A
|
414
|
+
# 1 1 1.1 B
|
415
|
+
# 2 2 2.2 C
|
416
|
+
# 3 3 NaN D
|
417
|
+
# 4 (nil) (nil) (nil)
|
418
|
+
#
|
419
|
+
# df.slice_by(:string) { ["A", "C"] }
|
420
|
+
#
|
421
|
+
# # =>
|
422
|
+
# #<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000001b1ac>
|
423
|
+
# index float
|
424
|
+
# <uint8> <double>
|
425
|
+
# 0 0 0.0
|
426
|
+
# 1 2 2.2
|
427
|
+
#
|
428
|
+
# @overload slice_by(key)
|
429
|
+
# Select records by elements range.
|
430
|
+
#
|
431
|
+
# @param key [Symbol, String]
|
432
|
+
# a key to select column.
|
433
|
+
# @param keep_key [true, false]
|
434
|
+
# preserve column specified by key in the result if true.
|
435
|
+
# @yieldparam self [DataFrame]
|
436
|
+
# gives self to the block.
|
437
|
+
# The block is evaluated within the context of self.
|
438
|
+
# @yieldreturn [Range]
|
439
|
+
# specifies position of elements at the start and the end and
|
440
|
+
# select records between them.
|
441
|
+
# @return [DataFrame]
|
442
|
+
# selected records as a DataFrame.
|
443
|
+
# @example Select records by elements range
|
444
|
+
# df.slice_by(:string) { "A".."C" }
|
445
|
+
#
|
446
|
+
# # =>
|
447
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000069668>
|
448
|
+
# index float
|
449
|
+
# <uint8> <double>
|
450
|
+
# 0 0 0.0
|
451
|
+
# 1 1 1.1
|
452
|
+
# 2 2 2.2
|
453
|
+
#
|
454
|
+
# @since 0.2.1
|
455
|
+
#
|
49
456
|
def slice_by(key, keep_key: false, &block)
|
50
457
|
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
51
458
|
raise DataFrameArgumentError, 'No block given' unless block
|
52
|
-
raise DataFrameArgumentError, "#{key} is
|
459
|
+
raise DataFrameArgumentError, "#{key} is not a key of self" unless key?(key)
|
53
460
|
return self if key.nil?
|
54
461
|
|
55
462
|
slicer = instance_eval(&block)
|
@@ -83,160 +490,414 @@ module RedAmber
|
|
83
490
|
slicer = slicer.map { |x| x.is_a?(String) ? self[key].index(x) : x }
|
84
491
|
end
|
85
492
|
|
86
|
-
|
87
|
-
|
88
|
-
else
|
89
|
-
take(slicer).drop(key)
|
90
|
-
end
|
493
|
+
taken = take(normalize_indices(Arrow::Array.new(slicer)))
|
494
|
+
keep_key ? taken : taken.drop(key)
|
91
495
|
end
|
92
496
|
|
93
|
-
#
|
94
|
-
|
95
|
-
|
497
|
+
# Select records by filtering with booleans to create a DataFrame.
|
498
|
+
#
|
499
|
+
# @overload filter(booleans)
|
500
|
+
# Select records by filtering with booleans and return a DataFrame.
|
501
|
+
#
|
502
|
+
# @param booleans [<Boolean, nil>, Vector, Arrow::Array]
|
503
|
+
# a boolean filter.
|
504
|
+
# @return [DataFrame]
|
505
|
+
# filtered records as a DataFrame.
|
506
|
+
# @example Filter by boolean Vector
|
507
|
+
# penguins
|
508
|
+
#
|
509
|
+
# # =>
|
510
|
+
# #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
|
511
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
512
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
513
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
514
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
515
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
516
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
517
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
518
|
+
# : : : : : : ... :
|
519
|
+
# 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
520
|
+
# 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
521
|
+
# 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
522
|
+
#
|
523
|
+
# penguins.filter(penguins.bill_length_mm < 50)
|
524
|
+
#
|
525
|
+
# # =>
|
526
|
+
# #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101a8>
|
527
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
528
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
529
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
530
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
531
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
532
|
+
# 3 Adelie Torgersen 36.7 19.3 193 ... 2007
|
533
|
+
# 4 Adelie Torgersen 39.3 20.6 190 ... 2007
|
534
|
+
# : : : : : : ... :
|
535
|
+
# 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
|
536
|
+
# 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
537
|
+
# 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
538
|
+
#
|
539
|
+
# @overload filter
|
540
|
+
# Select records by filtering with block and return a DataFrame.
|
541
|
+
#
|
542
|
+
# @yieldparam self [DataFrame]
|
543
|
+
# gives self to the block.
|
544
|
+
# The block is evaluated within the context of self.
|
545
|
+
# @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
|
546
|
+
# a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
|
547
|
+
# @return [DataFrame]
|
548
|
+
# filtered records as a DataFrame.
|
549
|
+
# @example Filter by boolean Vector
|
550
|
+
# penguins.filter { bill_length_mm < 50 }
|
551
|
+
#
|
552
|
+
# # =>
|
553
|
+
# #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101bc>
|
554
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
555
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
556
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
557
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
558
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
559
|
+
# 3 Adelie Torgersen 36.7 19.3 193 ... 2007
|
560
|
+
# 4 Adelie Torgersen 39.3 20.6 190 ... 2007
|
561
|
+
# : : : : : : ... :
|
562
|
+
# 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
|
563
|
+
# 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
564
|
+
# 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
565
|
+
#
|
566
|
+
def filter(*booleans, &block)
|
567
|
+
booleans.flatten!
|
568
|
+
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
569
|
+
|
96
570
|
if block
|
97
|
-
|
571
|
+
unless booleans.empty?
|
572
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.'
|
573
|
+
end
|
574
|
+
|
575
|
+
booleans = [instance_eval(&block)]
|
576
|
+
end
|
577
|
+
|
578
|
+
case booleans
|
579
|
+
in [] | [[]]
|
580
|
+
return remove_all_values
|
581
|
+
in [Vector => v] if v.boolean?
|
582
|
+
filter_by_array(v.data)
|
583
|
+
in [Arrow::ChunkedArray => ca] if ca.boolean?
|
584
|
+
filter_by_array(ca)
|
585
|
+
in [Arrow::BooleanArray => b]
|
586
|
+
filter_by_array(b)
|
587
|
+
else
|
588
|
+
a = Arrow::Array.new(parse_args(booleans, size))
|
589
|
+
unless a.boolean?
|
590
|
+
raise DataFrameArgumentError, "not a boolean filter: #{booleans}"
|
591
|
+
end
|
98
592
|
|
99
|
-
|
593
|
+
filter_by_array(a)
|
100
594
|
end
|
101
|
-
|
595
|
+
end
|
102
596
|
|
103
|
-
|
104
|
-
|
597
|
+
# Select records and remove them to create a remainder DataFrame.
|
598
|
+
#
|
599
|
+
# @overload remove(row)
|
600
|
+
# Select a record and remove it to create a remainder DataFrame.
|
601
|
+
# - The order of records in self will be preserved.
|
602
|
+
#
|
603
|
+
# @param row [Integer, Float]
|
604
|
+
# a row index to remove.
|
605
|
+
# @return [DataFrame]
|
606
|
+
# remainder variables as a DataFrame.
|
607
|
+
# @example Remove a row
|
608
|
+
# penguins.remove(-1)
|
609
|
+
#
|
610
|
+
# # =>
|
611
|
+
# #<RedAmber::DataFrame : 343 x 8 Vectors, 0x0000000000010310>
|
612
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
613
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
614
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
615
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
616
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
617
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
618
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
619
|
+
# : : : : : : ... :
|
620
|
+
# 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
|
621
|
+
# 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
622
|
+
# 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
623
|
+
#
|
624
|
+
# @overload remove(rows)
|
625
|
+
# Select records and remove them to create a remainder DataFrame.
|
626
|
+
# - Duplicated selection is acceptable.
|
627
|
+
# - The order of records in self will be preserved.
|
628
|
+
#
|
629
|
+
# @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
|
630
|
+
# row indices to remove.
|
631
|
+
# @return [DataFrame]
|
632
|
+
# remainder variables as a DataFrame.
|
633
|
+
# @example Remove rows
|
634
|
+
# penguins.remove(100..200)
|
635
|
+
#
|
636
|
+
# # =>
|
637
|
+
# #<RedAmber::DataFrame : 243 x 8 Vectors, 0x0000000000010450>
|
638
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
639
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
640
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
641
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
642
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
643
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
644
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
645
|
+
# : : : : : : ... :
|
646
|
+
# 240 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
647
|
+
# 241 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
648
|
+
# 242 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
649
|
+
#
|
650
|
+
# @overload remove
|
651
|
+
# Select records by indices from block
|
652
|
+
# and remove them to create a remainder DataFrame.
|
653
|
+
# - Duplicated selection is acceptable.
|
654
|
+
# - The order of records in self will be preserved.
|
655
|
+
#
|
656
|
+
# @yieldparam self [DataFrame]
|
657
|
+
# gives self to the block.
|
658
|
+
# The block is evaluated within the context of self.
|
659
|
+
# @yieldreturn [<Integer, Float>, Range<Integer>, Vector, Arrow::Array]
|
660
|
+
# row indices to remove.
|
661
|
+
# @return [DataFrame]
|
662
|
+
# remainder variables as a DataFrame.
|
663
|
+
# @example Remove rows by indices from block
|
664
|
+
# penguins.remove { 0.step(size, 10) }
|
665
|
+
#
|
666
|
+
# # =>
|
667
|
+
# #<RedAmber::DataFrame : 309 x 8 Vectors, 0x00000000000104c8>
|
668
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
669
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
670
|
+
# 0 Adelie Torgersen 39.5 17.4 186 ... 2007
|
671
|
+
# 1 Adelie Torgersen 40.3 18.0 195 ... 2007
|
672
|
+
# 2 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
673
|
+
# 3 Adelie Torgersen 36.7 19.3 193 ... 2007
|
674
|
+
# 4 Adelie Torgersen 39.3 20.6 190 ... 2007
|
675
|
+
# : : : : : : ... :
|
676
|
+
# 306 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
677
|
+
# 307 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
678
|
+
# 308 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
679
|
+
#
|
680
|
+
# @overload remove(booleans)
|
681
|
+
# Select records by filtering with booleans and return a DataFrame.
|
682
|
+
# - The order of records in self will be preserved.
|
683
|
+
#
|
684
|
+
# @param booleans [<Boolean, nil>, Vector, Arrow::Array]
|
685
|
+
# a boolean filter to remove.
|
686
|
+
# @return [DataFrame]
|
687
|
+
# remainder records as a DataFrame.
|
688
|
+
# @example Remove rows by boolean filter
|
689
|
+
# penguins.remove(penguins.bill_length_mm.is_nil)
|
690
|
+
#
|
691
|
+
# # =>
|
692
|
+
# #<RedAmber::DataFrame : 342 x 8 Vectors, 0x0000000000010234>
|
693
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
694
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
695
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
696
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
697
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
698
|
+
# 3 Adelie Torgersen 36.7 19.3 193 ... 2007
|
699
|
+
# 4 Adelie Torgersen 39.3 20.6 190 ... 2007
|
700
|
+
# : : : : : : ... :
|
701
|
+
# 339 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
702
|
+
# 340 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
703
|
+
# 341 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
704
|
+
#
|
705
|
+
# @overload remove
|
706
|
+
# Select records by booleans from block
|
707
|
+
# and remove them to create a remainder DataFrame.
|
708
|
+
# - The order of records in self will be preserved.
|
709
|
+
#
|
710
|
+
# @yieldparam self [DataFrame]
|
711
|
+
# gives self to the block.
|
712
|
+
# The block is evaluated within the context of self.
|
713
|
+
# @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
|
714
|
+
# a boolean filter to remove. `Vector` or `Arrow::Array` must be boolean type.
|
715
|
+
# @return [DataFrame]
|
716
|
+
# remainder records as a DataFrame.
|
717
|
+
# @example Remove rows by booleans from block
|
718
|
+
# penguins.remove { (species == 'Adelie') | (year == 2009) }
|
719
|
+
#
|
720
|
+
# # =>
|
721
|
+
# #<RedAmber::DataFrame : 124 x 8 Vectors, 0x00000000000102fc>
|
722
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
723
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
724
|
+
# 0 Chinstrap Dream 46.5 17.9 192 ... 2007
|
725
|
+
# 1 Chinstrap Dream 50.0 19.5 196 ... 2007
|
726
|
+
# 2 Chinstrap Dream 51.3 19.2 193 ... 2007
|
727
|
+
# 3 Chinstrap Dream 45.4 18.7 188 ... 2007
|
728
|
+
# 4 Chinstrap Dream 52.7 19.8 197 ... 2007
|
729
|
+
# : : : : : : ... :
|
730
|
+
# 121 Gentoo Biscoe 51.1 16.3 220 ... 2008
|
731
|
+
# 122 Gentoo Biscoe 45.2 13.8 215 ... 2008
|
732
|
+
# 123 Gentoo Biscoe 45.2 16.4 223 ... 2008
|
733
|
+
#
|
734
|
+
def remove(*args, &block)
|
735
|
+
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
105
736
|
|
106
|
-
|
107
|
-
|
108
|
-
|
737
|
+
if block
|
738
|
+
unless args.empty?
|
739
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.'
|
740
|
+
end
|
109
741
|
|
110
|
-
|
742
|
+
args = [instance_eval(&block)]
|
111
743
|
end
|
112
|
-
if vector.numeric?
|
113
|
-
raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
|
114
744
|
|
115
|
-
|
116
|
-
|
117
|
-
|
745
|
+
arrow_array =
|
746
|
+
case args
|
747
|
+
in [] | [[]] | [nil]
|
748
|
+
return self
|
749
|
+
in [Vector => v]
|
750
|
+
v.data
|
751
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
752
|
+
aa
|
753
|
+
else
|
754
|
+
Arrow::Array.new(parse_args(args, size))
|
118
755
|
end
|
119
756
|
|
120
|
-
|
121
|
-
|
122
|
-
|
757
|
+
if arrow_array.boolean?
|
758
|
+
filter_by_array(arrow_array.primitive_invert)
|
759
|
+
elsif arrow_array.numeric?
|
760
|
+
remover = normalize_indices(arrow_array).to_a
|
761
|
+
return self if remover.empty?
|
123
762
|
|
124
|
-
|
763
|
+
slicer = indices.to_a - remover.map(&:to_i)
|
764
|
+
return remove_all_values if slicer.empty?
|
125
765
|
|
126
|
-
|
127
|
-
|
766
|
+
take(slicer)
|
767
|
+
else
|
768
|
+
raise DataFrameArgumentError, "Invalid argument #{args}"
|
128
769
|
end
|
129
|
-
|
130
|
-
raise DataFrameArgumentError, "Invalid argument #{remover}"
|
131
770
|
end
|
132
771
|
|
772
|
+
# Remove records (rows) containing any nil.
|
773
|
+
#
|
774
|
+
# @return [DataFrame]
|
775
|
+
# removed DataFrame.
|
776
|
+
# @example
|
777
|
+
# penguins.remove_nil
|
778
|
+
# # =>
|
779
|
+
# #<RedAmber::DataFrame : 333 x 8 Vectors, 0x00000000000039d0>
|
780
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
781
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
782
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
783
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
784
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
785
|
+
# 3 Adelie Torgersen 36.7 19.3 193 ... 2007
|
786
|
+
# 4 Adelie Torgersen 39.3 20.6 190 ... 2007
|
787
|
+
# : : : : : : ... :
|
788
|
+
# 330 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
789
|
+
# 331 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
790
|
+
# 332 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
791
|
+
#
|
133
792
|
def remove_nil
|
134
793
|
func = Arrow::Function.find(:drop_null)
|
135
|
-
DataFrame.
|
794
|
+
DataFrame.create(func.execute([table]).value)
|
136
795
|
end
|
137
796
|
alias_method :drop_nil, :remove_nil
|
138
797
|
|
139
|
-
# Select
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
variables[key.to_sym]
|
147
|
-
end
|
148
|
-
|
798
|
+
# Select records from the top.
|
799
|
+
#
|
800
|
+
# @param n_obs [Integer]
|
801
|
+
# number of records to select.
|
802
|
+
# @return [DataFrame]
|
803
|
+
#
|
149
804
|
def head(n_obs = 5)
|
150
805
|
raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
|
151
806
|
|
152
807
|
self[0...[n_obs, size].min]
|
153
808
|
end
|
154
809
|
|
810
|
+
# Select records from the end.
|
811
|
+
#
|
812
|
+
# @param n_obs [Integer]
|
813
|
+
# number of records to select.
|
814
|
+
# @return [DataFrame]
|
815
|
+
#
|
155
816
|
def tail(n_obs = 5)
|
156
817
|
raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
|
157
818
|
|
158
819
|
self[-[n_obs, size].min..]
|
159
820
|
end
|
160
821
|
|
822
|
+
# Select records from the top.
|
823
|
+
#
|
824
|
+
# @param n_obs [Integer]
|
825
|
+
# number of records to select.
|
826
|
+
# @return [DataFrame]
|
827
|
+
#
|
161
828
|
def first(n_obs = 1)
|
162
829
|
head(n_obs)
|
163
830
|
end
|
164
831
|
|
832
|
+
# Select records from the end.
|
833
|
+
#
|
834
|
+
# @param n_obs [Integer]
|
835
|
+
# number of records to select.
|
836
|
+
# @return [DataFrame]
|
837
|
+
#
|
165
838
|
def last(n_obs = 1)
|
166
839
|
tail(n_obs)
|
167
840
|
end
|
168
841
|
|
169
|
-
#
|
170
|
-
#
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
842
|
+
# Select records by index Array to create a DataFrame.
|
843
|
+
#
|
844
|
+
# - TODO: support for option `boundscheck: true`
|
845
|
+
# - Supports indices in an Arrow::UInt8, UInt16, UInt32, UInt64 or an Array
|
846
|
+
# - Negative index is not supported.
|
847
|
+
# @param index_array [<Integer>, Arrow::Array]
|
848
|
+
# row indices to select.
|
849
|
+
# @return [DataFrame]
|
850
|
+
# selected records as a DataFrame.
|
851
|
+
#
|
852
|
+
# @api private
|
853
|
+
#
|
854
|
+
def take(index_array)
|
855
|
+
DataFrame.create(@table.take(index_array))
|
179
856
|
end
|
180
857
|
|
181
|
-
#
|
182
|
-
# TODO: support for option {null_selection_behavior: :drop}
|
183
|
-
def filter(*booleans)
|
184
|
-
booleans.flatten!
|
185
|
-
return remove_all_values if booleans.empty?
|
186
|
-
|
187
|
-
b = booleans[0]
|
188
|
-
case b
|
189
|
-
when Vector
|
190
|
-
raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
|
191
|
-
|
192
|
-
filter_by_vector(b.data)
|
193
|
-
when Arrow::BooleanArray
|
194
|
-
filter_by_vector(b)
|
195
|
-
else
|
196
|
-
raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)
|
197
|
-
|
198
|
-
filter_by_vector(Arrow::BooleanArray.new(booleans))
|
199
|
-
end
|
200
|
-
end
|
858
|
+
# rubocop:enable Layout/LineLength
|
201
859
|
|
202
860
|
private
|
203
861
|
|
204
|
-
def
|
862
|
+
def select_variables_by_keys(keys)
|
205
863
|
if keys.one?
|
206
864
|
key = keys[0].to_sym
|
207
|
-
raise DataFrameArgumentError, "Key does not exist #{
|
865
|
+
raise DataFrameArgumentError, "Key does not exist: #{key}" unless key? key
|
208
866
|
|
209
867
|
variables[key]
|
868
|
+
# Vector.new(@table.find_column(*key).data)
|
210
869
|
else
|
211
|
-
|
870
|
+
check_duplicate_keys(keys)
|
871
|
+
DataFrame.create(@table.select_columns(*keys))
|
212
872
|
end
|
213
873
|
end
|
214
874
|
|
215
|
-
# Accepts indices by numeric
|
216
|
-
def
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
DataFrame.new(datum.value)
|
875
|
+
# Accepts indices by numeric arrow array and returns positive indices.
|
876
|
+
def normalize_indices(arrow_array)
|
877
|
+
b = Arrow::Function.find(:less).execute([arrow_array, 0])
|
878
|
+
a = Arrow::Function.find(:add).execute([arrow_array, size])
|
879
|
+
r = Arrow::Function.find(:if_else).execute([b, a, arrow_array]).value
|
880
|
+
if r.float?
|
881
|
+
r = Arrow::Function.find(:floor).execute([r]).value
|
882
|
+
Arrow::UInt64ArrayBuilder.build(r)
|
883
|
+
else
|
884
|
+
r
|
885
|
+
end
|
227
886
|
end
|
228
887
|
|
229
|
-
# Accepts booleans by Arrow::BooleanArray
|
230
|
-
def
|
231
|
-
|
888
|
+
# Accepts booleans by an Arrow::BooleanArray or an Array
|
889
|
+
def filter_by_array(boolean_array)
|
890
|
+
unless boolean_array.length == size
|
891
|
+
raise DataFrameArgumentError, 'Booleans must be same size as self.'
|
892
|
+
end
|
232
893
|
|
233
894
|
datum = Arrow::Function.find(:filter).execute([table, boolean_array])
|
234
|
-
DataFrame.
|
895
|
+
DataFrame.create(datum.value)
|
235
896
|
end
|
236
897
|
|
237
898
|
# return a DataFrame with same keys as self without values
|
238
899
|
def remove_all_values
|
239
|
-
|
900
|
+
filter_by_array(Arrow::BooleanArray.new([false] * size))
|
240
901
|
end
|
241
902
|
end
|
242
903
|
end
|