red_amber 0.2.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +133 -51
- data/.yardopts +2 -0
- data/CHANGELOG.md +203 -1
- data/Gemfile +2 -1
- data/LICENSE +1 -1
- data/README.md +61 -45
- data/benchmark/basic.yml +11 -4
- data/benchmark/combine.yml +3 -4
- data/benchmark/dataframe.yml +62 -0
- data/benchmark/group.yml +7 -1
- data/benchmark/reshape.yml +6 -2
- data/benchmark/vector.yml +63 -0
- data/doc/DataFrame.md +35 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +295 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +537 -68
- data/lib/red_amber/data_frame_combinable.rb +776 -123
- data/lib/red_amber/data_frame_displayable.rb +248 -18
- data/lib/red_amber/data_frame_indexable.rb +122 -19
- data/lib/red_amber/data_frame_loadsave.rb +81 -10
- data/lib/red_amber/data_frame_reshaping.rb +216 -21
- data/lib/red_amber/data_frame_selectable.rb +781 -120
- data/lib/red_amber/data_frame_variable_operation.rb +561 -85
- data/lib/red_amber/group.rb +195 -21
- data/lib/red_amber/helper.rb +114 -32
- data/lib/red_amber/refinements.rb +206 -0
- data/lib/red_amber/subframes.rb +1066 -0
- data/lib/red_amber/vector.rb +435 -58
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +387 -0
- data/lib/red_amber/vector_selectable.rb +321 -69
- data/lib/red_amber/vector_unary_element_wise.rb +436 -0
- data/lib/red_amber/vector_updatable.rb +397 -24
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +15 -1
- data/red_amber.gemspec +4 -3
- metadata +19 -11
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -294
@@ -1,55 +1,462 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Mix-in for the class DataFrame
|
5
5
|
module DataFrameSelectable
|
6
|
-
#
|
7
|
-
|
6
|
+
# Array, Arrow::Array and Arrow::ChunkedArray are refined
|
7
|
+
using RefineArray
|
8
|
+
using RefineArrayLike
|
9
|
+
|
10
|
+
# rubocop:disable Layout/LineLength
|
11
|
+
|
12
|
+
# Select variables or records.
|
13
|
+
#
|
14
|
+
# @overload [](key)
|
15
|
+
# Select a single variable (column) and return it as a Vector.
|
16
|
+
#
|
17
|
+
# @param key [Symbol, String]
|
18
|
+
# key name to select.
|
19
|
+
# @return [Vector]
|
20
|
+
# selected variable as a Vector.
|
21
|
+
# @note DataFrame.v(key) is faster to create Vector from a variable.
|
22
|
+
# @example Select a column and return Vector
|
23
|
+
# penguins
|
24
|
+
#
|
25
|
+
# # =>
|
26
|
+
# #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
|
27
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
28
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
29
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
30
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
31
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
32
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
33
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
34
|
+
# : : : : : : ... :
|
35
|
+
# 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
36
|
+
# 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
37
|
+
# 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
38
|
+
#
|
39
|
+
# penguins[:bill_length_mm]
|
40
|
+
#
|
41
|
+
# # =>
|
42
|
+
# #<RedAmber::Vector(:double, size=344):0x00000000000104dc>
|
43
|
+
# [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
|
44
|
+
#
|
45
|
+
# @overload [](keys)
|
46
|
+
# Select variables and return a DataFrame.
|
47
|
+
#
|
48
|
+
# @param keys [<Symbol, String>] key names to select.
|
49
|
+
# @return [DataFrame]
|
50
|
+
# selected variables as a DataFrame.
|
51
|
+
# @example Select columns
|
52
|
+
# penguins[:island, :bill_length_mm]
|
53
|
+
#
|
54
|
+
# # =>
|
55
|
+
# #<RedAmber::DataFrame : 344 x 2 Vectors, 0x00000000000104f0>
|
56
|
+
# island bill_length_mm
|
57
|
+
# <string> <double>
|
58
|
+
# 0 Torgersen 39.1
|
59
|
+
# 1 Torgersen 39.5
|
60
|
+
# 2 Torgersen 40.3
|
61
|
+
# 3 Torgersen (nil)
|
62
|
+
# 4 Torgersen 36.7
|
63
|
+
# : : :
|
64
|
+
# 341 Biscoe 50.4
|
65
|
+
# 342 Biscoe 45.2
|
66
|
+
# 343 Biscoe 49.9
|
67
|
+
#
|
68
|
+
# @overload [](index)
|
69
|
+
# Select a record and return a DataFrame.
|
70
|
+
#
|
71
|
+
# @param index [Integer, Float, Range<Integer>, Vector, Arrow::Array]
|
72
|
+
# index of a row to select.
|
73
|
+
# @return [DataFrame]
|
74
|
+
# selected record as a DataFrame.
|
75
|
+
# @example Select a row
|
76
|
+
# penguins[0]
|
77
|
+
#
|
78
|
+
# # =>
|
79
|
+
# #<RedAmber::DataFrame : 1 x 8 Vectors, 0x0000000000010504>
|
80
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
81
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
82
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
83
|
+
#
|
84
|
+
# @overload [](indices)
|
85
|
+
# Select records by indices and return a DataFrame.
|
86
|
+
#
|
87
|
+
# @param indices [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
|
88
|
+
# indices of rows to select.
|
89
|
+
# @return [DataFrame]
|
90
|
+
# selected records as a DataFrame.
|
91
|
+
# @example Select rows by indices
|
92
|
+
# penguins[0..100]
|
93
|
+
#
|
94
|
+
# # =>
|
95
|
+
# #<RedAmber::DataFrame : 101 x 8 Vectors, 0x00000000000105e0>
|
96
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
97
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
98
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
99
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
100
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
101
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
102
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
103
|
+
# : : : : : : ... :
|
104
|
+
# 98 Adelie Dream 33.1 16.1 178 ... 2008
|
105
|
+
# 99 Adelie Dream 43.2 18.5 192 ... 2008
|
106
|
+
# 100 Adelie Biscoe 35.0 17.9 192 ... 2009
|
107
|
+
#
|
108
|
+
# @overload [](booleans)
|
109
|
+
# Select records by booleans and return a DataFrame.
|
110
|
+
#
|
111
|
+
# @param booleans [Array<true, false, nil>, Vector, Arrow::Array]
|
112
|
+
# booleans of rows to select.
|
113
|
+
# @return [DataFrame]
|
114
|
+
# selected records as a DataFrame.
|
115
|
+
# @example Select rows by booleans
|
116
|
+
# penguins[penguins.species == 'Adelie']
|
117
|
+
#
|
118
|
+
# # =>
|
119
|
+
# #<RedAmber::DataFrame : 152 x 8 Vectors, 0x0000000000010658>
|
120
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
121
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
122
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
123
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
124
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
125
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
126
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
127
|
+
# : : : : : : ... :
|
128
|
+
# 149 Adelie Dream 37.8 18.1 193 ... 2009
|
129
|
+
# 150 Adelie Dream 36.0 17.1 187 ... 2009
|
130
|
+
# 151 Adelie Dream 41.5 18.5 201 ... 2009
|
131
|
+
#
|
8
132
|
def [](*args)
|
9
|
-
|
10
|
-
|
11
|
-
|
133
|
+
raise DataFrameArgumentError, 'self is an empty dataframe' if empty?
|
134
|
+
|
135
|
+
case args
|
136
|
+
in [] | [nil]
|
137
|
+
return remove_all_values
|
138
|
+
in [(Symbol | String) => k] if key? k
|
139
|
+
return variables[k.to_sym]
|
140
|
+
in [Integer => i]
|
141
|
+
return take([i.negative? ? i + size : i])
|
142
|
+
in [Vector => v]
|
143
|
+
arrow_array = v.data
|
144
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
145
|
+
arrow_array = aa
|
146
|
+
else
|
147
|
+
a = parse_args(args, size)
|
148
|
+
return select_variables_by_keys(a) if a.symbol?
|
149
|
+
return take(normalize_indices(Arrow::Array.new(a))) if a.integer?
|
150
|
+
return remove_all_values if a.compact.empty?
|
151
|
+
return filter_by_array(Arrow::BooleanArray.new(a)) if a.boolean?
|
152
|
+
|
153
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
154
|
+
end
|
155
|
+
|
156
|
+
return take(normalize_indices(arrow_array)) if arrow_array.numeric?
|
157
|
+
return filter_by_array(arrow_array) if arrow_array.boolean?
|
12
158
|
|
13
|
-
|
14
|
-
if
|
15
|
-
return filter_by_vector(vector.data) if vector.size == size
|
159
|
+
a = arrow_array.to_a
|
160
|
+
return select_variables_by_keys(a) if a.symbol_or_string?
|
16
161
|
|
17
|
-
|
162
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
163
|
+
end
|
164
|
+
|
165
|
+
# Select a variable by String or Symbol and return as a Vector.
|
166
|
+
#
|
167
|
+
# @param key [Symbol, String]
|
168
|
+
# key name to select.
|
169
|
+
# @return [Vector]
|
170
|
+
# selected variable as a Vector.
|
171
|
+
# @note #v(key) is faster than #[](key).
|
172
|
+
# @example Select a column and return Vector
|
173
|
+
# penguins.v(:bill_length_mm)
|
174
|
+
#
|
175
|
+
# # =>
|
176
|
+
# #<RedAmber::Vector(:double, size=344):0x000000000000f140>
|
177
|
+
# [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
|
178
|
+
#
|
179
|
+
def v(key)
|
180
|
+
unless key.is_a?(Symbol) || key.is_a?(String)
|
181
|
+
raise DataFrameArgumentError, "Key is not a Symbol or a String: [#{key}]"
|
18
182
|
end
|
19
|
-
|
20
|
-
return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.dictionary?
|
183
|
+
raise DataFrameArgumentError, "Key does not exist: [#{key}]" unless key? key
|
21
184
|
|
22
|
-
|
185
|
+
variables[key.to_sym]
|
23
186
|
end
|
24
187
|
|
25
|
-
#
|
188
|
+
# Select records to create a DataFrame.
|
189
|
+
#
|
190
|
+
# @overload slice(row)
|
191
|
+
# Select a record and return a DataFrame.
|
192
|
+
#
|
193
|
+
# @param row [Integer, Float]
|
194
|
+
# a row index to select.
|
195
|
+
# @return [DataFrame]
|
196
|
+
# selected records as a DataFrame.
|
197
|
+
# @example Select a row
|
198
|
+
# penguins
|
199
|
+
#
|
200
|
+
# # =>
|
201
|
+
# #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
|
202
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
203
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
204
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
205
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
206
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
207
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
208
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
209
|
+
# : : : : : : ... :
|
210
|
+
# 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
211
|
+
# 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
212
|
+
# 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
213
|
+
# penguins.slice(2)
|
214
|
+
#
|
215
|
+
# # =>
|
216
|
+
# #<RedAmber::DataFrame : 1 x 8 Vectors, 0x00000000000039d0>
|
217
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
218
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
219
|
+
# 0 Adelie Torgersen 40.3 18.0 195 ... 2007
|
220
|
+
#
|
221
|
+
# @overload slice(rows)
|
222
|
+
# Select records and return a DataFrame.
|
223
|
+
# - Duplicated selection is acceptable. The same record will be returned.
|
224
|
+
# - The order of records will be the same as specified indices.
|
225
|
+
#
|
226
|
+
# @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
|
227
|
+
# row indices to select.
|
228
|
+
# @return [DataFrame]
|
229
|
+
# selected records as a DataFrame.
|
230
|
+
# @example Select rows
|
231
|
+
# penguins.slice(300..-1)
|
232
|
+
#
|
233
|
+
# # =>
|
234
|
+
# #<RedAmber::DataFrame : 44 x 8 Vectors, 0x000000000000fb54>
|
235
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
236
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
237
|
+
# 0 Gentoo Biscoe 49.1 14.5 212 ... 2009
|
238
|
+
# 1 Gentoo Biscoe 52.5 15.6 221 ... 2009
|
239
|
+
# 2 Gentoo Biscoe 47.4 14.6 212 ... 2009
|
240
|
+
# 3 Gentoo Biscoe 50.0 15.9 224 ... 2009
|
241
|
+
# 4 Gentoo Biscoe 44.9 13.8 212 ... 2009
|
242
|
+
# : : : : : : ... :
|
243
|
+
# 41 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
244
|
+
# 42 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
245
|
+
# 43 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
246
|
+
#
|
247
|
+
# @overload slice(enumerator)
|
248
|
+
# Select records and return a DataFrame.
|
249
|
+
# - Duplicated selection is acceptable. The same record will be returned.
|
250
|
+
# - The order of records will be the same as specified indices.
|
251
|
+
#
|
252
|
+
# @param enumerator [Enumerator]
|
253
|
+
# an enumerator which returns row indices to select.
|
254
|
+
# @return [DataFrame]
|
255
|
+
# selected records as a DataFrame.
|
256
|
+
# @example Select rows by Enumerator.
|
257
|
+
# penguins.assign_left(index: penguins.indices) # 0.2.0 feature
|
258
|
+
# .slice(0.step(by: 10, to: 340))
|
259
|
+
#
|
260
|
+
# # =>
|
261
|
+
# #<RedAmber::DataFrame : 35 x 9 Vectors, 0x000000000000f2e4>
|
262
|
+
# index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
263
|
+
# <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
|
264
|
+
# 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
265
|
+
# 1 10 Adelie Torgersen 37.8 17.1 186 ... 2007
|
266
|
+
# 2 20 Adelie Biscoe 37.8 18.3 174 ... 2007
|
267
|
+
# 3 30 Adelie Dream 39.5 16.7 178 ... 2007
|
268
|
+
# 4 40 Adelie Dream 36.5 18.0 182 ... 2007
|
269
|
+
# : : : : : : : ... :
|
270
|
+
# 32 320 Gentoo Biscoe 48.5 15.0 219 ... 2009
|
271
|
+
# 33 330 Gentoo Biscoe 50.5 15.2 216 ... 2009
|
272
|
+
# 34 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
|
273
|
+
#
|
274
|
+
# @overload slice
|
275
|
+
# Select records by indices with block and return a DataFrame.
|
276
|
+
# - Duplicated selection is acceptable. The same record will be returned.
|
277
|
+
# - The order of records will be the same as specified indices.
|
278
|
+
#
|
279
|
+
# @yieldparam self [DataFrame]
|
280
|
+
# gives self to the block.
|
281
|
+
# The block is evaluated within the context of self.
|
282
|
+
# @yieldreturn [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array, Enumerator]
|
283
|
+
# row indices to select.
|
284
|
+
# @return [DataFrame]
|
285
|
+
# selected records as a DataFrame.
|
286
|
+
# @example Select rows by block
|
287
|
+
# penguins.assign_left(index: penguins.indices) # 0.2.0 feature
|
288
|
+
# .slice { 0.step(by: 100, to: 300).map { |i| i..(i+1) } }
|
289
|
+
#
|
290
|
+
# # =>
|
291
|
+
# #<RedAmber::DataFrame : 8 x 9 Vectors, 0x000000000000f3ac>
|
292
|
+
# index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
293
|
+
# <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
|
294
|
+
# 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
295
|
+
# 1 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
296
|
+
# 2 100 Adelie Biscoe 35.0 17.9 192 ... 2009
|
297
|
+
# 3 101 Adelie Biscoe 41.0 20.0 203 ... 2009
|
298
|
+
# 4 200 Chinstrap Dream 51.5 18.7 187 ... 2009
|
299
|
+
# 5 201 Chinstrap Dream 49.8 17.3 198 ... 2009
|
300
|
+
# 6 300 Gentoo Biscoe 49.1 14.5 212 ... 2009
|
301
|
+
# 7 301 Gentoo Biscoe 52.5 15.6 221 ... 2009
|
302
|
+
#
|
303
|
+
# @overload slice(booleans)
|
304
|
+
# Select records by filtering with booleans and return a DataFrame.
|
305
|
+
#
|
306
|
+
# @param booleans [<Boolean, nil>, Vector, Arrow::Array]
|
307
|
+
# a boolean filter.
|
308
|
+
# @return [DataFrame]
|
309
|
+
# filtered records as a DataFrame.
|
310
|
+
# @example Select rows by boolean filter
|
311
|
+
# penguins.slice(penguins[:bill_length_mm] > 50)
|
312
|
+
#
|
313
|
+
# # =>
|
314
|
+
# #<RedAmber::DataFrame : 52 x 8 Vectors, 0x000000000000fd98>
|
315
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
316
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
317
|
+
# 0 Chinstrap Dream 51.3 19.2 193 ... 2007
|
318
|
+
# 1 Chinstrap Dream 52.7 19.8 197 ... 2007
|
319
|
+
# 2 Chinstrap Dream 51.3 18.2 197 ... 2007
|
320
|
+
# 3 Chinstrap Dream 51.3 19.9 198 ... 2007
|
321
|
+
# 4 Chinstrap Dream 51.7 20.3 194 ... 2007
|
322
|
+
# : : : : : : ... :
|
323
|
+
# 49 Gentoo Biscoe 51.5 16.3 230 ... 2009
|
324
|
+
# 50 Gentoo Biscoe 55.1 16.0 230 ... 2009
|
325
|
+
# 51 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
326
|
+
#
|
327
|
+
# @overload slice
|
328
|
+
# Select records by filtering with block and return a DataFrame.
|
329
|
+
#
|
330
|
+
# @yieldparam self [DataFrame]
|
331
|
+
# gives self to the block.
|
332
|
+
# The block is evaluated within the context of self.
|
333
|
+
# @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
|
334
|
+
# a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
|
335
|
+
# @return [DataFrame]
|
336
|
+
# filtered records as a DataFrame.
|
337
|
+
# @example Select rows by booleans from block
|
338
|
+
# penguins.slice { indices.map(&:even?) }
|
339
|
+
#
|
340
|
+
# # =>
|
341
|
+
# #<RedAmber::DataFrame : 172 x 8 Vectors, 0x000000000000ff78>
|
342
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
343
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
344
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
345
|
+
# 1 Adelie Torgersen 40.3 18.0 195 ... 2007
|
346
|
+
# 2 Adelie Torgersen 36.7 19.3 193 ... 2007
|
347
|
+
# 3 Adelie Torgersen 38.9 17.8 181 ... 2007
|
348
|
+
# 4 Adelie Torgersen 34.1 18.1 193 ... 2007
|
349
|
+
# : : : : : : ... :
|
350
|
+
# 169 Gentoo Biscoe 47.2 13.7 214 ... 2009
|
351
|
+
# 170 Gentoo Biscoe 46.8 14.3 215 ... 2009
|
352
|
+
# 171 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
353
|
+
#
|
26
354
|
def slice(*args, &block)
|
27
|
-
|
355
|
+
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
356
|
+
|
28
357
|
if block
|
29
|
-
|
358
|
+
unless args.empty?
|
359
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.'
|
360
|
+
end
|
30
361
|
|
31
|
-
|
362
|
+
args = [instance_eval(&block)]
|
32
363
|
end
|
33
|
-
slicer.flatten!
|
34
364
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
365
|
+
arrow_array =
|
366
|
+
case args
|
367
|
+
in [] | [[]]
|
368
|
+
return remove_all_values
|
369
|
+
in [Vector => v]
|
370
|
+
v.data
|
371
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
372
|
+
aa
|
373
|
+
else
|
374
|
+
Arrow::Array.new(parse_args(args, size))
|
375
|
+
end
|
41
376
|
|
42
|
-
|
377
|
+
if arrow_array.numeric?
|
378
|
+
take(normalize_indices(arrow_array))
|
379
|
+
elsif arrow_array.boolean?
|
380
|
+
filter_by_array(arrow_array)
|
381
|
+
elsif arrow_array.to_a.compact.empty?
|
382
|
+
# Ruby 3.0.4 does not accept Arrow::Array#compact here. 2.7.6 and 3.1.2 are OK.
|
383
|
+
remove_all_values
|
384
|
+
else
|
385
|
+
raise DataFrameArgumentError, "invalid arguments: #{args}"
|
43
386
|
end
|
44
|
-
return take_by_array(vector) if vector.numeric?
|
45
|
-
|
46
|
-
raise DataFrameArgumentError, "Invalid argument #{slicer}"
|
47
387
|
end
|
48
388
|
|
389
|
+
# Select records by a column specified by a key
|
390
|
+
# and corresponding record with a block.
|
391
|
+
#
|
392
|
+
# @overload slice_by(key)
|
393
|
+
# Select records by elements.
|
394
|
+
#
|
395
|
+
# @param key [Symbol, String]
|
396
|
+
# a key to select column.
|
397
|
+
# @param keep_key [true, false]
|
398
|
+
# preserve column specified by key in the result if true.
|
399
|
+
# @yieldparam self [DataFrame]
|
400
|
+
# gives self to the block.
|
401
|
+
# The block is evaluated within the context of self.
|
402
|
+
# @yieldreturn [<elements>]
|
403
|
+
# array of elements to select.
|
404
|
+
# @return [DataFrame]
|
405
|
+
# selected records as a DataFrame.
|
406
|
+
# @example Select records by elements
|
407
|
+
# df
|
408
|
+
#
|
409
|
+
# # =>
|
410
|
+
# #<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000069e60>
|
411
|
+
# index float string
|
412
|
+
# <uint8> <double> <string>
|
413
|
+
# 0 0 0.0 A
|
414
|
+
# 1 1 1.1 B
|
415
|
+
# 2 2 2.2 C
|
416
|
+
# 3 3 NaN D
|
417
|
+
# 4 (nil) (nil) (nil)
|
418
|
+
#
|
419
|
+
# df.slice_by(:string) { ["A", "C"] }
|
420
|
+
#
|
421
|
+
# # =>
|
422
|
+
# #<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000001b1ac>
|
423
|
+
# index float
|
424
|
+
# <uint8> <double>
|
425
|
+
# 0 0 0.0
|
426
|
+
# 1 2 2.2
|
427
|
+
#
|
428
|
+
# @overload slice_by(key)
|
429
|
+
# Select records by elements range.
|
430
|
+
#
|
431
|
+
# @param key [Symbol, String]
|
432
|
+
# a key to select column.
|
433
|
+
# @param keep_key [true, false]
|
434
|
+
# preserve column specified by key in the result if true.
|
435
|
+
# @yieldparam self [DataFrame]
|
436
|
+
# gives self to the block.
|
437
|
+
# The block is evaluated within the context of self.
|
438
|
+
# @yieldreturn [Range]
|
439
|
+
# specifies position of elements at the start and the end and
|
440
|
+
# select records between them.
|
441
|
+
# @return [DataFrame]
|
442
|
+
# selected records as a DataFrame.
|
443
|
+
# @example Select records by elements range
|
444
|
+
# df.slice_by(:string) { "A".."C" }
|
445
|
+
#
|
446
|
+
# # =>
|
447
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000069668>
|
448
|
+
# index float
|
449
|
+
# <uint8> <double>
|
450
|
+
# 0 0 0.0
|
451
|
+
# 1 1 1.1
|
452
|
+
# 2 2 2.2
|
453
|
+
#
|
454
|
+
# @since 0.2.1
|
455
|
+
#
|
49
456
|
def slice_by(key, keep_key: false, &block)
|
50
457
|
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
51
458
|
raise DataFrameArgumentError, 'No block given' unless block
|
52
|
-
raise DataFrameArgumentError, "#{key} is
|
459
|
+
raise DataFrameArgumentError, "#{key} is not a key of self" unless key?(key)
|
53
460
|
return self if key.nil?
|
54
461
|
|
55
462
|
slicer = instance_eval(&block)
|
@@ -83,160 +490,414 @@ module RedAmber
|
|
83
490
|
slicer = slicer.map { |x| x.is_a?(String) ? self[key].index(x) : x }
|
84
491
|
end
|
85
492
|
|
86
|
-
|
87
|
-
|
88
|
-
else
|
89
|
-
take(slicer).drop(key)
|
90
|
-
end
|
493
|
+
taken = take(normalize_indices(Arrow::Array.new(slicer)))
|
494
|
+
keep_key ? taken : taken.drop(key)
|
91
495
|
end
|
92
496
|
|
93
|
-
#
|
94
|
-
|
95
|
-
|
497
|
+
# Select records by filtering with booleans to create a DataFrame.
|
498
|
+
#
|
499
|
+
# @overload filter(booleans)
|
500
|
+
# Select records by filtering with booleans and return a DataFrame.
|
501
|
+
#
|
502
|
+
# @param booleans [<Boolean, nil>, Vector, Arrow::Array]
|
503
|
+
# a boolean filter.
|
504
|
+
# @return [DataFrame]
|
505
|
+
# filtered records as a DataFrame.
|
506
|
+
# @example Filter by boolean Vector
|
507
|
+
# penguins
|
508
|
+
#
|
509
|
+
# # =>
|
510
|
+
# #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
|
511
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
512
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
513
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
514
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
515
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
516
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
517
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
518
|
+
# : : : : : : ... :
|
519
|
+
# 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
520
|
+
# 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
521
|
+
# 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
522
|
+
#
|
523
|
+
# penguins.filter(penguins.bill_length_mm < 50)
|
524
|
+
#
|
525
|
+
# # =>
|
526
|
+
# #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101a8>
|
527
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
528
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
529
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
530
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
531
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
532
|
+
# 3 Adelie Torgersen 36.7 19.3 193 ... 2007
|
533
|
+
# 4 Adelie Torgersen 39.3 20.6 190 ... 2007
|
534
|
+
# : : : : : : ... :
|
535
|
+
# 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
|
536
|
+
# 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
537
|
+
# 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
538
|
+
#
|
539
|
+
# @overload filter
|
540
|
+
# Select records by filtering with block and return a DataFrame.
|
541
|
+
#
|
542
|
+
# @yieldparam self [DataFrame]
|
543
|
+
# gives self to the block.
|
544
|
+
# The block is evaluated within the context of self.
|
545
|
+
# @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
|
546
|
+
# a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
|
547
|
+
# @return [DataFrame]
|
548
|
+
# filtered records as a DataFrame.
|
549
|
+
# @example Filter by boolean Vector from block
|
550
|
+
# penguins.filter { bill_length_mm < 50 }
|
551
|
+
#
|
552
|
+
# # =>
|
553
|
+
# #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101bc>
|
554
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
555
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
556
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
557
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
558
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
559
|
+
# 3 Adelie Torgersen 36.7 19.3 193 ... 2007
|
560
|
+
# 4 Adelie Torgersen 39.3 20.6 190 ... 2007
|
561
|
+
# : : : : : : ... :
|
562
|
+
# 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
|
563
|
+
# 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
564
|
+
# 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
565
|
+
#
|
566
|
+
def filter(*booleans, &block)
|
567
|
+
booleans.flatten!
|
568
|
+
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
569
|
+
|
96
570
|
if block
|
97
|
-
|
571
|
+
unless booleans.empty?
|
572
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.'
|
573
|
+
end
|
574
|
+
|
575
|
+
booleans = [instance_eval(&block)]
|
576
|
+
end
|
577
|
+
|
578
|
+
case booleans
|
579
|
+
in [] | [[]]
|
580
|
+
return remove_all_values
|
581
|
+
in [Vector => v] if v.boolean?
|
582
|
+
filter_by_array(v.data)
|
583
|
+
in [Arrow::ChunkedArray => ca] if ca.boolean?
|
584
|
+
filter_by_array(ca)
|
585
|
+
in [Arrow::BooleanArray => b]
|
586
|
+
filter_by_array(b)
|
587
|
+
else
|
588
|
+
a = Arrow::Array.new(parse_args(booleans, size))
|
589
|
+
unless a.boolean?
|
590
|
+
raise DataFrameArgumentError, "not a boolean filter: #{booleans}"
|
591
|
+
end
|
98
592
|
|
99
|
-
|
593
|
+
filter_by_array(a)
|
100
594
|
end
|
101
|
-
|
595
|
+
end
|
102
596
|
|
103
|
-
|
104
|
-
|
597
|
+
# Select records and remove them to create a remainder DataFrame.
|
598
|
+
#
|
599
|
+
# @overload remove(row)
|
600
|
+
# Select a record and remove it to create a remainder DataFrame.
|
601
|
+
# - The order of records in self will be preserved.
|
602
|
+
#
|
603
|
+
# @param row [Integer, Float]
|
604
|
+
# a row index to remove.
|
605
|
+
# @return [DataFrame]
|
606
|
+
# remainder records as a DataFrame.
|
607
|
+
# @example Remove a row
|
608
|
+
# penguins.remove(-1)
|
609
|
+
#
|
610
|
+
# # =>
|
611
|
+
# #<RedAmber::DataFrame : 343 x 8 Vectors, 0x0000000000010310>
|
612
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
613
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
614
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
615
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
616
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
617
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
618
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
619
|
+
# : : : : : : ... :
|
620
|
+
# 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
|
621
|
+
# 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
622
|
+
# 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
623
|
+
#
|
624
|
+
# @overload remove(rows)
|
625
|
+
# Select records and remove them to create a remainder DataFrame.
|
626
|
+
# - Duplicated selection is acceptable.
|
627
|
+
# - The order of records in self will be preserved.
|
628
|
+
#
|
629
|
+
# @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
|
630
|
+
# row indices to remove.
|
631
|
+
# @return [DataFrame]
|
632
|
+
# remainder records as a DataFrame.
|
633
|
+
# @example Remove rows
|
634
|
+
# penguins.remove(100..200)
|
635
|
+
#
|
636
|
+
# # =>
|
637
|
+
# #<RedAmber::DataFrame : 243 x 8 Vectors, 0x0000000000010450>
|
638
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
639
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
640
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
641
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
642
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
643
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
644
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
645
|
+
# : : : : : : ... :
|
646
|
+
# 240 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
647
|
+
# 241 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
648
|
+
# 242 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
649
|
+
#
|
650
|
+
# @overload remove
|
651
|
+
# Select records by indices from block
|
652
|
+
# and remove them to create a remainder DataFrame.
|
653
|
+
# - Duplicated selection is acceptable.
|
654
|
+
# - The order of records in self will be preserved.
|
655
|
+
#
|
656
|
+
# @yieldparam self [DataFrame]
|
657
|
+
# gives self to the block.
|
658
|
+
# The block is evaluated within the context of self.
|
659
|
+
# @yieldreturn [<Integer, Float>, Range<Integer>, Vector, Arrow::Array]
|
660
|
+
# row indices to remove.
|
661
|
+
# @return [DataFrame]
|
662
|
+
# remainder records as a DataFrame.
|
663
|
+
# @example Remove rows by indices from block
|
664
|
+
# penguins.remove { 0.step(size, 10) }
|
665
|
+
#
|
666
|
+
# # =>
|
667
|
+
# #<RedAmber::DataFrame : 309 x 8 Vectors, 0x00000000000104c8>
|
668
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
669
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
670
|
+
# 0 Adelie Torgersen 39.5 17.4 186 ... 2007
|
671
|
+
# 1 Adelie Torgersen 40.3 18.0 195 ... 2007
|
672
|
+
# 2 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
673
|
+
# 3 Adelie Torgersen 36.7 19.3 193 ... 2007
|
674
|
+
# 4 Adelie Torgersen 39.3 20.6 190 ... 2007
|
675
|
+
# : : : : : : ... :
|
676
|
+
# 306 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
677
|
+
# 307 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
678
|
+
# 308 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
679
|
+
#
|
680
|
+
# @overload remove(booleans)
|
681
|
+
# Select records by filtering with booleans and remove them to create a remainder DataFrame.
|
682
|
+
# - The order of records in self will be preserved.
|
683
|
+
#
|
684
|
+
# @param booleans [<Boolean, nil>, Vector, Arrow::Array]
|
685
|
+
# a boolean filter to remove.
|
686
|
+
# @return [DataFrame]
|
687
|
+
# remainder records as a DataFrame.
|
688
|
+
# @example Remove rows by boolean filter
|
689
|
+
# penguins.remove(penguins.bill_length_mm.is_nil)
|
690
|
+
#
|
691
|
+
# # =>
|
692
|
+
# #<RedAmber::DataFrame : 342 x 8 Vectors, 0x0000000000010234>
|
693
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
694
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
695
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
696
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
697
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
698
|
+
# 3 Adelie Torgersen 36.7 19.3 193 ... 2007
|
699
|
+
# 4 Adelie Torgersen 39.3 20.6 190 ... 2007
|
700
|
+
# : : : : : : ... :
|
701
|
+
# 339 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
702
|
+
# 340 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
703
|
+
# 341 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
704
|
+
#
|
705
|
+
# @overload remove
|
706
|
+
# Select records by booleans from block
|
707
|
+
# and remove them to create a remainder DataFrame.
|
708
|
+
# - The order of records in self will be preserved.
|
709
|
+
#
|
710
|
+
# @yieldparam self [DataFrame]
|
711
|
+
# gives self to the block.
|
712
|
+
# The block is evaluated within the context of self.
|
713
|
+
# @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
|
714
|
+
# a boolean filter to remove. `Vector` or `Arrow::Array` must be boolean type.
|
715
|
+
# @return [DataFrame]
|
716
|
+
# remainder records as a DataFrame.
|
717
|
+
# @example Remove rows by booleans from block
|
718
|
+
# penguins.remove { (species == 'Adelie') | (year == 2009) }
|
719
|
+
#
|
720
|
+
# # =>
|
721
|
+
# #<RedAmber::DataFrame : 124 x 8 Vectors, 0x00000000000102fc>
|
722
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
723
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
724
|
+
# 0 Chinstrap Dream 46.5 17.9 192 ... 2007
|
725
|
+
# 1 Chinstrap Dream 50.0 19.5 196 ... 2007
|
726
|
+
# 2 Chinstrap Dream 51.3 19.2 193 ... 2007
|
727
|
+
# 3 Chinstrap Dream 45.4 18.7 188 ... 2007
|
728
|
+
# 4 Chinstrap Dream 52.7 19.8 197 ... 2007
|
729
|
+
# : : : : : : ... :
|
730
|
+
# 121 Gentoo Biscoe 51.1 16.3 220 ... 2008
|
731
|
+
# 122 Gentoo Biscoe 45.2 13.8 215 ... 2008
|
732
|
+
# 123 Gentoo Biscoe 45.2 16.4 223 ... 2008
|
733
|
+
#
|
734
|
+
def remove(*args, &block)
|
735
|
+
raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
|
105
736
|
|
106
|
-
|
107
|
-
|
108
|
-
|
737
|
+
if block
|
738
|
+
unless args.empty?
|
739
|
+
raise DataFrameArgumentError, 'Must not specify both arguments and block.'
|
740
|
+
end
|
109
741
|
|
110
|
-
|
742
|
+
args = [instance_eval(&block)]
|
111
743
|
end
|
112
|
-
if vector.numeric?
|
113
|
-
raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
|
114
744
|
|
115
|
-
|
116
|
-
|
117
|
-
|
745
|
+
arrow_array =
|
746
|
+
case args
|
747
|
+
in [] | [[]] | [nil]
|
748
|
+
return self
|
749
|
+
in [Vector => v]
|
750
|
+
v.data
|
751
|
+
in [(Arrow::Array | Arrow::ChunkedArray) => aa]
|
752
|
+
aa
|
753
|
+
else
|
754
|
+
Arrow::Array.new(parse_args(args, size))
|
118
755
|
end
|
119
756
|
|
120
|
-
|
121
|
-
|
122
|
-
|
757
|
+
if arrow_array.boolean?
|
758
|
+
filter_by_array(arrow_array.primitive_invert)
|
759
|
+
elsif arrow_array.numeric?
|
760
|
+
remover = normalize_indices(arrow_array).to_a
|
761
|
+
return self if remover.empty?
|
123
762
|
|
124
|
-
|
763
|
+
slicer = indices.to_a - remover.map(&:to_i)
|
764
|
+
return remove_all_values if slicer.empty?
|
125
765
|
|
126
|
-
|
127
|
-
|
766
|
+
take(slicer)
|
767
|
+
else
|
768
|
+
raise DataFrameArgumentError, "Invalid argument #{args}"
|
128
769
|
end
|
129
|
-
|
130
|
-
raise DataFrameArgumentError, "Invalid argument #{remover}"
|
131
770
|
end
|
132
771
|
|
772
|
+
# Remove records (rows) that contain any nil.
|
773
|
+
#
|
774
|
+
# @return [DataFrame]
|
775
|
+
# a DataFrame with the nil-containing records removed.
|
776
|
+
# @example
|
777
|
+
# penguins.remove_nil
|
778
|
+
# # =>
|
779
|
+
# #<RedAmber::DataFrame : 333 x 8 Vectors, 0x00000000000039d0>
|
780
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
781
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
782
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
783
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
784
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
785
|
+
# 3 Adelie Torgersen 36.7 19.3 193 ... 2007
|
786
|
+
# 4 Adelie Torgersen 39.3 20.6 190 ... 2007
|
787
|
+
# : : : : : : ... :
|
788
|
+
# 330 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
789
|
+
# 331 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
790
|
+
# 332 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
791
|
+
#
|
133
792
|
def remove_nil
|
134
793
|
func = Arrow::Function.find(:drop_null)
|
135
|
-
DataFrame.
|
794
|
+
DataFrame.create(func.execute([table]).value)
|
136
795
|
end
|
137
796
|
alias_method :drop_nil, :remove_nil
|
138
797
|
|
139
|
-
# Select
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
variables[key.to_sym]
|
147
|
-
end
|
148
|
-
|
798
|
+
# Select records from the top.
|
799
|
+
#
|
800
|
+
# @param n_obs [Integer]
|
801
|
+
# number of records to select.
|
802
|
+
# @return [DataFrame]
|
803
|
+
#
|
149
804
|
def head(n_obs = 5)
|
150
805
|
raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
|
151
806
|
|
152
807
|
self[0...[n_obs, size].min]
|
153
808
|
end
|
154
809
|
|
810
|
+
# Select records from the end.
|
811
|
+
#
|
812
|
+
# @param n_obs [Integer]
|
813
|
+
# number of records to select.
|
814
|
+
# @return [DataFrame]
|
815
|
+
#
|
155
816
|
def tail(n_obs = 5)
|
156
817
|
raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
|
157
818
|
|
158
819
|
self[-[n_obs, size].min..]
|
159
820
|
end
|
160
821
|
|
822
|
+
# Select records from the top.
|
823
|
+
#
|
824
|
+
# @param n_obs [Integer]
|
825
|
+
# number of records to select.
|
826
|
+
# @return [DataFrame]
|
827
|
+
#
|
161
828
|
def first(n_obs = 1)
|
162
829
|
head(n_obs)
|
163
830
|
end
|
164
831
|
|
832
|
+
# Select records from the end.
|
833
|
+
#
|
834
|
+
# @param n_obs [Integer]
|
835
|
+
# number of records to select.
|
836
|
+
# @return [DataFrame]
|
837
|
+
#
|
165
838
|
def last(n_obs = 1)
|
166
839
|
tail(n_obs)
|
167
840
|
end
|
168
841
|
|
169
|
-
#
|
170
|
-
#
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
842
|
+
# Select records by index Array to create a DataFrame.
|
843
|
+
#
|
844
|
+
# - TODO: support for option `boundscheck: true`
|
845
|
+
# - Supports indices in an Arrow::UInt8, UInt16, UInt32, UInt64 or an Array
|
846
|
+
# - Negative index is not supported.
|
847
|
+
# @param index_array [<Integer>, Arrow::Array]
|
848
|
+
# row indices to select.
|
849
|
+
# @return [DataFrame]
|
850
|
+
# selected records as a DataFrame.
|
851
|
+
#
|
852
|
+
# @api private
|
853
|
+
#
|
854
|
+
def take(index_array)
|
855
|
+
DataFrame.create(@table.take(index_array))
|
179
856
|
end
|
180
857
|
|
181
|
-
#
|
182
|
-
# TODO: support for option {null_selection_behavior: :drop}
|
183
|
-
def filter(*booleans)
|
184
|
-
booleans.flatten!
|
185
|
-
return remove_all_values if booleans.empty?
|
186
|
-
|
187
|
-
b = booleans[0]
|
188
|
-
case b
|
189
|
-
when Vector
|
190
|
-
raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
|
191
|
-
|
192
|
-
filter_by_vector(b.data)
|
193
|
-
when Arrow::BooleanArray
|
194
|
-
filter_by_vector(b)
|
195
|
-
else
|
196
|
-
raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)
|
197
|
-
|
198
|
-
filter_by_vector(Arrow::BooleanArray.new(booleans))
|
199
|
-
end
|
200
|
-
end
|
858
|
+
# rubocop:enable Layout/LineLength
|
201
859
|
|
202
860
|
private
|
203
861
|
|
204
|
-
def
|
862
|
+
def select_variables_by_keys(keys)
|
205
863
|
if keys.one?
|
206
864
|
key = keys[0].to_sym
|
207
|
-
raise DataFrameArgumentError, "Key does not exist #{
|
865
|
+
raise DataFrameArgumentError, "Key does not exist: #{key}" unless key? key
|
208
866
|
|
209
867
|
variables[key]
|
868
|
+
# Vector.new(@table.find_column(*key).data)
|
210
869
|
else
|
211
|
-
|
870
|
+
check_duplicate_keys(keys)
|
871
|
+
DataFrame.create(@table.select_columns(*keys))
|
212
872
|
end
|
213
873
|
end
|
214
874
|
|
215
|
-
# Accepts indices by numeric
|
216
|
-
def
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
DataFrame.new(datum.value)
|
875
|
+
# Accepts indices as a numeric Arrow array; negative indices are offset by size.
|
876
|
+
def normalize_indices(arrow_array)
|
877
|
+
b = Arrow::Function.find(:less).execute([arrow_array, 0])
|
878
|
+
a = Arrow::Function.find(:add).execute([arrow_array, size])
|
879
|
+
r = Arrow::Function.find(:if_else).execute([b, a, arrow_array]).value
|
880
|
+
if r.float?
|
881
|
+
r = Arrow::Function.find(:floor).execute([r]).value
|
882
|
+
Arrow::UInt64ArrayBuilder.build(r)
|
883
|
+
else
|
884
|
+
r
|
885
|
+
end
|
227
886
|
end
|
228
887
|
|
229
|
-
# Accepts booleans by Arrow::BooleanArray
|
230
|
-
def
|
231
|
-
|
888
|
+
# Accepts booleans as an Arrow::BooleanArray or an Array
|
889
|
+
def filter_by_array(boolean_array)
|
890
|
+
unless boolean_array.length == size
|
891
|
+
raise DataFrameArgumentError, 'Booleans must be same size as self.'
|
892
|
+
end
|
232
893
|
|
233
894
|
datum = Arrow::Function.find(:filter).execute([table, boolean_array])
|
234
|
-
DataFrame.
|
895
|
+
DataFrame.create(datum.value)
|
235
896
|
end
|
236
897
|
|
237
898
|
# Returns a DataFrame with the same keys as self but without values.
|
238
899
|
def remove_all_values
|
239
|
-
|
900
|
+
filter_by_array(Arrow::BooleanArray.new([false] * size))
|
240
901
|
end
|
241
902
|
end
|
242
903
|
end
|