red_amber 0.2.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -51
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +203 -1
  5. data/Gemfile +2 -1
  6. data/LICENSE +1 -1
  7. data/README.md +61 -45
  8. data/benchmark/basic.yml +11 -4
  9. data/benchmark/combine.yml +3 -4
  10. data/benchmark/dataframe.yml +62 -0
  11. data/benchmark/group.yml +7 -1
  12. data/benchmark/reshape.yml +6 -2
  13. data/benchmark/vector.yml +63 -0
  14. data/doc/DataFrame.md +35 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +295 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +537 -68
  20. data/lib/red_amber/data_frame_combinable.rb +776 -123
  21. data/lib/red_amber/data_frame_displayable.rb +248 -18
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +81 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +216 -21
  25. data/lib/red_amber/data_frame_selectable.rb +781 -120
  26. data/lib/red_amber/data_frame_variable_operation.rb +561 -85
  27. data/lib/red_amber/group.rb +195 -21
  28. data/lib/red_amber/helper.rb +114 -32
  29. data/lib/red_amber/refinements.rb +206 -0
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +435 -58
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +321 -69
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +397 -24
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +15 -1
  39. data/red_amber.gemspec +4 -3
  40. metadata +19 -11
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -294
@@ -1,55 +1,462 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
- # select columns: [symbol] or [string]
7
- # select rows: [array of index], [range]
6
+ # Array, Arrow::Array and Arrow::ChunkedArray are refined
7
+ using RefineArray
8
+ using RefineArrayLike
9
+
10
+ # rubocop:disable Layout/LineLength
11
+
12
+ # Select variables or records.
13
+ #
14
+ # @overload [](key)
15
+ # Select single variable (column) and return as a Vetor.
16
+ #
17
+ # @param key [Symbol, String]
18
+ # key name to select.
19
+ # @return [Vector]
20
+ # selected variable as a Vector.
21
+ # @note DataFrame.v(key) is faster to create Vector from a variable.
22
+ # @example Select a column and return Vector
23
+ # penguins
24
+ #
25
+ # # =>
26
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
27
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
28
+ # <string> <string> <double> <double> <uint8> ... <uint16>
29
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
30
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
31
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
32
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
33
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
34
+ # : : : : : : ... :
35
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
36
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
37
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
38
+ #
39
+ # penguins[:bill_length_mm]
40
+ #
41
+ # # =>
42
+ # #<RedAmber::Vector(:double, size=344):0x00000000000104dc>
43
+ # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
44
+ #
45
+ # @overload [](keys)
46
+ # Select variables and return a DataFrame.
47
+ #
48
+ # @param keys [<Symbol, String>] key names to select.
49
+ # @return [DataFrame]
50
+ # selected variables as a DataFrame.
51
+ # @example Select columns
52
+ # penguins[:island, :bill_length_mm]
53
+ #
54
+ # # =>
55
+ # #<RedAmber::DataFrame : 344 x 2 Vectors, 0x00000000000104f0>
56
+ # island bill_length_mm
57
+ # <string> <double>
58
+ # 0 Torgersen 39.1
59
+ # 1 Torgersen 39.5
60
+ # 2 Torgersen 40.3
61
+ # 3 Torgersen (nil)
62
+ # 4 Torgersen 36.7
63
+ # : : :
64
+ # 341 Biscoe 50.4
65
+ # 342 Biscoe 45.2
66
+ # 343 Biscoe 49.9
67
+ #
68
+ # @overload [](index)
69
+ # Select a record and return a DataFrame.
70
+ #
71
+ # @param index [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
72
+ # index of a row to select.
73
+ # @return [DataFrame]
74
+ # selected variables as a DataFrame.
75
+ # @example Select a row
76
+ # penguins[0]
77
+ #
78
+ # # =>
79
+ # #<RedAmber::DataFrame : 1 x 8 Vectors, 0x0000000000010504>
80
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
81
+ # <string> <string> <double> <double> <uint8> ... <uint16>
82
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
83
+ #
84
+ # @overload [](indices)
85
+ # Select records by indices and return a DataFrame.
86
+ #
87
+ # @param indices [<Indeger>, <Float>, Range<Integer>, Vector, Arrow::Array>]
88
+ # indices of rows to select.
89
+ # @return [DataFrame]
90
+ # selected variables as a DataFrame.
91
+ # @example Select rows by indices
92
+ # penguins[0..100]
93
+ #
94
+ # # =>
95
+ # #<RedAmber::DataFrame : 101 x 8 Vectors, 0x00000000000105e0>
96
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
97
+ # <string> <string> <double> <double> <uint8> ... <uint16>
98
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
99
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
100
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
101
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
102
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
103
+ # : : : : : : ... :
104
+ # 98 Adelie Dream 33.1 16.1 178 ... 2008
105
+ # 99 Adelie Dream 43.2 18.5 192 ... 2008
106
+ # 100 Adelie Biscoe 35.0 17.9 192 ... 2009
107
+ #
108
+ # @overload [](booleans)
109
+ # Select records by booleans and return a DataFrame.
110
+ #
111
+ # @param booleans [Array<true, false, nil>, Vector, Arrow::Array>]
112
+ # booleans of rows to select.
113
+ # @return [DataFrame]
114
+ # selected variables as a DataFrame.
115
+ # @example Select rows by booleans
116
+ # penguins[penguins.species == 'Adelie']
117
+ #
118
+ # # =>
119
+ # #<RedAmber::DataFrame : 152 x 8 Vectors, 0x0000000000010658>
120
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
121
+ # <string> <string> <double> <double> <uint8> ... <uint16>
122
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
123
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
124
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
125
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
126
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
127
+ # : : : : : : ... :
128
+ # 149 Adelie Dream 37.8 18.1 193 ... 2009
129
+ # 150 Adelie Dream 36.0 17.1 187 ... 2009
130
+ # 151 Adelie Dream 41.5 18.5 201 ... 2009
131
+ #
8
132
  def [](*args)
9
- args.flatten!
10
- raise DataFrameArgumentError, 'Empty dataframe' if empty?
11
- return remove_all_values if args.empty? || args[0].nil?
133
+ raise DataFrameArgumentError, 'self is an empty dataframe' if empty?
134
+
135
+ case args
136
+ in [] | [nil]
137
+ return remove_all_values
138
+ in [(Symbol | String) => k] if key? k
139
+ return variables[k.to_sym]
140
+ in [Integer => i]
141
+ return take([i.negative? ? i + size : i])
142
+ in [Vector => v]
143
+ arrow_array = v.data
144
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
145
+ arrow_array = aa
146
+ else
147
+ a = parse_args(args, size)
148
+ return select_variables_by_keys(a) if a.symbol?
149
+ return take(normalize_indices(Arrow::Array.new(a))) if a.integer?
150
+ return remove_all_values if a.compact.empty?
151
+ return filter_by_array(Arrow::BooleanArray.new(a)) if a.boolean?
152
+
153
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
154
+ end
155
+
156
+ return take(normalize_indices(arrow_array)) if arrow_array.numeric?
157
+ return filter_by_array(arrow_array) if arrow_array.boolean?
12
158
 
13
- vector = parse_to_vector(args)
14
- if vector.boolean?
15
- return filter_by_vector(vector.data) if vector.size == size
159
+ a = arrow_array.to_a
160
+ return select_variables_by_keys(a) if a.symbol_or_string?
16
161
 
17
- raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
162
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
163
+ end
164
+
165
+ # Select a variable by String or Symbol and return as a Vector.
166
+ #
167
+ # @param key [Symbol, String]
168
+ # key name to select.
169
+ # @return [Vector]
170
+ # selected variable as a Vector.
171
+ # @note #v(key) is faster then #[](key).
172
+ # @example Select a column and return Vector
173
+ # penguins.v(:bill_length_mm)
174
+ #
175
+ # # =>
176
+ # #<RedAmber::Vector(:double, size=344):0x000000000000f140>
177
+ # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
178
+ #
179
+ def v(key)
180
+ unless key.is_a?(Symbol) || key.is_a?(String)
181
+ raise DataFrameArgumentError, "Key is not a Symbol or a String: [#{key}]"
18
182
  end
19
- return take_by_array(vector) if vector.numeric?
20
- return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.dictionary?
183
+ raise DataFrameArgumentError, "Key does not exist: [#{key}]" unless key? key
21
184
 
22
- raise DataFrameArgumentError, "Invalid argument: #{args}"
185
+ variables[key.to_sym]
23
186
  end
24
187
 
25
- # slice and select rows to create sub DataFrame
188
+ # Select records to create a DataFrame.
189
+ #
190
+ # @overload slice(row)
191
+ # Select a record and return a DataFrame.
192
+ #
193
+ # @param row [Indeger, Float]
194
+ # a row index to select.
195
+ # @return [DataFrame]
196
+ # selected records as a DataFrame.
197
+ # @example Select a row
198
+ # penguins
199
+ #
200
+ # # =>
201
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
202
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
203
+ # <string> <string> <double> <double> <uint8> ... <uint16>
204
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
205
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
206
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
207
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
208
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
209
+ # : : : : : : ... :
210
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
211
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
212
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
213
+ # penguins.slice(2)
214
+ #
215
+ # # =>
216
+ # #<RedAmber::DataFrame : 1 x 8 Vectors, 0x00000000000039d0>
217
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
218
+ # <string> <string> <double> <double> <uint8> ... <uint16>
219
+ # 0 Adelie Torgersen 40.3 18.0 195 ... 2007
220
+ #
221
+ # @overload slice(rows)
222
+ # Select records and return a DataFrame.
223
+ # - Duplicated selection is acceptable. The same record will be returned.
224
+ # - The order of records will be the same as specified indices.
225
+ #
226
+ # @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
227
+ # row indeces to select.
228
+ # @return [DataFrame]
229
+ # selected records as a DataFrame.
230
+ # @example Select rows
231
+ # penguins.slice(300..-1)
232
+ #
233
+ # # =>
234
+ # #<RedAmber::DataFrame : 44 x 8 Vectors, 0x000000000000fb54>
235
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
236
+ # <string> <string> <double> <double> <uint8> ... <uint16>
237
+ # 0 Gentoo Biscoe 49.1 14.5 212 ... 2009
238
+ # 1 Gentoo Biscoe 52.5 15.6 221 ... 2009
239
+ # 2 Gentoo Biscoe 47.4 14.6 212 ... 2009
240
+ # 3 Gentoo Biscoe 50.0 15.9 224 ... 2009
241
+ # 4 Gentoo Biscoe 44.9 13.8 212 ... 2009
242
+ # : : : : : : ... :
243
+ # 41 Gentoo Biscoe 50.4 15.7 222 ... 2009
244
+ # 42 Gentoo Biscoe 45.2 14.8 212 ... 2009
245
+ # 43 Gentoo Biscoe 49.9 16.1 213 ... 2009
246
+ #
247
+ # @overload slice(enumerator)
248
+ # Select records and return a DataFrame.
249
+ # - Duplicated selection is acceptable. The same record will be returned.
250
+ # - The order of records will be the same as specified indices.
251
+ #
252
+ # @param enumerator [Enumerator]
253
+ # an enumerator which returns row indeces to select.
254
+ # @return [DataFrame]
255
+ # selected records as a DataFrame.
256
+ # @example Select rows by Enumerator.
257
+ # penguins.assign_left(index: penguins.indices) # 0.2.0 feature
258
+ # .slice(0.step(by: 10, to: 340))
259
+ #
260
+ # # =>
261
+ # #<RedAmber::DataFrame : 35 x 9 Vectors, 0x000000000000f2e4>
262
+ # index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
263
+ # <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
264
+ # 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
265
+ # 1 10 Adelie Torgersen 37.8 17.1 186 ... 2007
266
+ # 2 20 Adelie Biscoe 37.8 18.3 174 ... 2007
267
+ # 3 30 Adelie Dream 39.5 16.7 178 ... 2007
268
+ # 4 40 Adelie Dream 36.5 18.0 182 ... 2007
269
+ # : : : : : : : ... :
270
+ # 32 320 Gentoo Biscoe 48.5 15.0 219 ... 2009
271
+ # 33 330 Gentoo Biscoe 50.5 15.2 216 ... 2009
272
+ # 34 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
273
+ #
274
+ # @overload slice
275
+ # Select records by indices with block and return a DataFrame.
276
+ # - Duplicated selection is acceptable. The same record will be returned.
277
+ # - The order of records will be the same as specified indices.
278
+ #
279
+ # @yieldparam self [DataFrame]
280
+ # gives self to the block.
281
+ # The block is evaluated within the context of self.
282
+ # @yieldreturn [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array, Enumerator]
283
+ # row indeces to select.
284
+ # @return [DataFrame]
285
+ # selected records as a DataFrame.
286
+ # @example Select rows by block
287
+ # penguins.assign_left(index: penguins.indices) # 0.2.0 feature
288
+ # .slice { 0.step(by: 100, to: 300).map { |i| i..(i+1) } }
289
+ #
290
+ # # =>
291
+ # #<RedAmber::DataFrame : 8 x 9 Vectors, 0x000000000000f3ac>
292
+ # index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
293
+ # <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
294
+ # 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
295
+ # 1 1 Adelie Torgersen 39.5 17.4 186 ... 2007
296
+ # 2 100 Adelie Biscoe 35.0 17.9 192 ... 2009
297
+ # 3 101 Adelie Biscoe 41.0 20.0 203 ... 2009
298
+ # 4 200 Chinstrap Dream 51.5 18.7 187 ... 2009
299
+ # 5 201 Chinstrap Dream 49.8 17.3 198 ... 2009
300
+ # 6 300 Gentoo Biscoe 49.1 14.5 212 ... 2009
301
+ # 7 301 Gentoo Biscoe 52.5 15.6 221 ... 2009
302
+ #
303
+ # @overload slice(booleans)
304
+ # Select records by filtering with booleans and return a DataFrame.
305
+ #
306
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
307
+ # a boolean filter.
308
+ # @return [DataFrame]
309
+ # filtered records as a DataFrame.
310
+ # @example Select rows by boolean filter
311
+ # penguins.slice(penguins[:bill_length_mm] > 50)
312
+ #
313
+ # # =>
314
+ # #<RedAmber::DataFrame : 52 x 8 Vectors, 0x000000000000fd98>
315
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
316
+ # <string> <string> <double> <double> <uint8> ... <uint16>
317
+ # 0 Chinstrap Dream 51.3 19.2 193 ... 2007
318
+ # 1 Chinstrap Dream 52.7 19.8 197 ... 2007
319
+ # 2 Chinstrap Dream 51.3 18.2 197 ... 2007
320
+ # 3 Chinstrap Dream 51.3 19.9 198 ... 2007
321
+ # 4 Chinstrap Dream 51.7 20.3 194 ... 2007
322
+ # : : : : : : ... :
323
+ # 49 Gentoo Biscoe 51.5 16.3 230 ... 2009
324
+ # 50 Gentoo Biscoe 55.1 16.0 230 ... 2009
325
+ # 51 Gentoo Biscoe 50.4 15.7 222 ... 2009
326
+ #
327
+ # @overload slice
328
+ # Select records by filtering with block and return a DataFrame.
329
+ #
330
+ # @yieldparam self [DataFrame]
331
+ # gives self to the block.
332
+ # The block is evaluated within the context of self.
333
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
334
+ # a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
335
+ # @return [DataFrame]
336
+ # filtered records as a DataFrame.
337
+ # @example Select rows by booleans from block
338
+ # penguins.slice { indices.map(&:even?) }
339
+ #
340
+ # # =>
341
+ # #<RedAmber::DataFrame : 172 x 8 Vectors, 0x000000000000ff78>
342
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
343
+ # <string> <string> <double> <double> <uint8> ... <uint16>
344
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
345
+ # 1 Adelie Torgersen 40.3 18.0 195 ... 2007
346
+ # 2 Adelie Torgersen 36.7 19.3 193 ... 2007
347
+ # 3 Adelie Torgersen 38.9 17.8 181 ... 2007
348
+ # 4 Adelie Torgersen 34.1 18.1 193 ... 2007
349
+ # : : : : : : ... :
350
+ # 169 Gentoo Biscoe 47.2 13.7 214 ... 2009
351
+ # 170 Gentoo Biscoe 46.8 14.3 215 ... 2009
352
+ # 171 Gentoo Biscoe 45.2 14.8 212 ... 2009
353
+ #
26
354
  def slice(*args, &block)
27
- slicer = args
355
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
356
+
28
357
  if block
29
- raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
358
+ unless args.empty?
359
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
360
+ end
30
361
 
31
- slicer = [instance_eval(&block)]
362
+ args = [instance_eval(&block)]
32
363
  end
33
- slicer.flatten!
34
364
 
35
- raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
36
- return remove_all_values if slicer.empty? || slicer[0].nil?
37
-
38
- vector = parse_to_vector(slicer)
39
- if vector.boolean?
40
- return filter_by_vector(vector.data) if vector.size == size
365
+ arrow_array =
366
+ case args
367
+ in [] | [[]]
368
+ return remove_all_values
369
+ in [Vector => v]
370
+ v.data
371
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
372
+ aa
373
+ else
374
+ Arrow::Array.new(parse_args(args, size))
375
+ end
41
376
 
42
- raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}"
377
+ if arrow_array.numeric?
378
+ take(normalize_indices(arrow_array))
379
+ elsif arrow_array.boolean?
380
+ filter_by_array(arrow_array)
381
+ elsif arrow_array.to_a.compact.empty?
382
+ # Ruby 3.0.4 does not accept Arrow::Array#compact here. 2.7.6 and 3.1.2 is OK.
383
+ remove_all_values
384
+ else
385
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
43
386
  end
44
- return take_by_array(vector) if vector.numeric?
45
-
46
- raise DataFrameArgumentError, "Invalid argument #{slicer}"
47
387
  end
48
388
 
389
+ # Select records by a column specified by a key
390
+ # and corresponding record with a block.
391
+ #
392
+ # @overload slice_by(key)
393
+ # Select records by elements.
394
+ #
395
+ # @param key [Symbol, String]
396
+ # a key to select column.
397
+ # @param keep_key [true, false]
398
+ # preserve column specified by key in the result if true.
399
+ # @yieldparam self [DataFrame]
400
+ # gives self to the block.
401
+ # The block is evaluated within the context of self.
402
+ # @yieldreturn [<elements>]
403
+ # array of elements to select.
404
+ # @return [DataFrame]
405
+ # selected records as a DataFrame.
406
+ # @example Select records by elements
407
+ # df
408
+ #
409
+ # # =>
410
+ # #<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000069e60>
411
+ # index float string
412
+ # <uint8> <double> <string>
413
+ # 0 0 0.0 A
414
+ # 1 1 1.1 B
415
+ # 2 2 2.2 C
416
+ # 3 3 NaN D
417
+ # 4 (nil) (nil) (nil)
418
+ #
419
+ # df.slice_by(:string) { ["A", "C"] }
420
+ #
421
+ # # =>
422
+ # #<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000001b1ac>
423
+ # index float
424
+ # <uint8> <double>
425
+ # 0 0 0.0
426
+ # 1 2 2.2
427
+ #
428
+ # @overload slice_by(key)
429
+ # Select records by elements range.
430
+ #
431
+ # @param key [Symbol, String]
432
+ # a key to select column.
433
+ # @param keep_key [true, false]
434
+ # preserve column specified by key in the result if true.
435
+ # @yieldparam self [DataFrame]
436
+ # gives self to the block.
437
+ # The block is evaluated within the context of self.
438
+ # @yieldreturn [Range]
439
+ # specifies position of elements at the start and the end and
440
+ # select records between them.
441
+ # @return [DataFrame]
442
+ # selected records as a DataFrame.
443
+ # @example Select records by elements range
444
+ # df.slice_by(:string) { "A".."C" }
445
+ #
446
+ # # =>
447
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000069668>
448
+ # index float
449
+ # <uint8> <double>
450
+ # 0 0 0.0
451
+ # 1 1 1.1
452
+ # 2 2 2.2
453
+ #
454
+ # @since 0.2.1
455
+ #
49
456
  def slice_by(key, keep_key: false, &block)
50
457
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
51
458
  raise DataFrameArgumentError, 'No block given' unless block
52
- raise DataFrameArgumentError, "#{key} is no a key of self" unless key?(key)
459
+ raise DataFrameArgumentError, "#{key} is not a key of self" unless key?(key)
53
460
  return self if key.nil?
54
461
 
55
462
  slicer = instance_eval(&block)
@@ -83,160 +490,414 @@ module RedAmber
83
490
  slicer = slicer.map { |x| x.is_a?(String) ? self[key].index(x) : x }
84
491
  end
85
492
 
86
- if keep_key
87
- take(slicer)
88
- else
89
- take(slicer).drop(key)
90
- end
493
+ taken = take(normalize_indices(Arrow::Array.new(slicer)))
494
+ keep_key ? taken : taken.drop(key)
91
495
  end
92
496
 
93
- # remove selected rows to create remainer DataFrame
94
- def remove(*args, &block)
95
- remover = args
497
+ # Select records by filtering with booleans to create a DataFrame.
498
+ #
499
+ # @overload filter(booleans)
500
+ # Select records by filtering with booleans and return a DataFrame.
501
+ #
502
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
503
+ # a boolean filter.
504
+ # @return [DataFrame]
505
+ # filtered records as a DataFrame.
506
+ # @example Filter by boolean Vector
507
+ # penguins
508
+ #
509
+ # # =>
510
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
511
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
512
+ # <string> <string> <double> <double> <uint8> ... <uint16>
513
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
514
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
515
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
516
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
517
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
518
+ # : : : : : : ... :
519
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
520
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
521
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
522
+ #
523
+ # penguins.filter(penguins.bill_length_mm < 50)
524
+ #
525
+ # # =>
526
+ # #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101a8>
527
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
528
+ # <string> <string> <double> <double> <uint8> ... <uint16>
529
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
530
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
531
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
532
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
533
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
534
+ # : : : : : : ... :
535
+ # 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
536
+ # 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
537
+ # 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
538
+ #
539
+ # @overload filter
540
+ # Select records by filtering with block and return a DataFrame.
541
+ #
542
+ # @yieldparam self [DataFrame]
543
+ # gives self to the block.
544
+ # The block is evaluated within the context of self.
545
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
546
+ # a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
547
+ # @return [DataFrame]
548
+ # filtered records as a DataFrame.
549
+ # @example Filter by boolean Vector
550
+ # penguins.filter { bill_length_mm < 50 }
551
+ #
552
+ # # =>
553
+ # #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101bc>
554
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
555
+ # <string> <string> <double> <double> <uint8> ... <uint16>
556
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
557
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
558
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
559
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
560
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
561
+ # : : : : : : ... :
562
+ # 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
563
+ # 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
564
+ # 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
565
+ #
566
+ def filter(*booleans, &block)
567
+ booleans.flatten!
568
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
569
+
96
570
  if block
97
- raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
571
+ unless booleans.empty?
572
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
573
+ end
574
+
575
+ booleans = [instance_eval(&block)]
576
+ end
577
+
578
+ case booleans
579
+ in [] | [[]]
580
+ return remove_all_values
581
+ in [Vector => v] if v.boolean?
582
+ filter_by_array(v.data)
583
+ in [Arrow::ChunkedArray => ca] if ca.boolean?
584
+ filter_by_array(ca)
585
+ in [Arrow::BooleanArray => b]
586
+ filter_by_array(b)
587
+ else
588
+ a = Arrow::Array.new(parse_args(booleans, size))
589
+ unless a.boolean?
590
+ raise DataFrameArgumentError, "not a boolean filter: #{booleans}"
591
+ end
98
592
 
99
- remover = [instance_eval(&block)]
593
+ filter_by_array(a)
100
594
  end
101
- remover.flatten!
595
+ end
102
596
 
103
- raise DataFrameArgumentError, 'Empty dataframe' if empty?
104
- return self if remover.empty? || remover[0].nil?
597
+ # Select records and remove them to create a remainer DataFrame.
598
+ #
599
+ # @overload remove(row)
600
+ # Select a record and remove it to create a remainer DataFrame.
601
+ # - The order of records in self will be preserved.
602
+ #
603
+ # @param row [Indeger, Float]
604
+ # a row index to remove.
605
+ # @return [DataFrame]
606
+ # remainer variables as a DataFrame.
607
+ # @example Remove a row
608
+ # penguins.remove(-1)
609
+ #
610
+ # # =>
611
+ # #<RedAmber::DataFrame : 343 x 8 Vectors, 0x0000000000010310>
612
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
613
+ # <string> <string> <double> <double> <uint8> ... <uint16>
614
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
615
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
616
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
617
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
618
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
619
+ # : : : : : : ... :
620
+ # 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
621
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
622
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
623
+ #
624
+ # @overload remove(rows)
625
+ # Select records and remove them to create a remainer DataFrame.
626
+ # - Duplicated selection is acceptable.
627
+ # - The order of records in self will be preserved.
628
+ #
629
+ # @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
630
+ # row indices to remove.
631
+ # @return [DataFrame]
632
+ # remainer variables as a DataFrame.
633
+ # @example Remove rows
634
+ # penguins.remove(100..200)
635
+ #
636
+ # # =>
637
+ # #<RedAmber::DataFrame : 243 x 8 Vectors, 0x0000000000010450>
638
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
639
+ # <string> <string> <double> <double> <uint8> ... <uint16>
640
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
641
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
642
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
643
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
644
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
645
+ # : : : : : : ... :
646
+ # 240 Gentoo Biscoe 50.4 15.7 222 ... 2009
647
+ # 241 Gentoo Biscoe 45.2 14.8 212 ... 2009
648
+ # 242 Gentoo Biscoe 49.9 16.1 213 ... 2009
649
+ #
650
+ # @overload remove
651
+ # Select records by indices from block
652
+ # and remove them to create a remainer DataFrame.
653
+ # - Duplicated selection is acceptable.
654
+ # - The order of records in self will be preserved.
655
+ #
656
+ # @yieldparam self [DataFrame]
657
+ # gives self to the block.
658
+ # The block is evaluated within the context of self.
659
+ # @yieldreturn [<Integer, Float>, Range<Integer>, Vector, Arrow::Array]
660
+ # row indeces to remove.
661
+ # @return [DataFrame]
662
+ # remainer variables as a DataFrame.
663
+ # @example Remove rows by indices from block
664
+ # penguins.remove { 0.step(size, 10) }
665
+ #
666
+ # # =>
667
+ # #<RedAmber::DataFrame : 309 x 8 Vectors, 0x00000000000104c8>
668
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
669
+ # <string> <string> <double> <double> <uint8> ... <uint16>
670
+ # 0 Adelie Torgersen 39.5 17.4 186 ... 2007
671
+ # 1 Adelie Torgersen 40.3 18.0 195 ... 2007
672
+ # 2 Adelie Torgersen (nil) (nil) (nil) ... 2007
673
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
674
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
675
+ # : : : : : : ... :
676
+ # 306 Gentoo Biscoe 50.4 15.7 222 ... 2009
677
+ # 307 Gentoo Biscoe 45.2 14.8 212 ... 2009
678
+ # 308 Gentoo Biscoe 49.9 16.1 213 ... 2009
679
+ #
680
+ # @overload remove(booleans)
681
+ # Select records by filtering with booleans and return a DataFrame.
682
+ # - The order of records in self will be preserved.
683
+ #
684
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
685
+ # a boolean filter to remove.
686
+ # @return [DataFrame]
687
+ # remainer records as a DataFrame.
688
+ # @example Remove rows by boolean filter
689
+ # penguins.remove(penguins.bill_length_mm.is_nil)
690
+ #
691
+ # # =>
692
+ # #<RedAmber::DataFrame : 342 x 8 Vectors, 0x0000000000010234>
693
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
694
+ # <string> <string> <double> <double> <uint8> ... <uint16>
695
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
696
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
697
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
698
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
699
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
700
+ # : : : : : : ... :
701
+ # 339 Gentoo Biscoe 50.4 15.7 222 ... 2009
702
+ # 340 Gentoo Biscoe 45.2 14.8 212 ... 2009
703
+ # 341 Gentoo Biscoe 49.9 16.1 213 ... 2009
704
+ #
705
+ # @overload remove
706
+ # Select records by booleans from block
707
+ # and remove them to create a remainer DataFrame.
708
+ # - The order of records in self will be preserved.
709
+ #
710
+ # @yieldparam self [DataFrame]
711
+ # gives self to the block.
712
+ # The block is evaluated within the context of self.
713
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
714
+ # a boolean filter to remove. `Vector` or `Arrow::Array` must be boolean type.
715
+ # @return [DataFrame]
716
+ # remainer records as a DataFrame.
717
+ # @example Remove rows by booleans from block
718
+ # penguins.remove { (species == 'Adelie') | (year == 2009) }
719
+ #
720
+ # # =>
721
+ # #<RedAmber::DataFrame : 124 x 8 Vectors, 0x00000000000102fc>
722
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
723
+ # <string> <string> <double> <double> <uint8> ... <uint16>
724
+ # 0 Chinstrap Dream 46.5 17.9 192 ... 2007
725
+ # 1 Chinstrap Dream 50.0 19.5 196 ... 2007
726
+ # 2 Chinstrap Dream 51.3 19.2 193 ... 2007
727
+ # 3 Chinstrap Dream 45.4 18.7 188 ... 2007
728
+ # 4 Chinstrap Dream 52.7 19.8 197 ... 2007
729
+ # : : : : : : ... :
730
+ # 121 Gentoo Biscoe 51.1 16.3 220 ... 2008
731
+ # 122 Gentoo Biscoe 45.2 13.8 215 ... 2008
732
+ # 123 Gentoo Biscoe 45.2 16.4 223 ... 2008
733
+ #
734
+ def remove(*args, &block)
735
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
105
736
 
106
- vector = parse_to_vector(remover)
107
- if vector.boolean?
108
- return filter_by_vector(vector.primitive_invert.data) if vector.size == size
737
+ if block
738
+ unless args.empty?
739
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
740
+ end
109
741
 
110
- raise DataFrameArgumentError, "Size is not match in booleans: #{remover}"
742
+ args = [instance_eval(&block)]
111
743
  end
112
- if vector.numeric?
113
- raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
114
744
 
115
- normalized_indices = (vector < 0).if_else(vector + size, vector) # normalize index from tail
116
- if normalized_indices.max >= size
117
- raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}"
745
+ arrow_array =
746
+ case args
747
+ in [] | [[]] | [nil]
748
+ return self
749
+ in [Vector => v]
750
+ v.data
751
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
752
+ aa
753
+ else
754
+ Arrow::Array.new(parse_args(args, size))
118
755
  end
119
756
 
120
- normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array
121
- return remove_all_values if normalized_indices == indices.to_a
122
- return self if normalized_indices.empty?
757
+ if arrow_array.boolean?
758
+ filter_by_array(arrow_array.primitive_invert)
759
+ elsif arrow_array.numeric?
760
+ remover = normalize_indices(arrow_array).to_a
761
+ return self if remover.empty?
123
762
 
124
- index_array = indices.to_a - normalized_indices
763
+ slicer = indices.to_a - remover.map(&:to_i)
764
+ return remove_all_values if slicer.empty?
125
765
 
126
- datum = Arrow::Function.find(:take).execute([table, index_array])
127
- return DataFrame.new(datum.value)
766
+ take(slicer)
767
+ else
768
+ raise DataFrameArgumentError, "Invalid argument #{args}"
128
769
  end
129
-
130
- raise DataFrameArgumentError, "Invalid argument #{remover}"
131
770
  end
132
771
 
772
+ # Remove records (rows) that contain any nil.
773
+ #
774
+ # @return [DataFrame]
775
+ # a DataFrame without the records that contain nil.
776
+ # @example
777
+ # penguins.remove_nil
778
+ # # =>
779
+ # #<RedAmber::DataFrame : 333 x 8 Vectors, 0x00000000000039d0>
780
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
781
+ # <string> <string> <double> <double> <uint8> ... <uint16>
782
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
783
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
784
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
785
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
786
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
787
+ # : : : : : : ... :
788
+ # 330 Gentoo Biscoe 50.4 15.7 222 ... 2009
789
+ # 331 Gentoo Biscoe 45.2 14.8 212 ... 2009
790
+ # 332 Gentoo Biscoe 49.9 16.1 213 ... 2009
791
+ #
133
792
  def remove_nil
134
793
  func = Arrow::Function.find(:drop_null)
135
- DataFrame.new(func.execute([table]).value)
794
+ DataFrame.create(func.execute([table]).value)
136
795
  end
137
796
  alias_method :drop_nil, :remove_nil
138
797
 
139
- # Select a variable by a key in String or Symbol
140
- def v(key)
141
- unless key.is_a?(Symbol) || key.is_a?(String)
142
- raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
143
- end
144
- raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
145
-
146
- variables[key.to_sym]
147
- end
148
-
798
+ # Select records from the top.
799
+ #
800
+ # @param n_obs [Integer]
801
+ # number of records to select.
802
+ # @return [DataFrame]
803
+ #
149
804
  def head(n_obs = 5)
150
805
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
151
806
 
152
807
  self[0...[n_obs, size].min]
153
808
  end
154
809
 
810
+ # Select records from the end.
811
+ #
812
+ # @param n_obs [Integer]
813
+ # number of records to select.
814
+ # @return [DataFrame]
815
+ #
155
816
  def tail(n_obs = 5)
156
817
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
157
818
 
158
819
  self[-[n_obs, size].min..]
159
820
  end
160
821
 
822
+ # Select records from the top.
823
+ #
824
+ # @param n_obs [Integer]
825
+ # number of records to select.
826
+ # @return [DataFrame]
827
+ #
161
828
  def first(n_obs = 1)
162
829
  head(n_obs)
163
830
  end
164
831
 
832
+ # Select records from the end.
833
+ #
834
+ # @param n_obs [Integer]
835
+ # number of records to select.
836
+ # @return [DataFrame]
837
+ #
165
838
  def last(n_obs = 1)
166
839
  tail(n_obs)
167
840
  end
168
841
 
169
- # Undocumented
170
- # TODO: support for option {boundscheck: true}
171
- def take(*arg_indices)
172
- arg_indices.flatten!
173
- return remove_all_values if arg_indices.empty?
174
-
175
- arg_indices = arg_indices[0] if arg_indices.one? && !arg_indices[0].is_a?(Numeric)
176
- arg_indices = Vector.new(arg_indices) unless arg_indices.is_a?(Vector)
177
-
178
- take_by_array(arg_indices)
842
+ # Select records by index Array to create a DataFrame.
843
+ #
844
+ # - TODO: support for option `boundscheck: true`
845
+ # - Supports indices in an Arrow::UInt8, UInt16, UInt32, UInt64 or an Array
846
+ # - Negative index is not supported.
847
+ # @param index_array [<Integer>, Arrow::Array]
848
+ # row indices to select.
849
+ # @return [DataFrame]
850
+ # selected records as a DataFrame.
851
+ #
852
+ # @api private
853
+ #
854
+ def take(index_array)
855
+ DataFrame.create(@table.take(index_array))
179
856
  end
180
857
 
181
- # Undocumented
182
- # TODO: support for option {null_selection_behavior: :drop}
183
- def filter(*booleans)
184
- booleans.flatten!
185
- return remove_all_values if booleans.empty?
186
-
187
- b = booleans[0]
188
- case b
189
- when Vector
190
- raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
191
-
192
- filter_by_vector(b.data)
193
- when Arrow::BooleanArray
194
- filter_by_vector(b)
195
- else
196
- raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)
197
-
198
- filter_by_vector(Arrow::BooleanArray.new(booleans))
199
- end
200
- end
858
+ # rubocop:enable Layout/LineLength
201
859
 
202
860
  private
203
861
 
204
- def select_vars_by_keys(keys)
862
+ def select_variables_by_keys(keys)
205
863
  if keys.one?
206
864
  key = keys[0].to_sym
207
- raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
865
+ raise DataFrameArgumentError, "Key does not exist: #{key}" unless key? key
208
866
 
209
867
  variables[key]
868
+ # Vector.new(@table.find_column(*key).data)
210
869
  else
211
- DataFrame.new(@table[keys])
870
+ check_duplicate_keys(keys)
871
+ DataFrame.create(@table.select_columns(*keys))
212
872
  end
213
873
  end
214
874
 
215
- # Accepts indices by numeric Vector
216
- def take_by_array(indices)
217
- raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric?
218
- raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
219
-
220
- normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
221
- raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
222
-
223
- index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
224
-
225
- datum = Arrow::Function.find(:take).execute([table, index_array])
226
- DataFrame.new(datum.value)
875
+ # Accepts indices by numeric arrow array and returns positive indices.
876
+ def normalize_indices(arrow_array)
877
+ b = Arrow::Function.find(:less).execute([arrow_array, 0])
878
+ a = Arrow::Function.find(:add).execute([arrow_array, size])
879
+ r = Arrow::Function.find(:if_else).execute([b, a, arrow_array]).value
880
+ if r.float?
881
+ r = Arrow::Function.find(:floor).execute([r]).value
882
+ Arrow::UInt64ArrayBuilder.build(r)
883
+ else
884
+ r
885
+ end
227
886
  end
228
887
 
229
- # Accepts booleans by Arrow::BooleanArray
230
- def filter_by_vector(boolean_array)
231
- raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
888
+ # Accepts booleans by an Arrow::BooleanArray or an Array
889
+ def filter_by_array(boolean_array)
890
+ unless boolean_array.length == size
891
+ raise DataFrameArgumentError, 'Booleans must be same size as self.'
892
+ end
232
893
 
233
894
  datum = Arrow::Function.find(:filter).execute([table, boolean_array])
234
- DataFrame.new(datum.value)
895
+ DataFrame.create(datum.value)
235
896
  end
236
897
 
237
898
  # return a DataFrame with same keys as self without values
238
899
  def remove_all_values
239
- filter_by_vector(Arrow::BooleanArray.new([false] * size))
900
+ filter_by_array(Arrow::BooleanArray.new([false] * size))
240
901
  end
241
902
  end
242
903
  end