red_amber 0.2.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -51
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +203 -1
  5. data/Gemfile +2 -1
  6. data/LICENSE +1 -1
  7. data/README.md +61 -45
  8. data/benchmark/basic.yml +11 -4
  9. data/benchmark/combine.yml +3 -4
  10. data/benchmark/dataframe.yml +62 -0
  11. data/benchmark/group.yml +7 -1
  12. data/benchmark/reshape.yml +6 -2
  13. data/benchmark/vector.yml +63 -0
  14. data/doc/DataFrame.md +35 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +295 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +537 -68
  20. data/lib/red_amber/data_frame_combinable.rb +776 -123
  21. data/lib/red_amber/data_frame_displayable.rb +248 -18
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +81 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +216 -21
  25. data/lib/red_amber/data_frame_selectable.rb +781 -120
  26. data/lib/red_amber/data_frame_variable_operation.rb +561 -85
  27. data/lib/red_amber/group.rb +195 -21
  28. data/lib/red_amber/helper.rb +114 -32
  29. data/lib/red_amber/refinements.rb +206 -0
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +435 -58
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +321 -69
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +397 -24
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +15 -1
  39. data/red_amber.gemspec +4 -3
  40. metadata +19 -11
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -294
@@ -1,55 +1,462 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
- # select columns: [symbol] or [string]
7
- # select rows: [array of index], [range]
6
+ # Array, Arrow::Array and Arrow::ChunkedArray are refined
7
+ using RefineArray
8
+ using RefineArrayLike
9
+
10
+ # rubocop:disable Layout/LineLength
11
+
12
+ # Select variables or records.
13
+ #
14
+ # @overload [](key)
15
+ # Select a single variable (column) and return it as a Vector.
16
+ #
17
+ # @param key [Symbol, String]
18
+ # key name to select.
19
+ # @return [Vector]
20
+ # selected variable as a Vector.
21
+ # @note DataFrame#v(key) is a faster way to create a Vector from a variable.
22
+ # @example Select a column and return Vector
23
+ # penguins
24
+ #
25
+ # # =>
26
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
27
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
28
+ # <string> <string> <double> <double> <uint8> ... <uint16>
29
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
30
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
31
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
32
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
33
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
34
+ # : : : : : : ... :
35
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
36
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
37
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
38
+ #
39
+ # penguins[:bill_length_mm]
40
+ #
41
+ # # =>
42
+ # #<RedAmber::Vector(:double, size=344):0x00000000000104dc>
43
+ # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
44
+ #
45
+ # @overload [](keys)
46
+ # Select variables and return a DataFrame.
47
+ #
48
+ # @param keys [<Symbol, String>] key names to select.
49
+ # @return [DataFrame]
50
+ # selected variables as a DataFrame.
51
+ # @example Select columns
52
+ # penguins[:island, :bill_length_mm]
53
+ #
54
+ # # =>
55
+ # #<RedAmber::DataFrame : 344 x 2 Vectors, 0x00000000000104f0>
56
+ # island bill_length_mm
57
+ # <string> <double>
58
+ # 0 Torgersen 39.1
59
+ # 1 Torgersen 39.5
60
+ # 2 Torgersen 40.3
61
+ # 3 Torgersen (nil)
62
+ # 4 Torgersen 36.7
63
+ # : : :
64
+ # 341 Biscoe 50.4
65
+ # 342 Biscoe 45.2
66
+ # 343 Biscoe 49.9
67
+ #
68
+ # @overload [](index)
69
+ # Select a record and return a DataFrame.
70
+ #
71
+ # @param index [Integer, Float, Range<Integer>, Vector, Arrow::Array]
72
+ # index of a row to select.
73
+ # @return [DataFrame]
74
+ # selected record as a DataFrame.
75
+ # @example Select a row
76
+ # penguins[0]
77
+ #
78
+ # # =>
79
+ # #<RedAmber::DataFrame : 1 x 8 Vectors, 0x0000000000010504>
80
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
81
+ # <string> <string> <double> <double> <uint8> ... <uint16>
82
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
83
+ #
84
+ # @overload [](indices)
85
+ # Select records by indices and return a DataFrame.
86
+ #
87
+ # @param indices [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
88
+ # indices of rows to select.
89
+ # @return [DataFrame]
90
+ # selected records as a DataFrame.
91
+ # @example Select rows by indices
92
+ # penguins[0..100]
93
+ #
94
+ # # =>
95
+ # #<RedAmber::DataFrame : 101 x 8 Vectors, 0x00000000000105e0>
96
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
97
+ # <string> <string> <double> <double> <uint8> ... <uint16>
98
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
99
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
100
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
101
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
102
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
103
+ # : : : : : : ... :
104
+ # 98 Adelie Dream 33.1 16.1 178 ... 2008
105
+ # 99 Adelie Dream 43.2 18.5 192 ... 2008
106
+ # 100 Adelie Biscoe 35.0 17.9 192 ... 2009
107
+ #
108
+ # @overload [](booleans)
109
+ # Select records by booleans and return a DataFrame.
110
+ #
111
+ # @param booleans [Array<true, false, nil>, Vector, Arrow::Array]
112
+ # booleans of rows to select.
113
+ # @return [DataFrame]
114
+ # selected records as a DataFrame.
115
+ # @example Select rows by booleans
116
+ # penguins[penguins.species == 'Adelie']
117
+ #
118
+ # # =>
119
+ # #<RedAmber::DataFrame : 152 x 8 Vectors, 0x0000000000010658>
120
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
121
+ # <string> <string> <double> <double> <uint8> ... <uint16>
122
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
123
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
124
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
125
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
126
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
127
+ # : : : : : : ... :
128
+ # 149 Adelie Dream 37.8 18.1 193 ... 2009
129
+ # 150 Adelie Dream 36.0 17.1 187 ... 2009
130
+ # 151 Adelie Dream 41.5 18.5 201 ... 2009
131
+ #
8
132
  def [](*args)
9
- args.flatten!
10
- raise DataFrameArgumentError, 'Empty dataframe' if empty?
11
- return remove_all_values if args.empty? || args[0].nil?
133
+ raise DataFrameArgumentError, 'self is an empty dataframe' if empty?
134
+
135
+ case args
136
+ in [] | [nil]
137
+ return remove_all_values
138
+ in [(Symbol | String) => k] if key? k
139
+ return variables[k.to_sym]
140
+ in [Integer => i]
141
+ return take([i.negative? ? i + size : i])
142
+ in [Vector => v]
143
+ arrow_array = v.data
144
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
145
+ arrow_array = aa
146
+ else
147
+ a = parse_args(args, size)
148
+ return select_variables_by_keys(a) if a.symbol?
149
+ return take(normalize_indices(Arrow::Array.new(a))) if a.integer?
150
+ return remove_all_values if a.compact.empty?
151
+ return filter_by_array(Arrow::BooleanArray.new(a)) if a.boolean?
152
+
153
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
154
+ end
155
+
156
+ return take(normalize_indices(arrow_array)) if arrow_array.numeric?
157
+ return filter_by_array(arrow_array) if arrow_array.boolean?
12
158
 
13
- vector = parse_to_vector(args)
14
- if vector.boolean?
15
- return filter_by_vector(vector.data) if vector.size == size
159
+ a = arrow_array.to_a
160
+ return select_variables_by_keys(a) if a.symbol_or_string?
16
161
 
17
- raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
162
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
163
+ end
164
+
165
+ # Select a variable by String or Symbol and return as a Vector.
166
+ #
167
+ # @param key [Symbol, String]
168
+ # key name to select.
169
+ # @return [Vector]
170
+ # selected variable as a Vector.
171
+ # @note #v(key) is faster than #[](key).
172
+ # @example Select a column and return Vector
173
+ # penguins.v(:bill_length_mm)
174
+ #
175
+ # # =>
176
+ # #<RedAmber::Vector(:double, size=344):0x000000000000f140>
177
+ # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
178
+ #
179
+ def v(key)
180
+ unless key.is_a?(Symbol) || key.is_a?(String)
181
+ raise DataFrameArgumentError, "Key is not a Symbol or a String: [#{key}]"
18
182
  end
19
- return take_by_array(vector) if vector.numeric?
20
- return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.dictionary?
183
+ raise DataFrameArgumentError, "Key does not exist: [#{key}]" unless key? key
21
184
 
22
- raise DataFrameArgumentError, "Invalid argument: #{args}"
185
+ variables[key.to_sym]
23
186
  end
24
187
 
25
- # slice and select rows to create sub DataFrame
188
+ # Select records to create a DataFrame.
189
+ #
190
+ # @overload slice(row)
191
+ # Select a record and return a DataFrame.
192
+ #
193
+ # @param row [Integer, Float]
194
+ # a row index to select.
195
+ # @return [DataFrame]
196
+ # selected records as a DataFrame.
197
+ # @example Select a row
198
+ # penguins
199
+ #
200
+ # # =>
201
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
202
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
203
+ # <string> <string> <double> <double> <uint8> ... <uint16>
204
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
205
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
206
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
207
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
208
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
209
+ # : : : : : : ... :
210
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
211
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
212
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
213
+ # penguins.slice(2)
214
+ #
215
+ # # =>
216
+ # #<RedAmber::DataFrame : 1 x 8 Vectors, 0x00000000000039d0>
217
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
218
+ # <string> <string> <double> <double> <uint8> ... <uint16>
219
+ # 0 Adelie Torgersen 40.3 18.0 195 ... 2007
220
+ #
221
+ # @overload slice(rows)
222
+ # Select records and return a DataFrame.
223
+ # - Duplicated selection is acceptable. The same record will be returned.
224
+ # - The order of records will be the same as specified indices.
225
+ #
226
+ # @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
227
+ # row indices to select.
228
+ # @return [DataFrame]
229
+ # selected records as a DataFrame.
230
+ # @example Select rows
231
+ # penguins.slice(300..-1)
232
+ #
233
+ # # =>
234
+ # #<RedAmber::DataFrame : 44 x 8 Vectors, 0x000000000000fb54>
235
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
236
+ # <string> <string> <double> <double> <uint8> ... <uint16>
237
+ # 0 Gentoo Biscoe 49.1 14.5 212 ... 2009
238
+ # 1 Gentoo Biscoe 52.5 15.6 221 ... 2009
239
+ # 2 Gentoo Biscoe 47.4 14.6 212 ... 2009
240
+ # 3 Gentoo Biscoe 50.0 15.9 224 ... 2009
241
+ # 4 Gentoo Biscoe 44.9 13.8 212 ... 2009
242
+ # : : : : : : ... :
243
+ # 41 Gentoo Biscoe 50.4 15.7 222 ... 2009
244
+ # 42 Gentoo Biscoe 45.2 14.8 212 ... 2009
245
+ # 43 Gentoo Biscoe 49.9 16.1 213 ... 2009
246
+ #
247
+ # @overload slice(enumerator)
248
+ # Select records and return a DataFrame.
249
+ # - Duplicated selection is acceptable. The same record will be returned.
250
+ # - The order of records will be the same as specified indices.
251
+ #
252
+ # @param enumerator [Enumerator]
253
+ # an enumerator which returns row indices to select.
254
+ # @return [DataFrame]
255
+ # selected records as a DataFrame.
256
+ # @example Select rows by Enumerator.
257
+ # penguins.assign_left(index: penguins.indices) # 0.2.0 feature
258
+ # .slice(0.step(by: 10, to: 340))
259
+ #
260
+ # # =>
261
+ # #<RedAmber::DataFrame : 35 x 9 Vectors, 0x000000000000f2e4>
262
+ # index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
263
+ # <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
264
+ # 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
265
+ # 1 10 Adelie Torgersen 37.8 17.1 186 ... 2007
266
+ # 2 20 Adelie Biscoe 37.8 18.3 174 ... 2007
267
+ # 3 30 Adelie Dream 39.5 16.7 178 ... 2007
268
+ # 4 40 Adelie Dream 36.5 18.0 182 ... 2007
269
+ # : : : : : : : ... :
270
+ # 32 320 Gentoo Biscoe 48.5 15.0 219 ... 2009
271
+ # 33 330 Gentoo Biscoe 50.5 15.2 216 ... 2009
272
+ # 34 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
273
+ #
274
+ # @overload slice
275
+ # Select records by indices with block and return a DataFrame.
276
+ # - Duplicated selection is acceptable. The same record will be returned.
277
+ # - The order of records will be the same as specified indices.
278
+ #
279
+ # @yieldparam self [DataFrame]
280
+ # gives self to the block.
281
+ # The block is evaluated within the context of self.
282
+ # @yieldreturn [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array, Enumerator]
283
+ # row indices to select.
284
+ # @return [DataFrame]
285
+ # selected records as a DataFrame.
286
+ # @example Select rows by block
287
+ # penguins.assign_left(index: penguins.indices) # 0.2.0 feature
288
+ # .slice { 0.step(by: 100, to: 300).map { |i| i..(i+1) } }
289
+ #
290
+ # # =>
291
+ # #<RedAmber::DataFrame : 8 x 9 Vectors, 0x000000000000f3ac>
292
+ # index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
293
+ # <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
294
+ # 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
295
+ # 1 1 Adelie Torgersen 39.5 17.4 186 ... 2007
296
+ # 2 100 Adelie Biscoe 35.0 17.9 192 ... 2009
297
+ # 3 101 Adelie Biscoe 41.0 20.0 203 ... 2009
298
+ # 4 200 Chinstrap Dream 51.5 18.7 187 ... 2009
299
+ # 5 201 Chinstrap Dream 49.8 17.3 198 ... 2009
300
+ # 6 300 Gentoo Biscoe 49.1 14.5 212 ... 2009
301
+ # 7 301 Gentoo Biscoe 52.5 15.6 221 ... 2009
302
+ #
303
+ # @overload slice(booleans)
304
+ # Select records by filtering with booleans and return a DataFrame.
305
+ #
306
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
307
+ # a boolean filter.
308
+ # @return [DataFrame]
309
+ # filtered records as a DataFrame.
310
+ # @example Select rows by boolean filter
311
+ # penguins.slice(penguins[:bill_length_mm] > 50)
312
+ #
313
+ # # =>
314
+ # #<RedAmber::DataFrame : 52 x 8 Vectors, 0x000000000000fd98>
315
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
316
+ # <string> <string> <double> <double> <uint8> ... <uint16>
317
+ # 0 Chinstrap Dream 51.3 19.2 193 ... 2007
318
+ # 1 Chinstrap Dream 52.7 19.8 197 ... 2007
319
+ # 2 Chinstrap Dream 51.3 18.2 197 ... 2007
320
+ # 3 Chinstrap Dream 51.3 19.9 198 ... 2007
321
+ # 4 Chinstrap Dream 51.7 20.3 194 ... 2007
322
+ # : : : : : : ... :
323
+ # 49 Gentoo Biscoe 51.5 16.3 230 ... 2009
324
+ # 50 Gentoo Biscoe 55.1 16.0 230 ... 2009
325
+ # 51 Gentoo Biscoe 50.4 15.7 222 ... 2009
326
+ #
327
+ # @overload slice
328
+ # Select records by filtering with block and return a DataFrame.
329
+ #
330
+ # @yieldparam self [DataFrame]
331
+ # gives self to the block.
332
+ # The block is evaluated within the context of self.
333
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
334
+ # a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
335
+ # @return [DataFrame]
336
+ # filtered records as a DataFrame.
337
+ # @example Select rows by booleans from block
338
+ # penguins.slice { indices.map(&:even?) }
339
+ #
340
+ # # =>
341
+ # #<RedAmber::DataFrame : 172 x 8 Vectors, 0x000000000000ff78>
342
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
343
+ # <string> <string> <double> <double> <uint8> ... <uint16>
344
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
345
+ # 1 Adelie Torgersen 40.3 18.0 195 ... 2007
346
+ # 2 Adelie Torgersen 36.7 19.3 193 ... 2007
347
+ # 3 Adelie Torgersen 38.9 17.8 181 ... 2007
348
+ # 4 Adelie Torgersen 34.1 18.1 193 ... 2007
349
+ # : : : : : : ... :
350
+ # 169 Gentoo Biscoe 47.2 13.7 214 ... 2009
351
+ # 170 Gentoo Biscoe 46.8 14.3 215 ... 2009
352
+ # 171 Gentoo Biscoe 45.2 14.8 212 ... 2009
353
+ #
26
354
  def slice(*args, &block)
27
- slicer = args
355
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
356
+
28
357
  if block
29
- raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
358
+ unless args.empty?
359
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
360
+ end
30
361
 
31
- slicer = [instance_eval(&block)]
362
+ args = [instance_eval(&block)]
32
363
  end
33
- slicer.flatten!
34
364
 
35
- raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
36
- return remove_all_values if slicer.empty? || slicer[0].nil?
37
-
38
- vector = parse_to_vector(slicer)
39
- if vector.boolean?
40
- return filter_by_vector(vector.data) if vector.size == size
365
+ arrow_array =
366
+ case args
367
+ in [] | [[]]
368
+ return remove_all_values
369
+ in [Vector => v]
370
+ v.data
371
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
372
+ aa
373
+ else
374
+ Arrow::Array.new(parse_args(args, size))
375
+ end
41
376
 
42
- raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}"
377
+ if arrow_array.numeric?
378
+ take(normalize_indices(arrow_array))
379
+ elsif arrow_array.boolean?
380
+ filter_by_array(arrow_array)
381
+ elsif arrow_array.to_a.compact.empty?
382
+ # Ruby 3.0.4 does not accept Arrow::Array#compact here. 2.7.6 and 3.1.2 are OK.
383
+ remove_all_values
384
+ else
385
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
43
386
  end
44
- return take_by_array(vector) if vector.numeric?
45
-
46
- raise DataFrameArgumentError, "Invalid argument #{slicer}"
47
387
  end
48
388
 
389
+ # Select records by matching the values of a column specified by a key
390
+ # against the elements (or range of elements) returned by a block.
391
+ #
392
+ # @overload slice_by(key)
393
+ # Select records by elements.
394
+ #
395
+ # @param key [Symbol, String]
396
+ # a key to select column.
397
+ # @param keep_key [true, false]
398
+ # preserve column specified by key in the result if true.
399
+ # @yieldparam self [DataFrame]
400
+ # gives self to the block.
401
+ # The block is evaluated within the context of self.
402
+ # @yieldreturn [<elements>]
403
+ # array of elements to select.
404
+ # @return [DataFrame]
405
+ # selected records as a DataFrame.
406
+ # @example Select records by elements
407
+ # df
408
+ #
409
+ # # =>
410
+ # #<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000069e60>
411
+ # index float string
412
+ # <uint8> <double> <string>
413
+ # 0 0 0.0 A
414
+ # 1 1 1.1 B
415
+ # 2 2 2.2 C
416
+ # 3 3 NaN D
417
+ # 4 (nil) (nil) (nil)
418
+ #
419
+ # df.slice_by(:string) { ["A", "C"] }
420
+ #
421
+ # # =>
422
+ # #<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000001b1ac>
423
+ # index float
424
+ # <uint8> <double>
425
+ # 0 0 0.0
426
+ # 1 2 2.2
427
+ #
428
+ # @overload slice_by(key)
429
+ # Select records by elements range.
430
+ #
431
+ # @param key [Symbol, String]
432
+ # a key to select column.
433
+ # @param keep_key [true, false]
434
+ # preserve column specified by key in the result if true.
435
+ # @yieldparam self [DataFrame]
436
+ # gives self to the block.
437
+ # The block is evaluated within the context of self.
438
+ # @yieldreturn [Range]
439
+ # specifies position of elements at the start and the end and
440
+ # select records between them.
441
+ # @return [DataFrame]
442
+ # selected records as a DataFrame.
443
+ # @example Select records by elements range
444
+ # df.slice_by(:string) { "A".."C" }
445
+ #
446
+ # # =>
447
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000069668>
448
+ # index float
449
+ # <uint8> <double>
450
+ # 0 0 0.0
451
+ # 1 1 1.1
452
+ # 2 2 2.2
453
+ #
454
+ # @since 0.2.1
455
+ #
49
456
  def slice_by(key, keep_key: false, &block)
50
457
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
51
458
  raise DataFrameArgumentError, 'No block given' unless block
52
- raise DataFrameArgumentError, "#{key} is no a key of self" unless key?(key)
459
+ raise DataFrameArgumentError, "#{key} is not a key of self" unless key?(key)
53
460
  return self if key.nil?
54
461
 
55
462
  slicer = instance_eval(&block)
@@ -83,160 +490,414 @@ module RedAmber
83
490
  slicer = slicer.map { |x| x.is_a?(String) ? self[key].index(x) : x }
84
491
  end
85
492
 
86
- if keep_key
87
- take(slicer)
88
- else
89
- take(slicer).drop(key)
90
- end
493
+ taken = take(normalize_indices(Arrow::Array.new(slicer)))
494
+ keep_key ? taken : taken.drop(key)
91
495
  end
92
496
 
93
- # remove selected rows to create remainer DataFrame
94
- def remove(*args, &block)
95
- remover = args
497
+ # Select records by filtering with booleans to create a DataFrame.
498
+ #
499
+ # @overload filter(booleans)
500
+ # Select records by filtering with booleans and return a DataFrame.
501
+ #
502
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
503
+ # a boolean filter.
504
+ # @return [DataFrame]
505
+ # filtered records as a DataFrame.
506
+ # @example Filter by boolean Vector
507
+ # penguins
508
+ #
509
+ # # =>
510
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
511
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
512
+ # <string> <string> <double> <double> <uint8> ... <uint16>
513
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
514
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
515
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
516
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
517
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
518
+ # : : : : : : ... :
519
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
520
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
521
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
522
+ #
523
+ # penguins.filter(penguins.bill_length_mm < 50)
524
+ #
525
+ # # =>
526
+ # #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101a8>
527
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
528
+ # <string> <string> <double> <double> <uint8> ... <uint16>
529
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
530
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
531
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
532
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
533
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
534
+ # : : : : : : ... :
535
+ # 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
536
+ # 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
537
+ # 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
538
+ #
539
+ # @overload filter
540
+ # Select records by filtering with block and return a DataFrame.
541
+ #
542
+ # @yieldparam self [DataFrame]
543
+ # gives self to the block.
544
+ # The block is evaluated within the context of self.
545
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
546
+ # a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
547
+ # @return [DataFrame]
548
+ # filtered records as a DataFrame.
549
+ # @example Filter by boolean Vector
550
+ # penguins.filter { bill_length_mm < 50 }
551
+ #
552
+ # # =>
553
+ # #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101bc>
554
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
555
+ # <string> <string> <double> <double> <uint8> ... <uint16>
556
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
557
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
558
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
559
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
560
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
561
+ # : : : : : : ... :
562
+ # 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
563
+ # 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
564
+ # 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
565
+ #
566
+ def filter(*booleans, &block)
567
+ booleans.flatten!
568
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
569
+
96
570
  if block
97
- raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
571
+ unless booleans.empty?
572
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
573
+ end
574
+
575
+ booleans = [instance_eval(&block)]
576
+ end
577
+
578
+ case booleans
579
+ in [] | [[]]
580
+ return remove_all_values
581
+ in [Vector => v] if v.boolean?
582
+ filter_by_array(v.data)
583
+ in [Arrow::ChunkedArray => ca] if ca.boolean?
584
+ filter_by_array(ca)
585
+ in [Arrow::BooleanArray => b]
586
+ filter_by_array(b)
587
+ else
588
+ a = Arrow::Array.new(parse_args(booleans, size))
589
+ unless a.boolean?
590
+ raise DataFrameArgumentError, "not a boolean filter: #{booleans}"
591
+ end
98
592
 
99
- remover = [instance_eval(&block)]
593
+ filter_by_array(a)
100
594
  end
101
- remover.flatten!
595
+ end
102
596
 
103
- raise DataFrameArgumentError, 'Empty dataframe' if empty?
104
- return self if remover.empty? || remover[0].nil?
597
+ # Select records and remove them to create a remainder DataFrame.
598
+ #
599
+ # @overload remove(row)
600
+ # Select a record and remove it to create a remainder DataFrame.
601
+ # - The order of records in self will be preserved.
602
+ #
603
+ # @param row [Integer, Float]
604
+ # a row index to remove.
605
+ # @return [DataFrame]
606
+ # remaining records as a DataFrame.
607
+ # @example Remove a row
608
+ # penguins.remove(-1)
609
+ #
610
+ # # =>
611
+ # #<RedAmber::DataFrame : 343 x 8 Vectors, 0x0000000000010310>
612
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
613
+ # <string> <string> <double> <double> <uint8> ... <uint16>
614
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
615
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
616
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
617
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
618
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
619
+ # : : : : : : ... :
620
+ # 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
621
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
622
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
623
+ #
624
+ # @overload remove(rows)
625
+ # Select records and remove them to create a remainder DataFrame.
626
+ # - Duplicated selection is acceptable.
627
+ # - The order of records in self will be preserved.
628
+ #
629
+ # @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
630
+ # row indices to remove.
631
+ # @return [DataFrame]
632
+ # remaining records as a DataFrame.
633
+ # @example Remove rows
634
+ # penguins.remove(100..200)
635
+ #
636
+ # # =>
637
+ # #<RedAmber::DataFrame : 243 x 8 Vectors, 0x0000000000010450>
638
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
639
+ # <string> <string> <double> <double> <uint8> ... <uint16>
640
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
641
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
642
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
643
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
644
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
645
+ # : : : : : : ... :
646
+ # 240 Gentoo Biscoe 50.4 15.7 222 ... 2009
647
+ # 241 Gentoo Biscoe 45.2 14.8 212 ... 2009
648
+ # 242 Gentoo Biscoe 49.9 16.1 213 ... 2009
649
+ #
650
+ # @overload remove
651
+ # Select records by indices from block
652
+ # and remove them to create a remainder DataFrame.
653
+ # - Duplicated selection is acceptable.
654
+ # - The order of records in self will be preserved.
655
+ #
656
+ # @yieldparam self [DataFrame]
657
+ # gives self to the block.
658
+ # The block is evaluated within the context of self.
659
+ # @yieldreturn [<Integer, Float>, Range<Integer>, Vector, Arrow::Array]
660
+ # row indices to remove.
661
+ # @return [DataFrame]
662
+ # remaining records as a DataFrame.
663
+ # @example Remove rows by indices from block
664
+ # penguins.remove { 0.step(size, 10) }
665
+ #
666
+ # # =>
667
+ # #<RedAmber::DataFrame : 309 x 8 Vectors, 0x00000000000104c8>
668
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
669
+ # <string> <string> <double> <double> <uint8> ... <uint16>
670
+ # 0 Adelie Torgersen 39.5 17.4 186 ... 2007
671
+ # 1 Adelie Torgersen 40.3 18.0 195 ... 2007
672
+ # 2 Adelie Torgersen (nil) (nil) (nil) ... 2007
673
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
674
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
675
+ # : : : : : : ... :
676
+ # 306 Gentoo Biscoe 50.4 15.7 222 ... 2009
677
+ # 307 Gentoo Biscoe 45.2 14.8 212 ... 2009
678
+ # 308 Gentoo Biscoe 49.9 16.1 213 ... 2009
679
+ #
680
+ # @overload remove(booleans)
681
+ # Select records by filtering with booleans and remove them to create a remainder DataFrame.
682
+ # - The order of records in self will be preserved.
683
+ #
684
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
685
+ # a boolean filter to remove.
686
+ # @return [DataFrame]
687
+ # remaining records as a DataFrame.
688
+ # @example Remove rows by boolean filter
689
+ # penguins.remove(penguins.bill_length_mm.is_nil)
690
+ #
691
+ # # =>
692
+ # #<RedAmber::DataFrame : 342 x 8 Vectors, 0x0000000000010234>
693
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
694
+ # <string> <string> <double> <double> <uint8> ... <uint16>
695
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
696
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
697
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
698
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
699
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
700
+ # : : : : : : ... :
701
+ # 339 Gentoo Biscoe 50.4 15.7 222 ... 2009
702
+ # 340 Gentoo Biscoe 45.2 14.8 212 ... 2009
703
+ # 341 Gentoo Biscoe 49.9 16.1 213 ... 2009
704
+ #
705
+ # @overload remove
706
+ # Select records by booleans from block
707
+ # and remove them to create a remainder DataFrame.
708
+ # - The order of records in self will be preserved.
709
+ #
710
+ # @yieldparam self [DataFrame]
711
+ # gives self to the block.
712
+ # The block is evaluated within the context of self.
713
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
714
+ # a boolean filter to remove. `Vector` or `Arrow::Array` must be boolean type.
715
+ # @return [DataFrame]
716
+ # remaining records as a DataFrame.
717
+ # @example Remove rows by booleans from block
718
+ # penguins.remove { (species == 'Adelie') | (year == 2009) }
719
+ #
720
+ # # =>
721
+ # #<RedAmber::DataFrame : 124 x 8 Vectors, 0x00000000000102fc>
722
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
723
+ # <string> <string> <double> <double> <uint8> ... <uint16>
724
+ # 0 Chinstrap Dream 46.5 17.9 192 ... 2007
725
+ # 1 Chinstrap Dream 50.0 19.5 196 ... 2007
726
+ # 2 Chinstrap Dream 51.3 19.2 193 ... 2007
727
+ # 3 Chinstrap Dream 45.4 18.7 188 ... 2007
728
+ # 4 Chinstrap Dream 52.7 19.8 197 ... 2007
729
+ # : : : : : : ... :
730
+ # 121 Gentoo Biscoe 51.1 16.3 220 ... 2008
731
+ # 122 Gentoo Biscoe 45.2 13.8 215 ... 2008
732
+ # 123 Gentoo Biscoe 45.2 16.4 223 ... 2008
733
+ #
734
+ def remove(*args, &block)
735
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
105
736
 
106
- vector = parse_to_vector(remover)
107
- if vector.boolean?
108
- return filter_by_vector(vector.primitive_invert.data) if vector.size == size
737
+ if block
738
+ unless args.empty?
739
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
740
+ end
109
741
 
110
- raise DataFrameArgumentError, "Size is not match in booleans: #{remover}"
742
+ args = [instance_eval(&block)]
111
743
  end
112
- if vector.numeric?
113
- raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
114
744
 
115
- normalized_indices = (vector < 0).if_else(vector + size, vector) # normalize index from tail
116
- if normalized_indices.max >= size
117
- raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}"
745
+ arrow_array =
746
+ case args
747
+ in [] | [[]] | [nil]
748
+ return self
749
+ in [Vector => v]
750
+ v.data
751
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
752
+ aa
753
+ else
754
+ Arrow::Array.new(parse_args(args, size))
118
755
  end
119
756
 
120
- normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array
121
- return remove_all_values if normalized_indices == indices.to_a
122
- return self if normalized_indices.empty?
757
+ if arrow_array.boolean?
758
+ filter_by_array(arrow_array.primitive_invert)
759
+ elsif arrow_array.numeric?
760
+ remover = normalize_indices(arrow_array).to_a
761
+ return self if remover.empty?
123
762
 
124
- index_array = indices.to_a - normalized_indices
763
+ slicer = indices.to_a - remover.map(&:to_i)
764
+ return remove_all_values if slicer.empty?
125
765
 
126
- datum = Arrow::Function.find(:take).execute([table, index_array])
127
- return DataFrame.new(datum.value)
766
+ take(slicer)
767
+ else
768
+ raise DataFrameArgumentError, "Invalid argument #{args}"
128
769
  end
129
-
130
- raise DataFrameArgumentError, "Invalid argument #{remover}"
131
770
  end
132
771
 
772
+ # Remove records (rows) that contain any nil.
773
+ #
774
+ # @return [DataFrame]
775
+ # removed DataFrame.
776
+ # @example
777
+ # penguins.remove_nil
778
+ # # =>
779
+ # #<RedAmber::DataFrame : 333 x 8 Vectors, 0x00000000000039d0>
780
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
781
+ # <string> <string> <double> <double> <uint8> ... <uint16>
782
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
783
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
784
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
785
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
786
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
787
+ # : : : : : : ... :
788
+ # 330 Gentoo Biscoe 50.4 15.7 222 ... 2009
789
+ # 331 Gentoo Biscoe 45.2 14.8 212 ... 2009
790
+ # 332 Gentoo Biscoe 49.9 16.1 213 ... 2009
791
+ #
133
792
  def remove_nil
134
793
  func = Arrow::Function.find(:drop_null)
135
- DataFrame.new(func.execute([table]).value)
794
+ DataFrame.create(func.execute([table]).value)
136
795
  end
137
796
  alias_method :drop_nil, :remove_nil
138
797
 
139
- # Select a variable by a key in String or Symbol
140
- def v(key)
141
- unless key.is_a?(Symbol) || key.is_a?(String)
142
- raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
143
- end
144
- raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
145
-
146
- variables[key.to_sym]
147
- end
148
-
798
+ # Select records from the top.
799
+ #
800
+ # @param n_obs [Integer]
801
+ # number of records to select.
802
+ # @return [DataFrame]
803
+ #
149
804
  def head(n_obs = 5)
150
805
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
151
806
 
152
807
  self[0...[n_obs, size].min]
153
808
  end
154
809
 
810
+ # Select records from the end.
811
+ #
812
+ # @param n_obs [Integer]
813
+ # number of records to select.
814
+ # @return [DataFrame]
815
+ #
155
816
  def tail(n_obs = 5)
156
817
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
157
818
 
158
819
  self[-[n_obs, size].min..]
159
820
  end
160
821
 
822
+ # Select records from the top.
823
+ #
824
+ # @param n_obs [Integer]
825
+ # number of records to select.
826
+ # @return [DataFrame]
827
+ #
161
828
  def first(n_obs = 1)
162
829
  head(n_obs)
163
830
  end
164
831
 
832
+ # Select records from the end.
833
+ #
834
+ # @param n_obs [Integer]
835
+ # number of records to select.
836
+ # @return [DataFrame]
837
+ #
165
838
  def last(n_obs = 1)
166
839
  tail(n_obs)
167
840
  end
168
841
 
169
- # Undocumented
170
- # TODO: support for option {boundscheck: true}
171
- def take(*arg_indices)
172
- arg_indices.flatten!
173
- return remove_all_values if arg_indices.empty?
174
-
175
- arg_indices = arg_indices[0] if arg_indices.one? && !arg_indices[0].is_a?(Numeric)
176
- arg_indices = Vector.new(arg_indices) unless arg_indices.is_a?(Vector)
177
-
178
- take_by_array(arg_indices)
842
+ # Select records by index Array to create a DataFrame.
843
+ #
844
+ # - TODO: support for option `boundscheck: true`
845
+ # - Supports indices in an Arrow::UInt8, UInt16, UInt32, UInt64 or an Array
846
+ # - Negative index is not supported.
847
+ # @param index_array [<Integer>, Arrow::Array]
848
+ # row indices to select.
849
+ # @return [DataFrame]
850
+ # selected records as a DataFrame.
851
+ #
852
+ # @api private
853
+ #
854
+ def take(index_array)
855
+ DataFrame.create(@table.take(index_array))
179
856
  end
180
857
 
181
- # Undocumented
182
- # TODO: support for option {null_selection_behavior: :drop}
183
- def filter(*booleans)
184
- booleans.flatten!
185
- return remove_all_values if booleans.empty?
186
-
187
- b = booleans[0]
188
- case b
189
- when Vector
190
- raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
191
-
192
- filter_by_vector(b.data)
193
- when Arrow::BooleanArray
194
- filter_by_vector(b)
195
- else
196
- raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)
197
-
198
- filter_by_vector(Arrow::BooleanArray.new(booleans))
199
- end
200
- end
858
+ # rubocop:enable Layout/LineLength
201
859
 
202
860
  private
203
861
 
204
- def select_vars_by_keys(keys)
862
+ def select_variables_by_keys(keys)
205
863
  if keys.one?
206
864
  key = keys[0].to_sym
207
- raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
865
+ raise DataFrameArgumentError, "Key does not exist: #{key}" unless key? key
208
866
 
209
867
  variables[key]
868
+ # Vector.new(@table.find_column(*key).data)
210
869
  else
211
- DataFrame.new(@table[keys])
870
+ check_duplicate_keys(keys)
871
+ DataFrame.create(@table.select_columns(*keys))
212
872
  end
213
873
  end
214
874
 
215
- # Accepts indices by numeric Vector
216
- def take_by_array(indices)
217
- raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric?
218
- raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
219
-
220
- normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
221
- raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
222
-
223
- index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
224
-
225
- datum = Arrow::Function.find(:take).execute([table, index_array])
226
- DataFrame.new(datum.value)
875
+ # Accepts indices by numeric arrow array and returns positive indices.
876
+ def normalize_indices(arrow_array)
877
+ b = Arrow::Function.find(:less).execute([arrow_array, 0])
878
+ a = Arrow::Function.find(:add).execute([arrow_array, size])
879
+ r = Arrow::Function.find(:if_else).execute([b, a, arrow_array]).value
880
+ if r.float?
881
+ r = Arrow::Function.find(:floor).execute([r]).value
882
+ Arrow::UInt64ArrayBuilder.build(r)
883
+ else
884
+ r
885
+ end
227
886
  end
228
887
 
229
- # Accepts booleans by Arrow::BooleanArray
230
- def filter_by_vector(boolean_array)
231
- raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
888
+ # Accepts booleans by an Arrow::BooleanArray or an Array
889
+ def filter_by_array(boolean_array)
890
+ unless boolean_array.length == size
891
+ raise DataFrameArgumentError, 'Booleans must be same size as self.'
892
+ end
232
893
 
233
894
  datum = Arrow::Function.find(:filter).execute([table, boolean_array])
234
- DataFrame.new(datum.value)
895
+ DataFrame.create(datum.value)
235
896
  end
236
897
 
237
898
  # return a DataFrame with same keys as self without values
238
899
  def remove_all_values
239
- filter_by_vector(Arrow::BooleanArray.new([false] * size))
900
+ filter_by_array(Arrow::BooleanArray.new([false] * size))
240
901
  end
241
902
  end
242
903
  end