red_amber 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +39 -20
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +113 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +25 -26
  8. data/benchmark/basic.yml +2 -2
  9. data/benchmark/combine.yml +2 -2
  10. data/benchmark/dataframe.yml +2 -2
  11. data/benchmark/group.yml +2 -2
  12. data/benchmark/reshape.yml +2 -2
  13. data/benchmark/vector.yml +3 -0
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +429 -75
  20. data/lib/red_amber/data_frame_combinable.rb +516 -66
  21. data/lib/red_amber/data_frame_displayable.rb +244 -14
  22. data/lib/red_amber/data_frame_indexable.rb +121 -18
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +622 -66
  26. data/lib/red_amber/data_frame_variable_operation.rb +446 -34
  27. data/lib/red_amber/group.rb +187 -22
  28. data/lib/red_amber/helper.rb +70 -10
  29. data/lib/red_amber/refinements.rb +12 -5
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +385 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +217 -12
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,40 +1,133 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
6
  # Array, Arrow::Array and Arrow::ChunkedArray are refined
7
7
  using RefineArray
8
8
  using RefineArrayLike
9
9
 
10
+ # rubocop:disable Layout/LineLength
11
+
10
12
  # Select variables or records.
11
13
  #
12
14
  # @overload [](key)
13
- # select single variable and return as a Vetor.
15
+ # Select single variable (column) and return as a Vector.
14
16
  #
15
- # @param key [Symbol, String] key name to select.
16
- # @return [Vector] selected variable as a Vector.
17
+ # @param key [Symbol, String]
18
+ # key name to select.
19
+ # @return [Vector]
20
+ # selected variable as a Vector.
17
21
  # @note DataFrame.v(key) is faster to create Vector from a variable.
22
+ # @example Select a column and return Vector
23
+ # penguins
24
+ #
25
+ # # =>
26
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
27
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
28
+ # <string> <string> <double> <double> <uint8> ... <uint16>
29
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
30
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
31
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
32
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
33
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
34
+ # : : : : : : ... :
35
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
36
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
37
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
38
+ #
39
+ # penguins[:bill_length_mm]
40
+ #
41
+ # # =>
42
+ # #<RedAmber::Vector(:double, size=344):0x00000000000104dc>
43
+ # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
18
44
  #
19
45
  # @overload [](keys)
20
- # select variables and return a DataFrame.
46
+ # Select variables and return a DataFrame.
21
47
  #
22
48
  # @param keys [<Symbol, String>] key names to select.
23
- # @return [DataFrame] selected variables as a DataFrame.
49
+ # @return [DataFrame]
50
+ # selected variables as a DataFrame.
51
+ # @example Select columns
52
+ # penguins[:island, :bill_length_mm]
53
+ #
54
+ # # =>
55
+ # #<RedAmber::DataFrame : 344 x 2 Vectors, 0x00000000000104f0>
56
+ # island bill_length_mm
57
+ # <string> <double>
58
+ # 0 Torgersen 39.1
59
+ # 1 Torgersen 39.5
60
+ # 2 Torgersen 40.3
61
+ # 3 Torgersen (nil)
62
+ # 4 Torgersen 36.7
63
+ # : : :
64
+ # 341 Biscoe 50.4
65
+ # 342 Biscoe 45.2
66
+ # 343 Biscoe 49.9
24
67
  #
25
68
  # @overload [](index)
26
- # select records and return a DataFrame.
69
+ # Select a record and return a DataFrame.
27
70
  #
28
71
  # @param index [Integer, Float, Range<Integer>, Vector, Arrow::Array]
29
72
  # index of a row to select.
30
- # @return [DataFrame] selected variables as a DataFrame.
73
+ # @return [DataFrame]
74
+ # selected variables as a DataFrame.
75
+ # @example Select a row
76
+ # penguins[0]
77
+ #
78
+ # # =>
79
+ # #<RedAmber::DataFrame : 1 x 8 Vectors, 0x0000000000010504>
80
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
81
+ # <string> <string> <double> <double> <uint8> ... <uint16>
82
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
31
83
  #
32
84
  # @overload [](indices)
33
- # select records and return a DataFrame.
85
+ # Select records by indices and return a DataFrame.
34
86
  #
35
- # @param indices [<Indeger, Float, Range<Integer>, Vector, Arrow::Array>]
87
+ # @param indices [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
36
88
  # indices of rows to select.
37
- # @return [DataFrame] selected variables as a DataFrame.
89
+ # @return [DataFrame]
90
+ # selected variables as a DataFrame.
91
+ # @example Select rows by indices
92
+ # penguins[0..100]
93
+ #
94
+ # # =>
95
+ # #<RedAmber::DataFrame : 101 x 8 Vectors, 0x00000000000105e0>
96
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
97
+ # <string> <string> <double> <double> <uint8> ... <uint16>
98
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
99
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
100
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
101
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
102
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
103
+ # : : : : : : ... :
104
+ # 98 Adelie Dream 33.1 16.1 178 ... 2008
105
+ # 99 Adelie Dream 43.2 18.5 192 ... 2008
106
+ # 100 Adelie Biscoe 35.0 17.9 192 ... 2009
107
+ #
108
+ # @overload [](booleans)
109
+ # Select records by booleans and return a DataFrame.
110
+ #
111
+ # @param booleans [Array<true, false, nil>, Vector, Arrow::Array]
112
+ # booleans of rows to select.
113
+ # @return [DataFrame]
114
+ # selected variables as a DataFrame.
115
+ # @example Select rows by booleans
116
+ # penguins[penguins.species == 'Adelie']
117
+ #
118
+ # # =>
119
+ # #<RedAmber::DataFrame : 152 x 8 Vectors, 0x0000000000010658>
120
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
121
+ # <string> <string> <double> <double> <uint8> ... <uint16>
122
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
123
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
124
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
125
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
126
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
127
+ # : : : : : : ... :
128
+ # 149 Adelie Dream 37.8 18.1 193 ... 2009
129
+ # 150 Adelie Dream 36.0 17.1 187 ... 2009
130
+ # 151 Adelie Dream 41.5 18.5 201 ... 2009
38
131
  #
39
132
  def [](*args)
40
133
  raise DataFrameArgumentError, 'self is an empty dataframe' if empty?
@@ -52,10 +145,10 @@ module RedAmber
52
145
  arrow_array = aa
53
146
  else
54
147
  a = parse_args(args, size)
55
- return select_variables_by_keys(a) if a.symbols?
56
- return take(normalize_indices(Arrow::Array.new(a))) if a.integers?
148
+ return select_variables_by_keys(a) if a.symbol?
149
+ return take(normalize_indices(Arrow::Array.new(a))) if a.integer?
57
150
  return remove_all_values if a.compact.empty?
58
- return filter_by_array(Arrow::BooleanArray.new(a)) if a.booleans?
151
+ return filter_by_array(Arrow::BooleanArray.new(a)) if a.boolean?
59
152
 
60
153
  raise DataFrameArgumentError, "invalid arguments: #{args}"
61
154
  end
@@ -64,12 +157,25 @@ module RedAmber
64
157
  return filter_by_array(arrow_array) if arrow_array.boolean?
65
158
 
66
159
  a = arrow_array.to_a
67
- return select_variables_by_keys(a) if a.symbols_or_strings?
160
+ return select_variables_by_keys(a) if a.symbol_or_string?
68
161
 
69
162
  raise DataFrameArgumentError, "invalid arguments: #{args}"
70
163
  end
71
164
 
72
- # Select a variable by a key in String or Symbol
165
+ # Select a variable by String or Symbol and return as a Vector.
166
+ #
167
+ # @param key [Symbol, String]
168
+ # key name to select.
169
+ # @return [Vector]
170
+ # selected variable as a Vector.
171
+ # @note #v(key) is faster than #[](key).
172
+ # @example Select a column and return Vector
173
+ # penguins.v(:bill_length_mm)
174
+ #
175
+ # # =>
176
+ # #<RedAmber::Vector(:double, size=344):0x000000000000f140>
177
+ # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
178
+ #
73
179
  def v(key)
74
180
  unless key.is_a?(Symbol) || key.is_a?(String)
75
181
  raise DataFrameArgumentError, "Key is not a Symbol or a String: [#{key}]"
@@ -82,30 +188,168 @@ module RedAmber
82
188
  # Select records to create a DataFrame.
83
189
  #
84
190
  # @overload slice(row)
85
- # select a record and return a DataFrame.
191
+ # Select a record and return a DataFrame.
86
192
  #
87
- # @param row [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
88
- # a row index to select.
89
- # @yield [self] gives self to the block.
90
- # @note The block is evaluated within the context of self.
91
- # It is accessable to self's instance variables and private methods.
92
- # @yieldreturn [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
193
+ # @param row [Integer, Float]
93
194
  # a row index to select.
94
- # @return [DataFrame] selected variables as a DataFrame.
195
+ # @return [DataFrame]
196
+ # selected records as a DataFrame.
197
+ # @example Select a row
198
+ # penguins
199
+ #
200
+ # # =>
201
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
202
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
203
+ # <string> <string> <double> <double> <uint8> ... <uint16>
204
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
205
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
206
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
207
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
208
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
209
+ # : : : : : : ... :
210
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
211
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
212
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
213
+ # penguins.slice(2)
214
+ #
215
+ # # =>
216
+ # #<RedAmber::DataFrame : 1 x 8 Vectors, 0x00000000000039d0>
217
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
218
+ # <string> <string> <double> <double> <uint8> ... <uint16>
219
+ # 0 Adelie Torgersen 40.3 18.0 195 ... 2007
95
220
  #
96
221
  # @overload slice(rows)
97
- # select records and return a DataFrame.
222
+ # Select records and return a DataFrame.
98
223
  # - Duplicated selection is acceptable. The same record will be returned.
99
224
  # - The order of records will be the same as specified indices.
100
225
  #
101
- # @param rows [Integer, Float, Range<Integer>, Vector, Arrow::Array]
226
+ # @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
102
227
  # row indices to select.
103
- # @yield [self] gives self to the block.
104
- # @note The block is evaluated within the context of self.
105
- # It is accessable to self's instance variables and private methods.
106
- # @yieldreturn [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
228
+ # @return [DataFrame]
229
+ # selected records as a DataFrame.
230
+ # @example Select rows
231
+ # penguins.slice(300..-1)
232
+ #
233
+ # # =>
234
+ # #<RedAmber::DataFrame : 44 x 8 Vectors, 0x000000000000fb54>
235
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
236
+ # <string> <string> <double> <double> <uint8> ... <uint16>
237
+ # 0 Gentoo Biscoe 49.1 14.5 212 ... 2009
238
+ # 1 Gentoo Biscoe 52.5 15.6 221 ... 2009
239
+ # 2 Gentoo Biscoe 47.4 14.6 212 ... 2009
240
+ # 3 Gentoo Biscoe 50.0 15.9 224 ... 2009
241
+ # 4 Gentoo Biscoe 44.9 13.8 212 ... 2009
242
+ # : : : : : : ... :
243
+ # 41 Gentoo Biscoe 50.4 15.7 222 ... 2009
244
+ # 42 Gentoo Biscoe 45.2 14.8 212 ... 2009
245
+ # 43 Gentoo Biscoe 49.9 16.1 213 ... 2009
246
+ #
247
+ # @overload slice(enumerator)
248
+ # Select records and return a DataFrame.
249
+ # - Duplicated selection is acceptable. The same record will be returned.
250
+ # - The order of records will be the same as specified indices.
251
+ #
252
+ # @param enumerator [Enumerator]
253
+ # an enumerator which returns row indices to select.
254
+ # @return [DataFrame]
255
+ # selected records as a DataFrame.
256
+ # @example Select rows by Enumerator.
257
+ # penguins.assign_left(index: penguins.indices) # 0.2.0 feature
258
+ # .slice(0.step(by: 10, to: 340))
259
+ #
260
+ # # =>
261
+ # #<RedAmber::DataFrame : 35 x 9 Vectors, 0x000000000000f2e4>
262
+ # index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
263
+ # <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
264
+ # 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
265
+ # 1 10 Adelie Torgersen 37.8 17.1 186 ... 2007
266
+ # 2 20 Adelie Biscoe 37.8 18.3 174 ... 2007
267
+ # 3 30 Adelie Dream 39.5 16.7 178 ... 2007
268
+ # 4 40 Adelie Dream 36.5 18.0 182 ... 2007
269
+ # : : : : : : : ... :
270
+ # 32 320 Gentoo Biscoe 48.5 15.0 219 ... 2009
271
+ # 33 330 Gentoo Biscoe 50.5 15.2 216 ... 2009
272
+ # 34 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
273
+ #
274
+ # @overload slice
275
+ # Select records by indices with block and return a DataFrame.
276
+ # - Duplicated selection is acceptable. The same record will be returned.
277
+ # - The order of records will be the same as specified indices.
278
+ #
279
+ # @yieldparam self [DataFrame]
280
+ # gives self to the block.
281
+ # The block is evaluated within the context of self.
282
+ # @yieldreturn [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array, Enumerator]
107
283
  # row indices to select.
108
- # @return [DataFrame] selected variables as a DataFrame.
284
+ # @return [DataFrame]
285
+ # selected records as a DataFrame.
286
+ # @example Select rows by block
287
+ # penguins.assign_left(index: penguins.indices) # 0.2.0 feature
288
+ # .slice { 0.step(by: 100, to: 300).map { |i| i..(i+1) } }
289
+ #
290
+ # # =>
291
+ # #<RedAmber::DataFrame : 8 x 9 Vectors, 0x000000000000f3ac>
292
+ # index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
293
+ # <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
294
+ # 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
295
+ # 1 1 Adelie Torgersen 39.5 17.4 186 ... 2007
296
+ # 2 100 Adelie Biscoe 35.0 17.9 192 ... 2009
297
+ # 3 101 Adelie Biscoe 41.0 20.0 203 ... 2009
298
+ # 4 200 Chinstrap Dream 51.5 18.7 187 ... 2009
299
+ # 5 201 Chinstrap Dream 49.8 17.3 198 ... 2009
300
+ # 6 300 Gentoo Biscoe 49.1 14.5 212 ... 2009
301
+ # 7 301 Gentoo Biscoe 52.5 15.6 221 ... 2009
302
+ #
303
+ # @overload slice(booleans)
304
+ # Select records by filtering with booleans and return a DataFrame.
305
+ #
306
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
307
+ # a boolean filter.
308
+ # @return [DataFrame]
309
+ # filtered records as a DataFrame.
310
+ # @example Select rows by boolean filter
311
+ # penguins.slice(penguins[:bill_length_mm] > 50)
312
+ #
313
+ # # =>
314
+ # #<RedAmber::DataFrame : 52 x 8 Vectors, 0x000000000000fd98>
315
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
316
+ # <string> <string> <double> <double> <uint8> ... <uint16>
317
+ # 0 Chinstrap Dream 51.3 19.2 193 ... 2007
318
+ # 1 Chinstrap Dream 52.7 19.8 197 ... 2007
319
+ # 2 Chinstrap Dream 51.3 18.2 197 ... 2007
320
+ # 3 Chinstrap Dream 51.3 19.9 198 ... 2007
321
+ # 4 Chinstrap Dream 51.7 20.3 194 ... 2007
322
+ # : : : : : : ... :
323
+ # 49 Gentoo Biscoe 51.5 16.3 230 ... 2009
324
+ # 50 Gentoo Biscoe 55.1 16.0 230 ... 2009
325
+ # 51 Gentoo Biscoe 50.4 15.7 222 ... 2009
326
+ #
327
+ # @overload slice
328
+ # Select records by filtering with block and return a DataFrame.
329
+ #
330
+ # @yieldparam self [DataFrame]
331
+ # gives self to the block.
332
+ # The block is evaluated within the context of self.
333
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
334
+ # a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
335
+ # @return [DataFrame]
336
+ # filtered records as a DataFrame.
337
+ # @example Select rows by booleans from block
338
+ # penguins.slice { indices.map(&:even?) }
339
+ #
340
+ # # =>
341
+ # #<RedAmber::DataFrame : 172 x 8 Vectors, 0x000000000000ff78>
342
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
343
+ # <string> <string> <double> <double> <uint8> ... <uint16>
344
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
345
+ # 1 Adelie Torgersen 40.3 18.0 195 ... 2007
346
+ # 2 Adelie Torgersen 36.7 19.3 193 ... 2007
347
+ # 3 Adelie Torgersen 38.9 17.8 181 ... 2007
348
+ # 4 Adelie Torgersen 34.1 18.1 193 ... 2007
349
+ # : : : : : : ... :
350
+ # 169 Gentoo Biscoe 47.2 13.7 214 ... 2009
351
+ # 170 Gentoo Biscoe 46.8 14.3 215 ... 2009
352
+ # 171 Gentoo Biscoe 45.2 14.8 212 ... 2009
109
353
  #
110
354
  def slice(*args, &block)
111
355
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
@@ -142,6 +386,73 @@ module RedAmber
142
386
  end
143
387
  end
144
388
 
389
+ # Select records by a column specified by a key
390
+ # and corresponding record with a block.
391
+ #
392
+ # @overload slice_by(key)
393
+ # Select records by elements.
394
+ #
395
+ # @param key [Symbol, String]
396
+ # a key to select column.
397
+ # @param keep_key [true, false]
398
+ # preserve column specified by key in the result if true.
399
+ # @yieldparam self [DataFrame]
400
+ # gives self to the block.
401
+ # The block is evaluated within the context of self.
402
+ # @yieldreturn [<elements>]
403
+ # array of elements to select.
404
+ # @return [DataFrame]
405
+ # selected records as a DataFrame.
406
+ # @example Select records by elements
407
+ # df
408
+ #
409
+ # # =>
410
+ # #<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000069e60>
411
+ # index float string
412
+ # <uint8> <double> <string>
413
+ # 0 0 0.0 A
414
+ # 1 1 1.1 B
415
+ # 2 2 2.2 C
416
+ # 3 3 NaN D
417
+ # 4 (nil) (nil) (nil)
418
+ #
419
+ # df.slice_by(:string) { ["A", "C"] }
420
+ #
421
+ # # =>
422
+ # #<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000001b1ac>
423
+ # index float
424
+ # <uint8> <double>
425
+ # 0 0 0.0
426
+ # 1 2 2.2
427
+ #
428
+ # @overload slice_by(key)
429
+ # Select records by elements range.
430
+ #
431
+ # @param key [Symbol, String]
432
+ # a key to select column.
433
+ # @param keep_key [true, false]
434
+ # preserve column specified by key in the result if true.
435
+ # @yieldparam self [DataFrame]
436
+ # gives self to the block.
437
+ # The block is evaluated within the context of self.
438
+ # @yieldreturn [Range]
439
+ # specifies position of elements at the start and the end and
440
+ # select records between them.
441
+ # @return [DataFrame]
442
+ # selected records as a DataFrame.
443
+ # @example Select records by elements range
444
+ # df.slice_by(:string) { "A".."C" }
445
+ #
446
+ # # =>
447
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000069668>
448
+ # index float
449
+ # <uint8> <double>
450
+ # 0 0 0.0
451
+ # 1 1 1.1
452
+ # 2 2 2.2
453
+ #
454
+ # @since 0.2.1
455
+ #
145
456
  def slice_by(key, keep_key: false, &block)
146
457
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
147
458
  raise DataFrameArgumentError, 'No block given' unless block
@@ -183,33 +494,242 @@ module RedAmber
183
494
  keep_key ? taken : taken.drop(key)
184
495
  end
185
496
 
497
+ # Select records by filtering with booleans to create a DataFrame.
498
+ #
499
+ # @overload filter(booleans)
500
+ # Select records by filtering with booleans and return a DataFrame.
501
+ #
502
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
503
+ # a boolean filter.
504
+ # @return [DataFrame]
505
+ # filtered records as a DataFrame.
506
+ # @example Filter by boolean Vector
507
+ # penguins
508
+ #
509
+ # # =>
510
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
511
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
512
+ # <string> <string> <double> <double> <uint8> ... <uint16>
513
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
514
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
515
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
516
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
517
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
518
+ # : : : : : : ... :
519
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
520
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
521
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
522
+ #
523
+ # penguins.filter(penguins.bill_length_mm < 50)
524
+ #
525
+ # # =>
526
+ # #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101a8>
527
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
528
+ # <string> <string> <double> <double> <uint8> ... <uint16>
529
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
530
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
531
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
532
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
533
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
534
+ # : : : : : : ... :
535
+ # 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
536
+ # 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
537
+ # 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
538
+ #
539
+ # @overload filter
540
+ # Select records by filtering with block and return a DataFrame.
541
+ #
542
+ # @yieldparam self [DataFrame]
543
+ # gives self to the block.
544
+ # The block is evaluated within the context of self.
545
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
546
+ # a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
547
+ # @return [DataFrame]
548
+ # filtered records as a DataFrame.
549
+ # @example Filter by boolean Vector
550
+ # penguins.filter { bill_length_mm < 50 }
551
+ #
552
+ # # =>
553
+ # #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101bc>
554
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
555
+ # <string> <string> <double> <double> <uint8> ... <uint16>
556
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
557
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
558
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
559
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
560
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
561
+ # : : : : : : ... :
562
+ # 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
563
+ # 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
564
+ # 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
565
+ #
566
+ def filter(*booleans, &block)
567
+ booleans.flatten!
568
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
569
+
570
+ if block
571
+ unless booleans.empty?
572
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
573
+ end
574
+
575
+ booleans = [instance_eval(&block)]
576
+ end
577
+
578
+ case booleans
579
+ in [] | [[]]
580
+ return remove_all_values
581
+ in [Vector => v] if v.boolean?
582
+ filter_by_array(v.data)
583
+ in [Arrow::ChunkedArray => ca] if ca.boolean?
584
+ filter_by_array(ca)
585
+ in [Arrow::BooleanArray => b]
586
+ filter_by_array(b)
587
+ else
588
+ a = Arrow::Array.new(parse_args(booleans, size))
589
+ unless a.boolean?
590
+ raise DataFrameArgumentError, "not a boolean filter: #{booleans}"
591
+ end
592
+
593
+ filter_by_array(a)
594
+ end
595
+ end
596
+
186
597
  # Select records and remove them to create a remainer DataFrame.
187
598
  #
188
599
  # @overload remove(row)
189
- # select a record and remove it to create a remainer DataFrame.
600
+ # Select a record and remove it to create a remainer DataFrame.
190
601
  # - The order of records in self will be preserved.
191
602
  #
192
- # @param row [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
603
+ # @param row [Integer, Float]
193
604
  # a row index to remove.
194
- # @yield [self] gives self to the block.
195
- # @note The block is evaluated within the context of self.
196
- # It is accessable to self's instance variables and private methods.
197
- # @yieldreturn [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
198
- # a row index to remove.
199
- # @return [DataFrame] remainer variables as a DataFrame.
605
+ # @return [DataFrame]
606
+ # remainer variables as a DataFrame.
607
+ # @example Remove a row
608
+ # penguins.remove(-1)
609
+ #
610
+ # # =>
611
+ # #<RedAmber::DataFrame : 343 x 8 Vectors, 0x0000000000010310>
612
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
613
+ # <string> <string> <double> <double> <uint8> ... <uint16>
614
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
615
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
616
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
617
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
618
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
619
+ # : : : : : : ... :
620
+ # 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
621
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
622
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
200
623
  #
201
624
  # @overload remove(rows)
202
- # select records and remove them to create a remainer DataFrame.
625
+ # Select records and remove them to create a remainer DataFrame.
626
+ # - Duplicated selection is acceptable.
203
627
  # - The order of records in self will be preserved.
204
628
  #
205
- # @param rows [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
629
+ # @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
206
630
  # row indices to remove.
207
- # @yield [self] gives self to the block.
208
- # @note The block is evaluated within the context of self.
209
- # It is accessable to self's instance variables and private methods.
210
- # @yieldreturn [<Indeger, Float, Range<Integer>, Vector, Arrow::Array>]
631
+ # @return [DataFrame]
632
+ # remainer variables as a DataFrame.
633
+ # @example Remove rows
634
+ # penguins.remove(100..200)
635
+ #
636
+ # # =>
637
+ # #<RedAmber::DataFrame : 243 x 8 Vectors, 0x0000000000010450>
638
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
639
+ # <string> <string> <double> <double> <uint8> ... <uint16>
640
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
641
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
642
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
643
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
644
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
645
+ # : : : : : : ... :
646
+ # 240 Gentoo Biscoe 50.4 15.7 222 ... 2009
647
+ # 241 Gentoo Biscoe 45.2 14.8 212 ... 2009
648
+ # 242 Gentoo Biscoe 49.9 16.1 213 ... 2009
649
+ #
650
+ # @overload remove
651
+ # Select records by indices from block
652
+ # and remove them to create a remainer DataFrame.
653
+ # - Duplicated selection is acceptable.
654
+ # - The order of records in self will be preserved.
655
+ #
656
+ # @yieldparam self [DataFrame]
657
+ # gives self to the block.
658
+ # The block is evaluated within the context of self.
659
+ # @yieldreturn [<Integer, Float>, Range<Integer>, Vector, Arrow::Array]
211
660
  # row indices to remove.
212
- # @return [DataFrame] remainer variables as a DataFrame.
661
+ # @return [DataFrame]
662
+ # remainer variables as a DataFrame.
663
+ # @example Remove rows by indices from block
664
+ # penguins.remove { 0.step(size, 10) }
665
+ #
666
+ # # =>
667
+ # #<RedAmber::DataFrame : 309 x 8 Vectors, 0x00000000000104c8>
668
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
669
+ # <string> <string> <double> <double> <uint8> ... <uint16>
670
+ # 0 Adelie Torgersen 39.5 17.4 186 ... 2007
671
+ # 1 Adelie Torgersen 40.3 18.0 195 ... 2007
672
+ # 2 Adelie Torgersen (nil) (nil) (nil) ... 2007
673
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
674
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
675
+ # : : : : : : ... :
676
+ # 306 Gentoo Biscoe 50.4 15.7 222 ... 2009
677
+ # 307 Gentoo Biscoe 45.2 14.8 212 ... 2009
678
+ # 308 Gentoo Biscoe 49.9 16.1 213 ... 2009
679
+ #
680
+ # @overload remove(booleans)
681
+ # Select records by filtering with booleans and remove them to create a remainer DataFrame.
682
+ # - The order of records in self will be preserved.
683
+ #
684
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
685
+ # a boolean filter to remove.
686
+ # @return [DataFrame]
687
+ # remainer records as a DataFrame.
688
+ # @example Remove rows by boolean filter
689
+ # penguins.remove(penguins.bill_length_mm.is_nil)
690
+ #
691
+ # # =>
692
+ # #<RedAmber::DataFrame : 342 x 8 Vectors, 0x0000000000010234>
693
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
694
+ # <string> <string> <double> <double> <uint8> ... <uint16>
695
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
696
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
697
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
698
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
699
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
700
+ # : : : : : : ... :
701
+ # 339 Gentoo Biscoe 50.4 15.7 222 ... 2009
702
+ # 340 Gentoo Biscoe 45.2 14.8 212 ... 2009
703
+ # 341 Gentoo Biscoe 49.9 16.1 213 ... 2009
704
+ #
705
+ # @overload remove
706
+ # Select records by booleans from block
707
+ # and remove them to create a remainer DataFrame.
708
+ # - The order of records in self will be preserved.
709
+ #
710
+ # @yieldparam self [DataFrame]
711
+ # gives self to the block.
712
+ # The block is evaluated within the context of self.
713
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
714
+ # a boolean filter to remove. `Vector` or `Arrow::Array` must be boolean type.
715
+ # @return [DataFrame]
716
+ # remainer records as a DataFrame.
717
+ # @example Remove rows by booleans from block
718
+ # penguins.remove { (species == 'Adelie') | (year == 2009) }
719
+ #
720
+ # # =>
721
+ # #<RedAmber::DataFrame : 124 x 8 Vectors, 0x00000000000102fc>
722
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
723
+ # <string> <string> <double> <double> <uint8> ... <uint16>
724
+ # 0 Chinstrap Dream 46.5 17.9 192 ... 2007
725
+ # 1 Chinstrap Dream 50.0 19.5 196 ... 2007
726
+ # 2 Chinstrap Dream 51.3 19.2 193 ... 2007
727
+ # 3 Chinstrap Dream 45.4 18.7 188 ... 2007
728
+ # 4 Chinstrap Dream 52.7 19.8 197 ... 2007
729
+ # : : : : : : ... :
730
+ # 121 Gentoo Biscoe 51.1 16.3 220 ... 2008
731
+ # 122 Gentoo Biscoe 45.2 13.8 215 ... 2008
732
+ # 123 Gentoo Biscoe 45.2 16.4 223 ... 2008
213
733
  #
214
734
  def remove(*args, &block)
215
735
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
@@ -249,57 +769,93 @@ module RedAmber
249
769
  end
250
770
  end
251
771
 
772
+ # Remove records (rows) that contain any nil.
773
+ #
774
+ # @return [DataFrame]
775
+ # removed DataFrame.
776
+ # @example
777
+ # penguins.remove_nil
778
+ # # =>
779
+ # #<RedAmber::DataFrame : 333 x 8 Vectors, 0x00000000000039d0>
780
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
781
+ # <string> <string> <double> <double> <uint8> ... <uint16>
782
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
783
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
784
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
785
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
786
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
787
+ # : : : : : : ... :
788
+ # 330 Gentoo Biscoe 50.4 15.7 222 ... 2009
789
+ # 331 Gentoo Biscoe 45.2 14.8 212 ... 2009
790
+ # 332 Gentoo Biscoe 49.9 16.1 213 ... 2009
791
+ #
252
792
  def remove_nil
253
793
  func = Arrow::Function.find(:drop_null)
254
794
  DataFrame.create(func.execute([table]).value)
255
795
  end
256
796
  alias_method :drop_nil, :remove_nil
257
797
 
798
+ # Select records from the top.
799
+ #
800
+ # @param n_obs [Integer]
801
+ # number of records to select.
802
+ # @return [DataFrame]
803
+ #
258
804
  def head(n_obs = 5)
259
805
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
260
806
 
261
807
  self[0...[n_obs, size].min]
262
808
  end
263
809
 
810
+ # Select records from the end.
811
+ #
812
+ # @param n_obs [Integer]
813
+ # number of records to select.
814
+ # @return [DataFrame]
815
+ #
264
816
  def tail(n_obs = 5)
265
817
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
266
818
 
267
819
  self[-[n_obs, size].min..]
268
820
  end
269
821
 
822
+ # Select records from the top.
823
+ #
824
+ # @param n_obs [Integer]
825
+ # number of records to select.
826
+ # @return [DataFrame]
827
+ #
270
828
  def first(n_obs = 1)
271
829
  head(n_obs)
272
830
  end
273
831
 
832
+ # Select records from the end.
833
+ #
834
+ # @param n_obs [Integer]
835
+ # number of records to select.
836
+ # @return [DataFrame]
837
+ #
274
838
  def last(n_obs = 1)
275
839
  tail(n_obs)
276
840
  end
277
841
 
842
+ # Select records by index Array to create a DataFrame.
843
+ #
844
+ # - TODO: support for option `boundscheck: true`
845
+ # - Supports indices in an Arrow::UInt8, UInt16, UInt32, UInt64 or an Array
846
+ # - Negative index is not supported.
847
+ # @param index_array [<Integer>, Arrow::Array]
848
+ # row indices to select.
849
+ # @return [DataFrame]
850
+ # selected variables as a DataFrame.
851
+ #
278
852
  # @api private
279
- # TODO: support for option `boundscheck: true`
280
- # Supports indices in an Arrow::UInt{8, 16, 32, 64} or an Array
281
- # Negative index is not supported.
853
+ #
282
854
  def take(index_array)
283
855
  DataFrame.create(@table.take(index_array))
284
856
  end
285
857
 
286
- # @api private
287
- # TODO: support for option `null_selection_behavior: :drop``
288
- def filter(*booleans)
289
- booleans.flatten!
290
- case booleans
291
- in []
292
- return remove_all_values
293
- in [Arrow::BooleanArray => b]
294
- filter_by_array(b)
295
- else
296
- unless booleans.booleans?
297
- raise DataFrameArgumentError, 'Argument is not a boolean.'
298
- end
299
-
300
- filter_by_array(Arrow::BooleanArray.new(booleans))
301
- end
302
- end
858
+ # rubocop:enable Layout/LineLength
303
859
 
304
860
  private
305
861