red_amber 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +39 -20
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +113 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +25 -26
  8. data/benchmark/basic.yml +2 -2
  9. data/benchmark/combine.yml +2 -2
  10. data/benchmark/dataframe.yml +2 -2
  11. data/benchmark/group.yml +2 -2
  12. data/benchmark/reshape.yml +2 -2
  13. data/benchmark/vector.yml +3 -0
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +429 -75
  20. data/lib/red_amber/data_frame_combinable.rb +516 -66
  21. data/lib/red_amber/data_frame_displayable.rb +244 -14
  22. data/lib/red_amber/data_frame_indexable.rb +121 -18
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +622 -66
  26. data/lib/red_amber/data_frame_variable_operation.rb +446 -34
  27. data/lib/red_amber/group.rb +187 -22
  28. data/lib/red_amber/helper.rb +70 -10
  29. data/lib/red_amber/refinements.rb +12 -5
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +385 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +217 -12
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,40 +1,133 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
6
  # Array, Arrow::Array and Arrow::ChunkedArray are refined
7
7
  using RefineArray
8
8
  using RefineArrayLike
9
9
 
10
+ # rubocop:disable Layout/LineLength
11
+
10
12
  # Select variables or records.
11
13
  #
12
14
  # @overload [](key)
13
- # select single variable and return as a Vetor.
15
+ # Select single variable (column) and return as a Vector.
14
16
  #
15
- # @param key [Symbol, String] key name to select.
16
- # @return [Vector] selected variable as a Vector.
17
+ # @param key [Symbol, String]
18
+ # key name to select.
19
+ # @return [Vector]
20
+ # selected variable as a Vector.
17
21
  # @note DataFrame.v(key) is faster to create Vector from a variable.
22
+ # @example Select a column and return Vector
23
+ # penguins
24
+ #
25
+ # # =>
26
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
27
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
28
+ # <string> <string> <double> <double> <uint8> ... <uint16>
29
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
30
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
31
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
32
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
33
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
34
+ # : : : : : : ... :
35
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
36
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
37
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
38
+ #
39
+ # penguins[:bill_length_mm]
40
+ #
41
+ # # =>
42
+ # #<RedAmber::Vector(:double, size=344):0x00000000000104dc>
43
+ # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
18
44
  #
19
45
  # @overload [](keys)
20
- # select variables and return a DataFrame.
46
+ # Select variables and return a DataFrame.
21
47
  #
22
48
  # @param keys [<Symbol, String>] key names to select.
23
- # @return [DataFrame] selected variables as a DataFrame.
49
+ # @return [DataFrame]
50
+ # selected variables as a DataFrame.
51
+ # @example Select columns
52
+ # penguins[:island, :bill_length_mm]
53
+ #
54
+ # # =>
55
+ # #<RedAmber::DataFrame : 344 x 2 Vectors, 0x00000000000104f0>
56
+ # island bill_length_mm
57
+ # <string> <double>
58
+ # 0 Torgersen 39.1
59
+ # 1 Torgersen 39.5
60
+ # 2 Torgersen 40.3
61
+ # 3 Torgersen (nil)
62
+ # 4 Torgersen 36.7
63
+ # : : :
64
+ # 341 Biscoe 50.4
65
+ # 342 Biscoe 45.2
66
+ # 343 Biscoe 49.9
24
67
  #
25
68
  # @overload [](index)
26
- # select records and return a DataFrame.
69
+ # Select a record and return a DataFrame.
27
70
  #
28
71
  # @param index [Integer, Float, Range<Integer>, Vector, Arrow::Array]
29
72
  # index of a row to select.
30
- # @return [DataFrame] selected variables as a DataFrame.
73
+ # @return [DataFrame]
74
+ # selected variables as a DataFrame.
75
+ # @example Select a row
76
+ # penguins[0]
77
+ #
78
+ # # =>
79
+ # #<RedAmber::DataFrame : 1 x 8 Vectors, 0x0000000000010504>
80
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
81
+ # <string> <string> <double> <double> <uint8> ... <uint16>
82
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
31
83
  #
32
84
  # @overload [](indices)
33
- # select records and return a DataFrame.
85
+ # Select records by indices and return a DataFrame.
34
86
  #
35
- # @param indices [<Indeger, Float, Range<Integer>, Vector, Arrow::Array>]
87
+ # @param indices [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
36
88
  # indices of rows to select.
37
- # @return [DataFrame] selected variables as a DataFrame.
89
+ # @return [DataFrame]
90
+ # selected variables as a DataFrame.
91
+ # @example Select rows by indices
92
+ # penguins[0..100]
93
+ #
94
+ # # =>
95
+ # #<RedAmber::DataFrame : 101 x 8 Vectors, 0x00000000000105e0>
96
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
97
+ # <string> <string> <double> <double> <uint8> ... <uint16>
98
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
99
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
100
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
101
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
102
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
103
+ # : : : : : : ... :
104
+ # 98 Adelie Dream 33.1 16.1 178 ... 2008
105
+ # 99 Adelie Dream 43.2 18.5 192 ... 2008
106
+ # 100 Adelie Biscoe 35.0 17.9 192 ... 2009
107
+ #
108
+ # @overload [](booleans)
109
+ # Select records by booleans and return a DataFrame.
110
+ #
111
+ # @param booleans [Array<true, false, nil>, Vector, Arrow::Array]
112
+ # booleans of rows to select.
113
+ # @return [DataFrame]
114
+ # selected variables as a DataFrame.
115
+ # @example Select rows by booleans
116
+ # penguins[penguins.species == 'Adelie']
117
+ #
118
+ # # =>
119
+ # #<RedAmber::DataFrame : 152 x 8 Vectors, 0x0000000000010658>
120
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
121
+ # <string> <string> <double> <double> <uint8> ... <uint16>
122
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
123
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
124
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
125
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
126
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
127
+ # : : : : : : ... :
128
+ # 149 Adelie Dream 37.8 18.1 193 ... 2009
129
+ # 150 Adelie Dream 36.0 17.1 187 ... 2009
130
+ # 151 Adelie Dream 41.5 18.5 201 ... 2009
38
131
  #
39
132
  def [](*args)
40
133
  raise DataFrameArgumentError, 'self is an empty dataframe' if empty?
@@ -52,10 +145,10 @@ module RedAmber
52
145
  arrow_array = aa
53
146
  else
54
147
  a = parse_args(args, size)
55
- return select_variables_by_keys(a) if a.symbols?
56
- return take(normalize_indices(Arrow::Array.new(a))) if a.integers?
148
+ return select_variables_by_keys(a) if a.symbol?
149
+ return take(normalize_indices(Arrow::Array.new(a))) if a.integer?
57
150
  return remove_all_values if a.compact.empty?
58
- return filter_by_array(Arrow::BooleanArray.new(a)) if a.booleans?
151
+ return filter_by_array(Arrow::BooleanArray.new(a)) if a.boolean?
59
152
 
60
153
  raise DataFrameArgumentError, "invalid arguments: #{args}"
61
154
  end
@@ -64,12 +157,25 @@ module RedAmber
64
157
  return filter_by_array(arrow_array) if arrow_array.boolean?
65
158
 
66
159
  a = arrow_array.to_a
67
- return select_variables_by_keys(a) if a.symbols_or_strings?
160
+ return select_variables_by_keys(a) if a.symbol_or_string?
68
161
 
69
162
  raise DataFrameArgumentError, "invalid arguments: #{args}"
70
163
  end
71
164
 
72
- # Select a variable by a key in String or Symbol
165
+ # Select a variable by String or Symbol and return as a Vector.
166
+ #
167
+ # @param key [Symbol, String]
168
+ # key name to select.
169
+ # @return [Vector]
170
+ # selected variable as a Vector.
171
+ # @note #v(key) is faster than #[](key).
172
+ # @example Select a column and return Vector
173
+ # penguins.v(:bill_length_mm)
174
+ #
175
+ # # =>
176
+ # #<RedAmber::Vector(:double, size=344):0x000000000000f140>
177
+ # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
178
+ #
73
179
  def v(key)
74
180
  unless key.is_a?(Symbol) || key.is_a?(String)
75
181
  raise DataFrameArgumentError, "Key is not a Symbol or a String: [#{key}]"
@@ -82,30 +188,168 @@ module RedAmber
82
188
  # Select records to create a DataFrame.
83
189
  #
84
190
  # @overload slice(row)
85
- # select a record and return a DataFrame.
191
+ # Select a record and return a DataFrame.
86
192
  #
87
- # @param row [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
88
- # a row index to select.
89
- # @yield [self] gives self to the block.
90
- # @note The block is evaluated within the context of self.
91
- # It is accessable to self's instance variables and private methods.
92
- # @yieldreturn [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
193
+ # @param row [Integer, Float]
93
194
  # a row index to select.
94
- # @return [DataFrame] selected variables as a DataFrame.
195
+ # @return [DataFrame]
196
+ # selected records as a DataFrame.
197
+ # @example Select a row
198
+ # penguins
199
+ #
200
+ # # =>
201
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
202
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
203
+ # <string> <string> <double> <double> <uint8> ... <uint16>
204
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
205
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
206
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
207
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
208
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
209
+ # : : : : : : ... :
210
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
211
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
212
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
213
+ # penguins.slice(2)
214
+ #
215
+ # # =>
216
+ # #<RedAmber::DataFrame : 1 x 8 Vectors, 0x00000000000039d0>
217
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
218
+ # <string> <string> <double> <double> <uint8> ... <uint16>
219
+ # 0 Adelie Torgersen 40.3 18.0 195 ... 2007
95
220
  #
96
221
  # @overload slice(rows)
97
- # select records and return a DataFrame.
222
+ # Select records and return a DataFrame.
98
223
  # - Duplicated selection is acceptable. The same record will be returned.
99
224
  # - The order of records will be the same as specified indices.
100
225
  #
101
- # @param rows [Integer, Float, Range<Integer>, Vector, Arrow::Array]
226
+ # @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
102
227
  # row indices to select.
103
- # @yield [self] gives self to the block.
104
- # @note The block is evaluated within the context of self.
105
- # It is accessable to self's instance variables and private methods.
106
- # @yieldreturn [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
228
+ # @return [DataFrame]
229
+ # selected records as a DataFrame.
230
+ # @example Select rows
231
+ # penguins.slice(300..-1)
232
+ #
233
+ # # =>
234
+ # #<RedAmber::DataFrame : 44 x 8 Vectors, 0x000000000000fb54>
235
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
236
+ # <string> <string> <double> <double> <uint8> ... <uint16>
237
+ # 0 Gentoo Biscoe 49.1 14.5 212 ... 2009
238
+ # 1 Gentoo Biscoe 52.5 15.6 221 ... 2009
239
+ # 2 Gentoo Biscoe 47.4 14.6 212 ... 2009
240
+ # 3 Gentoo Biscoe 50.0 15.9 224 ... 2009
241
+ # 4 Gentoo Biscoe 44.9 13.8 212 ... 2009
242
+ # : : : : : : ... :
243
+ # 41 Gentoo Biscoe 50.4 15.7 222 ... 2009
244
+ # 42 Gentoo Biscoe 45.2 14.8 212 ... 2009
245
+ # 43 Gentoo Biscoe 49.9 16.1 213 ... 2009
246
+ #
247
+ # @overload slice(enumerator)
248
+ # Select records and return a DataFrame.
249
+ # - Duplicated selection is acceptable. The same record will be returned.
250
+ # - The order of records will be the same as specified indices.
251
+ #
252
+ # @param enumerator [Enumerator]
253
+ # an enumerator which returns row indices to select.
254
+ # @return [DataFrame]
255
+ # selected records as a DataFrame.
256
+ # @example Select rows by Enumerator.
257
+ # penguins.assign_left(index: penguins.indices) # 0.2.0 feature
258
+ # .slice(0.step(by: 10, to: 340))
259
+ #
260
+ # # =>
261
+ # #<RedAmber::DataFrame : 35 x 9 Vectors, 0x000000000000f2e4>
262
+ # index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
263
+ # <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
264
+ # 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
265
+ # 1 10 Adelie Torgersen 37.8 17.1 186 ... 2007
266
+ # 2 20 Adelie Biscoe 37.8 18.3 174 ... 2007
267
+ # 3 30 Adelie Dream 39.5 16.7 178 ... 2007
268
+ # 4 40 Adelie Dream 36.5 18.0 182 ... 2007
269
+ # : : : : : : : ... :
270
+ # 32 320 Gentoo Biscoe 48.5 15.0 219 ... 2009
271
+ # 33 330 Gentoo Biscoe 50.5 15.2 216 ... 2009
272
+ # 34 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
273
+ #
274
+ # @overload slice
275
+ # Select records by indices with block and return a DataFrame.
276
+ # - Duplicated selection is acceptable. The same record will be returned.
277
+ # - The order of records will be the same as specified indices.
278
+ #
279
+ # @yieldparam self [DataFrame]
280
+ # gives self to the block.
281
+ # The block is evaluated within the context of self.
282
+ # @yieldreturn [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array, Enumerator]
107
283
  # row indices to select.
108
- # @return [DataFrame] selected variables as a DataFrame.
284
+ # @return [DataFrame]
285
+ # selected records as a DataFrame.
286
+ # @example Select rows by block
287
+ # penguins.assign_left(index: penguins.indices) # 0.2.0 feature
288
+ # .slice { 0.step(by: 100, to: 300).map { |i| i..(i+1) } }
289
+ #
290
+ # # =>
291
+ # #<RedAmber::DataFrame : 8 x 9 Vectors, 0x000000000000f3ac>
292
+ # index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
293
+ # <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
294
+ # 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
295
+ # 1 1 Adelie Torgersen 39.5 17.4 186 ... 2007
296
+ # 2 100 Adelie Biscoe 35.0 17.9 192 ... 2009
297
+ # 3 101 Adelie Biscoe 41.0 20.0 203 ... 2009
298
+ # 4 200 Chinstrap Dream 51.5 18.7 187 ... 2009
299
+ # 5 201 Chinstrap Dream 49.8 17.3 198 ... 2009
300
+ # 6 300 Gentoo Biscoe 49.1 14.5 212 ... 2009
301
+ # 7 301 Gentoo Biscoe 52.5 15.6 221 ... 2009
302
+ #
303
+ # @overload slice(booleans)
304
+ # Select records by filtering with booleans and return a DataFrame.
305
+ #
306
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
307
+ # a boolean filter.
308
+ # @return [DataFrame]
309
+ # filtered records as a DataFrame.
310
+ # @example Select rows by boolean filter
311
+ # penguins.slice(penguins[:bill_length_mm] > 50)
312
+ #
313
+ # # =>
314
+ # #<RedAmber::DataFrame : 52 x 8 Vectors, 0x000000000000fd98>
315
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
316
+ # <string> <string> <double> <double> <uint8> ... <uint16>
317
+ # 0 Chinstrap Dream 51.3 19.2 193 ... 2007
318
+ # 1 Chinstrap Dream 52.7 19.8 197 ... 2007
319
+ # 2 Chinstrap Dream 51.3 18.2 197 ... 2007
320
+ # 3 Chinstrap Dream 51.3 19.9 198 ... 2007
321
+ # 4 Chinstrap Dream 51.7 20.3 194 ... 2007
322
+ # : : : : : : ... :
323
+ # 49 Gentoo Biscoe 51.5 16.3 230 ... 2009
324
+ # 50 Gentoo Biscoe 55.1 16.0 230 ... 2009
325
+ # 51 Gentoo Biscoe 50.4 15.7 222 ... 2009
326
+ #
327
+ # @overload slice
328
+ # Select records by filtering with block and return a DataFrame.
329
+ #
330
+ # @yieldparam self [DataFrame]
331
+ # gives self to the block.
332
+ # The block is evaluated within the context of self.
333
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
334
+ # a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
335
+ # @return [DataFrame]
336
+ # filtered records as a DataFrame.
337
+ # @example Select rows by booleans from block
338
+ # penguins.slice { indices.map(&:even?) }
339
+ #
340
+ # # =>
341
+ # #<RedAmber::DataFrame : 172 x 8 Vectors, 0x000000000000ff78>
342
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
343
+ # <string> <string> <double> <double> <uint8> ... <uint16>
344
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
345
+ # 1 Adelie Torgersen 40.3 18.0 195 ... 2007
346
+ # 2 Adelie Torgersen 36.7 19.3 193 ... 2007
347
+ # 3 Adelie Torgersen 38.9 17.8 181 ... 2007
348
+ # 4 Adelie Torgersen 34.1 18.1 193 ... 2007
349
+ # : : : : : : ... :
350
+ # 169 Gentoo Biscoe 47.2 13.7 214 ... 2009
351
+ # 170 Gentoo Biscoe 46.8 14.3 215 ... 2009
352
+ # 171 Gentoo Biscoe 45.2 14.8 212 ... 2009
109
353
  #
110
354
  def slice(*args, &block)
111
355
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
@@ -142,6 +386,73 @@ module RedAmber
142
386
  end
143
387
  end
144
388
 
389
+ # Select records by a column specified by a key
390
+ # and corresponding record with a block.
391
+ #
392
+ # @overload slice_by(key)
393
+ # Select records by elements.
394
+ #
395
+ # @param key [Symbol, String]
396
+ # a key to select column.
397
+ # @param keep_key [true, false]
398
+ # preserve column specified by key in the result if true.
399
+ # @yieldparam self [DataFrame]
400
+ # gives self to the block.
401
+ # The block is evaluated within the context of self.
402
+ # @yieldreturn [<elements>]
403
+ # array of elements to select.
404
+ # @return [DataFrame]
405
+ # selected records as a DataFrame.
406
+ # @example Select records by elements
407
+ # df
408
+ #
409
+ # # =>
410
+ # #<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000069e60>
411
+ # index float string
412
+ # <uint8> <double> <string>
413
+ # 0 0 0.0 A
414
+ # 1 1 1.1 B
415
+ # 2 2 2.2 C
416
+ # 3 3 NaN D
417
+ # 4 (nil) (nil) (nil)
418
+ #
419
+ # df.slice_by(:string) { ["A", "C"] }
420
+ #
421
+ # # =>
422
+ # #<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000001b1ac>
423
+ # index float
424
+ # <uint8> <double>
425
+ # 0 0 0.0
426
+ # 1 2 2.2
427
+ #
428
+ # @overload slice_by(key)
429
+ # Select records by elements range.
430
+ #
431
+ # @param key [Symbol, String]
432
+ # a key to select column.
433
+ # @param keep_key [true, false]
434
+ # preserve column specified by key in the result if true.
435
+ # @yieldparam self [DataFrame]
436
+ # gives self to the block.
437
+ # The block is evaluated within the context of self.
438
+ # @yieldreturn [Range]
439
+ # specifies position of elements at the start and the end and
440
+ # select records between them.
441
+ # @return [DataFrame]
442
+ # selected records as a DataFrame.
443
+ # @example Select records by elements range
444
+ # df.slice_by(:string) { "A".."C" }
445
+ #
446
+ # # =>
447
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000069668>
448
+ # index float
449
+ # <uint8> <double>
450
+ # 0 0 0.0
451
+ # 1 1 1.1
452
+ # 2 2 2.2
453
+ #
454
+ # @since 0.2.1
455
+ #
145
456
  def slice_by(key, keep_key: false, &block)
146
457
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
147
458
  raise DataFrameArgumentError, 'No block given' unless block
@@ -183,33 +494,242 @@ module RedAmber
183
494
  keep_key ? taken : taken.drop(key)
184
495
  end
185
496
 
497
+ # Select records by filtering with booleans to create a DataFrame.
498
+ #
499
+ # @overload filter(booleans)
500
+ # Select records by filtering with booleans and return a DataFrame.
501
+ #
502
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
503
+ # a boolean filter.
504
+ # @return [DataFrame]
505
+ # filtered records as a DataFrame.
506
+ # @example Filter by boolean Vector
507
+ # penguins
508
+ #
509
+ # # =>
510
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
511
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
512
+ # <string> <string> <double> <double> <uint8> ... <uint16>
513
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
514
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
515
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
516
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
517
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
518
+ # : : : : : : ... :
519
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
520
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
521
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
522
+ #
523
+ # penguins.filter(penguins.bill_length_mm < 50)
524
+ #
525
+ # # =>
526
+ # #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101a8>
527
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
528
+ # <string> <string> <double> <double> <uint8> ... <uint16>
529
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
530
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
531
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
532
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
533
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
534
+ # : : : : : : ... :
535
+ # 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
536
+ # 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
537
+ # 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
538
+ #
539
+ # @overload filter
540
+ # Select records by filtering with block and return a DataFrame.
541
+ #
542
+ # @yieldparam self [DataFrame]
543
+ # gives self to the block.
544
+ # The block is evaluated within the context of self.
545
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
546
+ # a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
547
+ # @return [DataFrame]
548
+ # filtered records as a DataFrame.
549
+ # @example Filter by boolean Vector
550
+ # penguins.filter { bill_length_mm < 50 }
551
+ #
552
+ # # =>
553
+ # #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101bc>
554
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
555
+ # <string> <string> <double> <double> <uint8> ... <uint16>
556
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
557
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
558
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
559
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
560
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
561
+ # : : : : : : ... :
562
+ # 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
563
+ # 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
564
+ # 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
565
+ #
566
+ def filter(*booleans, &block)
567
+ booleans.flatten!
568
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
569
+
570
+ if block
571
+ unless booleans.empty?
572
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
573
+ end
574
+
575
+ booleans = [instance_eval(&block)]
576
+ end
577
+
578
+ case booleans
579
+ in [] | [[]]
580
+ return remove_all_values
581
+ in [Vector => v] if v.boolean?
582
+ filter_by_array(v.data)
583
+ in [Arrow::ChunkedArray => ca] if ca.boolean?
584
+ filter_by_array(ca)
585
+ in [Arrow::BooleanArray => b]
586
+ filter_by_array(b)
587
+ else
588
+ a = Arrow::Array.new(parse_args(booleans, size))
589
+ unless a.boolean?
590
+ raise DataFrameArgumentError, "not a boolean filter: #{booleans}"
591
+ end
592
+
593
+ filter_by_array(a)
594
+ end
595
+ end
596
+
186
597
  # Select records and remove them to create a remainer DataFrame.
187
598
  #
188
599
  # @overload remove(row)
189
- # select a record and remove it to create a remainer DataFrame.
600
+ # Select a record and remove it to create a remainer DataFrame.
190
601
  # - The order of records in self will be preserved.
191
602
  #
192
- # @param row [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
603
+ # @param row [Integer, Float]
193
604
  # a row index to remove.
194
- # @yield [self] gives self to the block.
195
- # @note The block is evaluated within the context of self.
196
- # It is accessable to self's instance variables and private methods.
197
- # @yieldreturn [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
198
- # a row index to remove.
199
- # @return [DataFrame] remainer variables as a DataFrame.
605
+ # @return [DataFrame]
606
+ # remainer variables as a DataFrame.
607
+ # @example Remove a row
608
+ # penguins.remove(-1)
609
+ #
610
+ # # =>
611
+ # #<RedAmber::DataFrame : 343 x 8 Vectors, 0x0000000000010310>
612
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
613
+ # <string> <string> <double> <double> <uint8> ... <uint16>
614
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
615
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
616
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
617
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
618
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
619
+ # : : : : : : ... :
620
+ # 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
621
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
622
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
200
623
  #
201
624
  # @overload remove(rows)
202
- # select records and remove them to create a remainer DataFrame.
625
+ # Select records and remove them to create a remainer DataFrame.
626
+ # - Duplicated selection is acceptable.
203
627
  # - The order of records in self will be preserved.
204
628
  #
205
- # @param rows [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
629
+ # @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
206
630
  # row indeces to remove.
207
- # @yield [self] gives self to the block.
208
- # @note The block is evaluated within the context of self.
209
- # It is accessable to self's instance variables and private methods.
210
- # @yieldreturn [<Indeger, Float, Range<Integer>, Vector, Arrow::Array>]
631
+ # @return [DataFrame]
632
+ # remainer variables as a DataFrame.
633
+ # @example Remove rows
634
+ # penguins.remove(100..200)
635
+ #
636
+ # # =>
637
+ # #<RedAmber::DataFrame : 243 x 8 Vectors, 0x0000000000010450>
638
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
639
+ # <string> <string> <double> <double> <uint8> ... <uint16>
640
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
641
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
642
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
643
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
644
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
645
+ # : : : : : : ... :
646
+ # 240 Gentoo Biscoe 50.4 15.7 222 ... 2009
647
+ # 241 Gentoo Biscoe 45.2 14.8 212 ... 2009
648
+ # 242 Gentoo Biscoe 49.9 16.1 213 ... 2009
649
+ #
650
+ # @overload remove
651
+ # Select records by indices from block
652
+ # and remove them to create a remainer DataFrame.
653
+ # - Duplicated selection is acceptable.
654
+ # - The order of records in self will be preserved.
655
+ #
656
+ # @yieldparam self [DataFrame]
657
+ # gives self to the block.
658
+ # The block is evaluated within the context of self.
659
+ # @yieldreturn [<Integer, Float>, Range<Integer>, Vector, Arrow::Array]
211
660
  # row indices to remove.
212
- # @return [DataFrame] remainer variables as a DataFrame.
661
+ # @return [DataFrame]
662
+ # remainer variables as a DataFrame.
663
+ # @example Remove rows by indices from block
664
+ # penguins.remove { 0.step(size, 10) }
665
+ #
666
+ # # =>
667
+ # #<RedAmber::DataFrame : 309 x 8 Vectors, 0x00000000000104c8>
668
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
669
+ # <string> <string> <double> <double> <uint8> ... <uint16>
670
+ # 0 Adelie Torgersen 39.5 17.4 186 ... 2007
671
+ # 1 Adelie Torgersen 40.3 18.0 195 ... 2007
672
+ # 2 Adelie Torgersen (nil) (nil) (nil) ... 2007
673
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
674
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
675
+ # : : : : : : ... :
676
+ # 306 Gentoo Biscoe 50.4 15.7 222 ... 2009
677
+ # 307 Gentoo Biscoe 45.2 14.8 212 ... 2009
678
+ # 308 Gentoo Biscoe 49.9 16.1 213 ... 2009
679
+ #
680
+ # @overload remove(booleans)
681
+ # Select records by filtering with booleans and remove them to create a remainer DataFrame.
682
+ # - The order of records in self will be preserved.
683
+ #
684
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
685
+ # a boolean filter to remove.
686
+ # @return [DataFrame]
687
+ # remainer records as a DataFrame.
688
+ # @example Remove rows by boolean filter
689
+ # penguins.remove(penguins.bill_length_mm.is_nil)
690
+ #
691
+ # # =>
692
+ # #<RedAmber::DataFrame : 342 x 8 Vectors, 0x0000000000010234>
693
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
694
+ # <string> <string> <double> <double> <uint8> ... <uint16>
695
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
696
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
697
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
698
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
699
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
700
+ # : : : : : : ... :
701
+ # 339 Gentoo Biscoe 50.4 15.7 222 ... 2009
702
+ # 340 Gentoo Biscoe 45.2 14.8 212 ... 2009
703
+ # 341 Gentoo Biscoe 49.9 16.1 213 ... 2009
704
+ #
705
+ # @overload remove
706
+ # Select records by booleans from block
707
+ # and remove them to create a remainer DataFrame.
708
+ # - The order of records in self will be preserved.
709
+ #
710
+ # @yieldparam self [DataFrame]
711
+ # gives self to the block.
712
+ # The block is evaluated within the context of self.
713
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
714
+ # a boolean filter to remove. `Vector` or `Arrow::Array` must be boolean type.
715
+ # @return [DataFrame]
716
+ # remainer records as a DataFrame.
717
+ # @example Remove rows by booleans from block
718
+ # penguins.remove { (species == 'Adelie') | (year == 2009) }
719
+ #
720
+ # # =>
721
+ # #<RedAmber::DataFrame : 124 x 8 Vectors, 0x00000000000102fc>
722
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
723
+ # <string> <string> <double> <double> <uint8> ... <uint16>
724
+ # 0 Chinstrap Dream 46.5 17.9 192 ... 2007
725
+ # 1 Chinstrap Dream 50.0 19.5 196 ... 2007
726
+ # 2 Chinstrap Dream 51.3 19.2 193 ... 2007
727
+ # 3 Chinstrap Dream 45.4 18.7 188 ... 2007
728
+ # 4 Chinstrap Dream 52.7 19.8 197 ... 2007
729
+ # : : : : : : ... :
730
+ # 121 Gentoo Biscoe 51.1 16.3 220 ... 2008
731
+ # 122 Gentoo Biscoe 45.2 13.8 215 ... 2008
732
+ # 123 Gentoo Biscoe 45.2 16.4 223 ... 2008
213
733
  #
214
734
  def remove(*args, &block)
215
735
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
@@ -249,57 +769,93 @@ module RedAmber
249
769
  end
250
770
  end
251
771
 
772
+ # Remove records (rows) containing any nil.
773
+ #
774
+ # @return [DataFrame]
775
+ # removed DataFrame.
776
+ # @example
777
+ # penguins.remove_nil
778
+ # # =>
779
+ # #<RedAmber::DataFrame : 333 x 8 Vectors, 0x00000000000039d0>
780
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
781
+ # <string> <string> <double> <double> <uint8> ... <uint16>
782
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
783
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
784
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
785
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
786
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
787
+ # : : : : : : ... :
788
+ # 330 Gentoo Biscoe 50.4 15.7 222 ... 2009
789
+ # 331 Gentoo Biscoe 45.2 14.8 212 ... 2009
790
+ # 332 Gentoo Biscoe 49.9 16.1 213 ... 2009
791
+ #
252
792
  def remove_nil
253
793
  func = Arrow::Function.find(:drop_null)
254
794
  DataFrame.create(func.execute([table]).value)
255
795
  end
256
796
  alias_method :drop_nil, :remove_nil
257
797
 
798
+ # Select records from the top.
799
+ #
800
+ # @param n_obs [Integer]
801
+ # number of records to select.
802
+ # @return [DataFrame]
803
+ #
258
804
  def head(n_obs = 5)
259
805
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
260
806
 
261
807
  self[0...[n_obs, size].min]
262
808
  end
263
809
 
810
+ # Select records from the end.
811
+ #
812
+ # @param n_obs [Integer]
813
+ # number of records to select.
814
+ # @return [DataFrame]
815
+ #
264
816
  def tail(n_obs = 5)
265
817
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
266
818
 
267
819
  self[-[n_obs, size].min..]
268
820
  end
269
821
 
822
+ # Select records from the top.
823
+ #
824
+ # @param n_obs [Integer]
825
+ # number of records to select.
826
+ # @return [DataFrame]
827
+ #
270
828
  def first(n_obs = 1)
271
829
  head(n_obs)
272
830
  end
273
831
 
832
+ # Select records from the end.
833
+ #
834
+ # @param n_obs [Integer]
835
+ # number of records to select.
836
+ # @return [DataFrame]
837
+ #
274
838
  def last(n_obs = 1)
275
839
  tail(n_obs)
276
840
  end
277
841
 
842
+ # Select records by index Array to create a DataFrame.
843
+ #
844
+ # - TODO: support for option `boundscheck: true`
845
+ # - Supports indices in an Arrow::UInt8, UInt16, UInt32, UInt64 or an Array
846
+ # - Negative index is not supported.
847
+ # @param index_array [<Integer>, Arrow::Array]
848
+ # row indeces to select.
849
+ # @return [DataFrame]
850
+ # selected variables as a DataFrame.
851
+ #
278
852
  # @api private
279
- # TODO: support for option `boundscheck: true`
280
- # Supports indices in an Arrow::UInt{8, 16, 32, 64} or an Array
281
- # Negative index is not supported.
853
+ #
282
854
  def take(index_array)
283
855
  DataFrame.create(@table.take(index_array))
284
856
  end
285
857
 
286
- # @api private
287
- # TODO: support for option `null_selection_behavior: :drop``
288
- def filter(*booleans)
289
- booleans.flatten!
290
- case booleans
291
- in []
292
- return remove_all_values
293
- in [Arrow::BooleanArray => b]
294
- filter_by_array(b)
295
- else
296
- unless booleans.booleans?
297
- raise DataFrameArgumentError, 'Argument is not a boolean.'
298
- end
299
-
300
- filter_by_array(Arrow::BooleanArray.new(booleans))
301
- end
302
- end
858
+ # rubocop:enable Layout/LineLength
303
859
 
304
860
  private
305
861