red_amber 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +56 -22
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +29 -30
  8. data/benchmark/basic.yml +7 -7
  9. data/benchmark/combine.yml +3 -3
  10. data/benchmark/dataframe.yml +15 -9
  11. data/benchmark/group.yml +6 -6
  12. data/benchmark/reshape.yml +6 -6
  13. data/benchmark/vector.yml +6 -3
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +454 -85
  20. data/lib/red_amber/data_frame_combinable.rb +609 -115
  21. data/lib/red_amber/data_frame_displayable.rb +313 -34
  22. data/lib/red_amber/data_frame_indexable.rb +122 -19
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +623 -70
  26. data/lib/red_amber/data_frame_variable_operation.rb +452 -35
  27. data/lib/red_amber/group.rb +186 -22
  28. data/lib/red_amber/helper.rb +74 -14
  29. data/lib/red_amber/refinements.rb +26 -6
  30. data/lib/red_amber/subframes.rb +1101 -0
  31. data/lib/red_amber/vector.rb +362 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +506 -0
  34. data/lib/red_amber/vector_selectable.rb +265 -23
  35. data/lib/red_amber/vector_unary_element_wise.rb +529 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -1,40 +1,133 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-in for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
6
  # Array, Arrow::Array and Arrow::ChunkedArray are refined
7
7
  using RefineArray
8
8
  using RefineArrayLike
9
9
 
10
+ # rubocop:disable Layout/LineLength
11
+
10
12
  # Select variables or records.
11
13
  #
12
14
  # @overload [](key)
13
- # select single variable and return as a Vetor.
15
+ # Select single variable (column) and return as a Vector.
14
16
  #
15
- # @param key [Symbol, String] key name to select.
16
- # @return [Vector] selected variable as a Vector.
17
+ # @param key [Symbol, String]
18
+ # key name to select.
19
+ # @return [Vector]
20
+ # selected variable as a Vector.
17
21
  # @note DataFrame.v(key) is faster to create Vector from a variable.
22
+ # @example Select a column and return Vector
23
+ # penguins
24
+ #
25
+ # # =>
26
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
27
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
28
+ # <string> <string> <double> <double> <uint8> ... <uint16>
29
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
30
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
31
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
32
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
33
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
34
+ # : : : : : : ... :
35
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
36
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
37
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
38
+ #
39
+ # penguins[:bill_length_mm]
40
+ #
41
+ # # =>
42
+ # #<RedAmber::Vector(:double, size=344):0x00000000000104dc>
43
+ # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
18
44
  #
19
45
  # @overload [](keys)
20
- # select variables and return a DataFrame.
46
+ # Select variables and return a DataFrame.
21
47
  #
22
48
  # @param keys [<Symbol, String>] key names to select.
23
- # @return [DataFrame] selected variables as a DataFrame.
49
+ # @return [DataFrame]
50
+ # selected variables as a DataFrame.
51
+ # @example Select columns
52
+ # penguins[:island, :bill_length_mm]
53
+ #
54
+ # # =>
55
+ # #<RedAmber::DataFrame : 344 x 2 Vectors, 0x00000000000104f0>
56
+ # island bill_length_mm
57
+ # <string> <double>
58
+ # 0 Torgersen 39.1
59
+ # 1 Torgersen 39.5
60
+ # 2 Torgersen 40.3
61
+ # 3 Torgersen (nil)
62
+ # 4 Torgersen 36.7
63
+ # : : :
64
+ # 341 Biscoe 50.4
65
+ # 342 Biscoe 45.2
66
+ # 343 Biscoe 49.9
24
67
  #
25
68
  # @overload [](index)
26
- # select records and return a DataFrame.
69
+ # Select a record and return a DataFrame.
27
70
  #
28
71
  # @param index [Integer, Float, Range<Integer>, Vector, Arrow::Array]
29
72
  # index of a row to select.
30
- # @return [DataFrame] selected variables as a DataFrame.
73
+ # @return [DataFrame]
74
+ # selected variables as a DataFrame.
75
+ # @example Select a row
76
+ # penguins[0]
77
+ #
78
+ # # =>
79
+ # #<RedAmber::DataFrame : 1 x 8 Vectors, 0x0000000000010504>
80
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
81
+ # <string> <string> <double> <double> <uint8> ... <uint16>
82
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
31
83
  #
32
84
  # @overload [](indices)
33
- # select records and return a DataFrame.
85
+ # Select records by indices and return a DataFrame.
34
86
  #
35
- # @param indices [<Indeger, Float, Range<Integer>, Vector, Arrow::Array>]
87
+ # @param indices [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
36
88
  # indices of rows to select.
37
- # @return [DataFrame] selected variables as a DataFrame.
89
+ # @return [DataFrame]
90
+ # selected variables as a DataFrame.
91
+ # @example Select rows by indices
92
+ # penguins[0..100]
93
+ #
94
+ # # =>
95
+ # #<RedAmber::DataFrame : 101 x 8 Vectors, 0x00000000000105e0>
96
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
97
+ # <string> <string> <double> <double> <uint8> ... <uint16>
98
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
99
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
100
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
101
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
102
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
103
+ # : : : : : : ... :
104
+ # 98 Adelie Dream 33.1 16.1 178 ... 2008
105
+ # 99 Adelie Dream 43.2 18.5 192 ... 2008
106
+ # 100 Adelie Biscoe 35.0 17.9 192 ... 2009
107
+ #
108
+ # @overload [](booleans)
109
+ # Select records by booleans and return a DataFrame.
110
+ #
111
+ # @param booleans [Array<true, false, nil>, Vector, Arrow::Array]
112
+ # booleans of rows to select.
113
+ # @return [DataFrame]
114
+ # selected variables as a DataFrame.
115
+ # @example Select rows by booleans
116
+ # penguins[penguins.species == 'Adelie']
117
+ #
118
+ # # =>
119
+ # #<RedAmber::DataFrame : 152 x 8 Vectors, 0x0000000000010658>
120
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
121
+ # <string> <string> <double> <double> <uint8> ... <uint16>
122
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
123
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
124
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
125
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
126
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
127
+ # : : : : : : ... :
128
+ # 149 Adelie Dream 37.8 18.1 193 ... 2009
129
+ # 150 Adelie Dream 36.0 17.1 187 ... 2009
130
+ # 151 Adelie Dream 41.5 18.5 201 ... 2009
38
131
  #
39
132
  def [](*args)
40
133
  raise DataFrameArgumentError, 'self is an empty dataframe' if empty?
@@ -52,10 +145,10 @@ module RedAmber
52
145
  arrow_array = aa
53
146
  else
54
147
  a = parse_args(args, size)
55
- return select_variables_by_keys(a) if a.symbols?
56
- return take(normalize_indices(Arrow::Array.new(a))) if a.integers?
148
+ return select_variables_by_keys(a) if a.symbol?
149
+ return take(normalize_indices(Arrow::Array.new(a))) if a.integer?
57
150
  return remove_all_values if a.compact.empty?
58
- return filter_by_array(Arrow::BooleanArray.new(a)) if a.booleans?
151
+ return filter_by_array(Arrow::BooleanArray.new(a)) if a.boolean?
59
152
 
60
153
  raise DataFrameArgumentError, "invalid arguments: #{args}"
61
154
  end
@@ -64,17 +157,27 @@ module RedAmber
64
157
  return filter_by_array(arrow_array) if arrow_array.boolean?
65
158
 
66
159
  a = arrow_array.to_a
67
- return select_variables_by_keys(a) if a.symbols_or_strings?
160
+ return select_variables_by_keys(a) if a.symbol_or_string?
68
161
 
69
162
  raise DataFrameArgumentError, "invalid arguments: #{args}"
70
163
  end
71
164
 
72
- # Select a variable by a key in String or Symbol
165
+ # Select a variable by String or Symbol and return as a Vector.
166
+ #
167
+ # @param key [Symbol, String]
168
+ # key name to select.
169
+ # @return [Vector]
170
+ # selected variable as a Vector.
171
+ # @note #v(key) is faster than #[](key).
172
+ # @example Select a column and return Vector
173
+ # penguins.v(:bill_length_mm)
174
+ #
175
+ # # =>
176
+ # #<RedAmber::Vector(:double, size=344):0x000000000000f140>
177
+ # [39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
178
+ #
73
179
  def v(key)
74
- unless key.is_a?(Symbol) || key.is_a?(String)
75
- raise DataFrameArgumentError, "Key is not a Symbol or a String: [#{key}]"
76
- end
77
- raise DataFrameArgumentError, "Key does not exist: [#{key}]" unless key? key
180
+ raise DataFrameArgumentError, "Key does not exist: [#{key}]" unless key?(key)
78
181
 
79
182
  variables[key.to_sym]
80
183
  end
@@ -82,30 +185,168 @@ module RedAmber
82
185
  # Select records to create a DataFrame.
83
186
  #
84
187
  # @overload slice(row)
85
- # select a record and return a DataFrame.
188
+ # Select a record and return a DataFrame.
86
189
  #
87
- # @param row [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
190
+ # @param row [Integer, Float]
88
191
  # a row index to select.
89
- # @yield [self] gives self to the block.
90
- # @note The block is evaluated within the context of self.
91
- # It is accessable to self's instance variables and private methods.
92
- # @yieldreturn [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
93
- # a row index to select.
94
- # @return [DataFrame] selected variables as a DataFrame.
192
+ # @return [DataFrame]
193
+ # selected records as a DataFrame.
194
+ # @example Select a row
195
+ # penguins
196
+ #
197
+ # # =>
198
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
199
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
200
+ # <string> <string> <double> <double> <uint8> ... <uint16>
201
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
202
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
203
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
204
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
205
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
206
+ # : : : : : : ... :
207
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
208
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
209
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
210
+ # penguins.slice(2)
211
+ #
212
+ # # =>
213
+ # #<RedAmber::DataFrame : 1 x 8 Vectors, 0x00000000000039d0>
214
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
215
+ # <string> <string> <double> <double> <uint8> ... <uint16>
216
+ # 0 Adelie Torgersen 40.3 18.0 195 ... 2007
95
217
  #
96
218
  # @overload slice(rows)
97
- # select records and return a DataFrame.
219
+ # Select records and return a DataFrame.
98
220
  # - Duplicated selection is acceptable. The same record will be returned.
99
221
  # - The order of records will be the same as specified indices.
100
222
  #
101
- # @param rows [Integer, Float, Range<Integer>, Vector, Arrow::Array]
223
+ # @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
102
224
  # row indices to select.
103
- # @yield [self] gives self to the block.
104
- # @note The block is evaluated within the context of self.
105
- # It is accessable to self's instance variables and private methods.
106
- # @yieldreturn [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
225
+ # @return [DataFrame]
226
+ # selected records as a DataFrame.
227
+ # @example Select rows
228
+ # penguins.slice(300..-1)
229
+ #
230
+ # # =>
231
+ # #<RedAmber::DataFrame : 44 x 8 Vectors, 0x000000000000fb54>
232
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
233
+ # <string> <string> <double> <double> <uint8> ... <uint16>
234
+ # 0 Gentoo Biscoe 49.1 14.5 212 ... 2009
235
+ # 1 Gentoo Biscoe 52.5 15.6 221 ... 2009
236
+ # 2 Gentoo Biscoe 47.4 14.6 212 ... 2009
237
+ # 3 Gentoo Biscoe 50.0 15.9 224 ... 2009
238
+ # 4 Gentoo Biscoe 44.9 13.8 212 ... 2009
239
+ # : : : : : : ... :
240
+ # 41 Gentoo Biscoe 50.4 15.7 222 ... 2009
241
+ # 42 Gentoo Biscoe 45.2 14.8 212 ... 2009
242
+ # 43 Gentoo Biscoe 49.9 16.1 213 ... 2009
243
+ #
244
+ # @overload slice(enumerator)
245
+ # Select records and return a DataFrame.
246
+ # - Duplicated selection is acceptable. The same record will be returned.
247
+ # - The order of records will be the same as specified indices.
248
+ #
249
+ # @param enumerator [Enumerator]
250
+ # an enumerator which returns row indices to select.
251
+ # @return [DataFrame]
252
+ # selected records as a DataFrame.
253
+ # @example Select rows by Enumerator.
254
+ # penguins.assign_left(index: penguins.indices) # 0.2.0 feature
255
+ # .slice(0.step(by: 10, to: 340))
256
+ #
257
+ # # =>
258
+ # #<RedAmber::DataFrame : 35 x 9 Vectors, 0x000000000000f2e4>
259
+ # index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
260
+ # <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
261
+ # 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
262
+ # 1 10 Adelie Torgersen 37.8 17.1 186 ... 2007
263
+ # 2 20 Adelie Biscoe 37.8 18.3 174 ... 2007
264
+ # 3 30 Adelie Dream 39.5 16.7 178 ... 2007
265
+ # 4 40 Adelie Dream 36.5 18.0 182 ... 2007
266
+ # : : : : : : : ... :
267
+ # 32 320 Gentoo Biscoe 48.5 15.0 219 ... 2009
268
+ # 33 330 Gentoo Biscoe 50.5 15.2 216 ... 2009
269
+ # 34 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
270
+ #
271
+ # @overload slice
272
+ # Select records by indices with block and return a DataFrame.
273
+ # - Duplicated selection is acceptable. The same record will be returned.
274
+ # - The order of records will be the same as specified indices.
275
+ #
276
+ # @yieldparam self [DataFrame]
277
+ # gives self to the block.
278
+ # The block is evaluated within the context of self.
279
+ # @yieldreturn [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array, Enumerator]
107
280
  # row indices to select.
108
- # @return [DataFrame] selected variables as a DataFrame.
281
+ # @return [DataFrame]
282
+ # selected records as a DataFrame.
283
+ # @example Select rows by block
284
+ # penguins.assign_left(index: penguins.indices) # 0.2.0 feature
285
+ # .slice { 0.step(by: 100, to: 300).map { |i| i..(i+1) } }
286
+ #
287
+ # # =>
288
+ # #<RedAmber::DataFrame : 8 x 9 Vectors, 0x000000000000f3ac>
289
+ # index species island bill_length_mm bill_depth_mm flipper_length_mm ... year
290
+ # <uint16> <string> <string> <double> <double> <uint8> ... <uint16>
291
+ # 0 0 Adelie Torgersen 39.1 18.7 181 ... 2007
292
+ # 1 1 Adelie Torgersen 39.5 17.4 186 ... 2007
293
+ # 2 100 Adelie Biscoe 35.0 17.9 192 ... 2009
294
+ # 3 101 Adelie Biscoe 41.0 20.0 203 ... 2009
295
+ # 4 200 Chinstrap Dream 51.5 18.7 187 ... 2009
296
+ # 5 201 Chinstrap Dream 49.8 17.3 198 ... 2009
297
+ # 6 300 Gentoo Biscoe 49.1 14.5 212 ... 2009
298
+ # 7 301 Gentoo Biscoe 52.5 15.6 221 ... 2009
299
+ #
300
+ # @overload slice(booleans)
301
+ # Select records by filtering with booleans and return a DataFrame.
302
+ #
303
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
304
+ # a boolean filter.
305
+ # @return [DataFrame]
306
+ # filtered records as a DataFrame.
307
+ # @example Select rows by boolean filter
308
+ # penguins.slice(penguins[:bill_length_mm] > 50)
309
+ #
310
+ # # =>
311
+ # #<RedAmber::DataFrame : 52 x 8 Vectors, 0x000000000000fd98>
312
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
313
+ # <string> <string> <double> <double> <uint8> ... <uint16>
314
+ # 0 Chinstrap Dream 51.3 19.2 193 ... 2007
315
+ # 1 Chinstrap Dream 52.7 19.8 197 ... 2007
316
+ # 2 Chinstrap Dream 51.3 18.2 197 ... 2007
317
+ # 3 Chinstrap Dream 51.3 19.9 198 ... 2007
318
+ # 4 Chinstrap Dream 51.7 20.3 194 ... 2007
319
+ # : : : : : : ... :
320
+ # 49 Gentoo Biscoe 51.5 16.3 230 ... 2009
321
+ # 50 Gentoo Biscoe 55.1 16.0 230 ... 2009
322
+ # 51 Gentoo Biscoe 50.4 15.7 222 ... 2009
323
+ #
324
+ # @overload slice
325
+ # Select records by filtering with block and return a DataFrame.
326
+ #
327
+ # @yieldparam self [DataFrame]
328
+ # gives self to the block.
329
+ # The block is evaluated within the context of self.
330
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
331
+ # a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
332
+ # @return [DataFrame]
333
+ # filtered records as a DataFrame.
334
+ # @example Select rows by booleans from block
335
+ # penguins.slice { indices.map(&:even?) }
336
+ #
337
+ # # =>
338
+ # #<RedAmber::DataFrame : 172 x 8 Vectors, 0x000000000000ff78>
339
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
340
+ # <string> <string> <double> <double> <uint8> ... <uint16>
341
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
342
+ # 1 Adelie Torgersen 40.3 18.0 195 ... 2007
343
+ # 2 Adelie Torgersen 36.7 19.3 193 ... 2007
344
+ # 3 Adelie Torgersen 38.9 17.8 181 ... 2007
345
+ # 4 Adelie Torgersen 34.1 18.1 193 ... 2007
346
+ # : : : : : : ... :
347
+ # 169 Gentoo Biscoe 47.2 13.7 214 ... 2009
348
+ # 170 Gentoo Biscoe 46.8 14.3 215 ... 2009
349
+ # 171 Gentoo Biscoe 45.2 14.8 212 ... 2009
109
350
  #
110
351
  def slice(*args, &block)
111
352
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
@@ -142,6 +383,73 @@ module RedAmber
142
383
  end
143
384
  end
144
385
 
386
+ # Select records by a column specified by a key
387
+ # and corresponding record with a block.
388
+ #
389
+ # @overload slice_by(key)
390
+ # Select records by elements.
391
+ #
392
+ # @param key [Symbol, String]
393
+ # a key to select column.
394
+ # @param keep_key [true, false]
395
+ # preserve column specified by key in the result if true.
396
+ # @yieldparam self [DataFrame]
397
+ # gives self to the block.
398
+ # The block is evaluated within the context of self.
399
+ # @yieldreturn [<elements>]
400
+ # array of elements to select.
401
+ # @return [DataFrame]
402
+ # selected records as a DataFrame.
403
+ # @example Select records by elements
404
+ # df
405
+ #
406
+ # # =>
407
+ # #<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000069e60>
408
+ # index float string
409
+ # <uint8> <double> <string>
410
+ # 0 0 0.0 A
411
+ # 1 1 1.1 B
412
+ # 2 2 2.2 C
413
+ # 3 3 NaN D
414
+ # 4 (nil) (nil) (nil)
415
+ #
416
+ # df.slice_by(:string) { ["A", "C"] }
417
+ #
418
+ # # =>
419
+ # #<RedAmber::DataFrame : 2 x 2 Vectors, 0x000000000001b1ac>
420
+ # index float
421
+ # <uint8> <double>
422
+ # 0 0 0.0
423
+ # 1 2 2.2
424
+ #
425
+ # @overload slice_by(key)
426
+ # Select records by elements range.
427
+ #
428
+ # @param key [Symbol, String]
429
+ # a key to select column.
430
+ # @param keep_key [true, false]
431
+ # preserve column specified by key in the result if true.
432
+ # @yieldparam self [DataFrame]
433
+ # gives self to the block.
434
+ # The block is evaluated within the context of self.
435
+ # @yieldreturn [Range]
436
+ # specifies position of elements at the start and the end and
437
+ # select records between them.
438
+ # @return [DataFrame]
439
+ # selected records as a DataFrame.
440
+ # @example Select records by elements range
441
+ # df.slice_by(:string) { "A".."C" }
442
+ #
443
+ # # =>
444
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000069668>
445
+ # index float
446
+ # <uint8> <double>
447
+ # 0 0 0.0
448
+ # 1 1 1.1
449
+ # 2 2 2.2
450
+ #
451
+ # @since 0.2.1
452
+ #
145
453
  def slice_by(key, keep_key: false, &block)
146
454
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
147
455
  raise DataFrameArgumentError, 'No block given' unless block
@@ -183,33 +491,242 @@ module RedAmber
183
491
  keep_key ? taken : taken.drop(key)
184
492
  end
185
493
 
494
+ # Select records by filtering with booleans to create a DataFrame.
495
+ #
496
+ # @overload filter(booleans)
497
+ # Select records by filtering with booleans and return a DataFrame.
498
+ #
499
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
500
+ # a boolean filter.
501
+ # @return [DataFrame]
502
+ # filtered records as a DataFrame.
503
+ # @example Filter by boolean Vector
504
+ # penguins
505
+ #
506
+ # # =>
507
+ # #<RedAmber::DataFrame : 344 x 8 Vectors, 0x00000000000039bc>
508
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
509
+ # <string> <string> <double> <double> <uint8> ... <uint16>
510
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
511
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
512
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
513
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
514
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
515
+ # : : : : : : ... :
516
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
517
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
518
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
519
+ #
520
+ # penguins.filter(penguins.bill_length_mm < 50)
521
+ #
522
+ # # =>
523
+ # #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101a8>
524
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
525
+ # <string> <string> <double> <double> <uint8> ... <uint16>
526
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
527
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
528
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
529
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
530
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
531
+ # : : : : : : ... :
532
+ # 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
533
+ # 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
534
+ # 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
535
+ #
536
+ # @overload filter
537
+ # Select records by filtering with block and return a DataFrame.
538
+ #
539
+ # @yieldparam self [DataFrame]
540
+ # gives self to the block.
541
+ # The block is evaluated within the context of self.
542
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
543
+ # a boolean filter. `Vector` or `Arrow::Array` must be boolean type.
544
+ # @return [DataFrame]
545
+ # filtered records as a DataFrame.
546
+ # @example Filter by a block returning a boolean Vector
547
+ # penguins.filter { bill_length_mm < 50 }
548
+ #
549
+ # # =>
550
+ # #<RedAmber::DataFrame : 285 x 8 Vectors, 0x00000000000101bc>
551
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
552
+ # <string> <string> <double> <double> <uint8> ... <uint16>
553
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
554
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
555
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
556
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
557
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
558
+ # : : : : : : ... :
559
+ # 282 Gentoo Biscoe 46.8 14.3 215 ... 2009
560
+ # 283 Gentoo Biscoe 45.2 14.8 212 ... 2009
561
+ # 284 Gentoo Biscoe 49.9 16.1 213 ... 2009
562
+ #
563
+ def filter(*booleans, &block)
564
+ booleans.flatten!
565
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
566
+
567
+ if block
568
+ unless booleans.empty?
569
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
570
+ end
571
+
572
+ booleans = [instance_eval(&block)]
573
+ end
574
+
575
+ case booleans
576
+ in [] | [[]]
577
+ return remove_all_values
578
+ in [Vector => v] if v.boolean?
579
+ filter_by_array(v.data)
580
+ in [Arrow::ChunkedArray => ca] if ca.boolean?
581
+ filter_by_array(ca)
582
+ in [Arrow::BooleanArray => b]
583
+ filter_by_array(b)
584
+ else
585
+ a = Arrow::Array.new(parse_args(booleans, size))
586
+ unless a.boolean?
587
+ raise DataFrameArgumentError, "not a boolean filter: #{booleans}"
588
+ end
589
+
590
+ filter_by_array(a)
591
+ end
592
+ end
593
+
186
594
  # Select records and remove them to create a remainer DataFrame.
187
595
  #
188
596
  # @overload remove(row)
189
- # select a record and remove it to create a remainer DataFrame.
597
+ # Select a record and remove it to create a remainer DataFrame.
190
598
  # - The order of records in self will be preserved.
191
599
  #
192
- # @param row [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
600
+ # @param row [Integer, Float]
193
601
  # a row index to remove.
194
- # @yield [self] gives self to the block.
195
- # @note The block is evaluated within the context of self.
196
- # It is accessable to self's instance variables and private methods.
197
- # @yieldreturn [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
198
- # a row index to remove.
199
- # @return [DataFrame] remainer variables as a DataFrame.
602
+ # @return [DataFrame]
603
+ # remainer variables as a DataFrame.
604
+ # @example Remove a row
605
+ # penguins.remove(-1)
606
+ #
607
+ # # =>
608
+ # #<RedAmber::DataFrame : 343 x 8 Vectors, 0x0000000000010310>
609
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
610
+ # <string> <string> <double> <double> <uint8> ... <uint16>
611
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
612
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
613
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
614
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
615
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
616
+ # : : : : : : ... :
617
+ # 340 Gentoo Biscoe 46.8 14.3 215 ... 2009
618
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
619
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
200
620
  #
201
621
  # @overload remove(rows)
202
- # select records and remove them to create a remainer DataFrame.
622
+ # Select records and remove them to create a remainer DataFrame.
623
+ # - Duplicated selection is acceptable.
203
624
  # - The order of records in self will be preserved.
204
625
  #
205
- # @param rows [Indeger, Float, Range<Integer>, Vector, Arrow::Array]
626
+ # @param rows [<Integer>, <Float>, Range<Integer>, Vector, Arrow::Array]
206
627
  # row indices to remove.
207
- # @yield [self] gives self to the block.
208
- # @note The block is evaluated within the context of self.
209
- # It is accessable to self's instance variables and private methods.
210
- # @yieldreturn [<Indeger, Float, Range<Integer>, Vector, Arrow::Array>]
628
+ # @return [DataFrame]
629
+ # remainer variables as a DataFrame.
630
+ # @example Remove rows
631
+ # penguins.remove(100..200)
632
+ #
633
+ # # =>
634
+ # #<RedAmber::DataFrame : 243 x 8 Vectors, 0x0000000000010450>
635
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
636
+ # <string> <string> <double> <double> <uint8> ... <uint16>
637
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
638
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
639
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
640
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
641
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
642
+ # : : : : : : ... :
643
+ # 240 Gentoo Biscoe 50.4 15.7 222 ... 2009
644
+ # 241 Gentoo Biscoe 45.2 14.8 212 ... 2009
645
+ # 242 Gentoo Biscoe 49.9 16.1 213 ... 2009
646
+ #
647
+ # @overload remove
648
+ # Select records by indices from block
649
+ # and remove them to create a remainer DataFrame.
650
+ # - Duplicated selection is acceptable.
651
+ # - The order of records in self will be preserved.
652
+ #
653
+ # @yieldparam self [DataFrame]
654
+ # gives self to the block.
655
+ # The block is evaluated within the context of self.
656
+ # @yieldreturn [<Integer, Float>, Range<Integer>, Vector, Arrow::Array]
211
657
  # row indices to remove.
212
- # @return [DataFrame] remainer variables as a DataFrame.
658
+ # @return [DataFrame]
659
+ # remainer variables as a DataFrame.
660
+ # @example Remove rows by indices from block
661
+ # penguins.remove { 0.step(size, 10) }
662
+ #
663
+ # # =>
664
+ # #<RedAmber::DataFrame : 309 x 8 Vectors, 0x00000000000104c8>
665
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
666
+ # <string> <string> <double> <double> <uint8> ... <uint16>
667
+ # 0 Adelie Torgersen 39.5 17.4 186 ... 2007
668
+ # 1 Adelie Torgersen 40.3 18.0 195 ... 2007
669
+ # 2 Adelie Torgersen (nil) (nil) (nil) ... 2007
670
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
671
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
672
+ # : : : : : : ... :
673
+ # 306 Gentoo Biscoe 50.4 15.7 222 ... 2009
674
+ # 307 Gentoo Biscoe 45.2 14.8 212 ... 2009
675
+ # 308 Gentoo Biscoe 49.9 16.1 213 ... 2009
676
+ #
677
+ # @overload remove(booleans)
678
+ # Select records by filtering with booleans and remove them to create a remainer DataFrame.
679
+ # - The order of records in self will be preserved.
680
+ #
681
+ # @param booleans [<Boolean, nil>, Vector, Arrow::Array]
682
+ # a boolean filter to remove.
683
+ # @return [DataFrame]
684
+ # remainer records as a DataFrame.
685
+ # @example Remove rows by boolean filter
686
+ # penguins.remove(penguins.bill_length_mm.is_nil)
687
+ #
688
+ # # =>
689
+ # #<RedAmber::DataFrame : 342 x 8 Vectors, 0x0000000000010234>
690
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
691
+ # <string> <string> <double> <double> <uint8> ... <uint16>
692
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
693
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
694
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
695
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
696
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
697
+ # : : : : : : ... :
698
+ # 339 Gentoo Biscoe 50.4 15.7 222 ... 2009
699
+ # 340 Gentoo Biscoe 45.2 14.8 212 ... 2009
700
+ # 341 Gentoo Biscoe 49.9 16.1 213 ... 2009
701
+ #
702
+ # @overload remove
703
+ # Select records by booleans from block
704
+ # and remove them to create a remainer DataFrame.
705
+ # - The order of records in self will be preserved.
706
+ #
707
+ # @yieldparam self [DataFrame]
708
+ # gives self to the block.
709
+ # The block is evaluated within the context of self.
710
+ # @yieldreturn [<Boolean, nil>, Vector, Arrow::Array]
711
+ # a boolean filter to remove. `Vector` or `Arrow::Array` must be boolean type.
712
+ # @return [DataFrame]
713
+ # remainer records as a DataFrame.
714
+ # @example Remove rows by booleans from block
715
+ # penguins.remove { (species == 'Adelie') | (year == 2009) }
716
+ #
717
+ # # =>
718
+ # #<RedAmber::DataFrame : 124 x 8 Vectors, 0x00000000000102fc>
719
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
720
+ # <string> <string> <double> <double> <uint8> ... <uint16>
721
+ # 0 Chinstrap Dream 46.5 17.9 192 ... 2007
722
+ # 1 Chinstrap Dream 50.0 19.5 196 ... 2007
723
+ # 2 Chinstrap Dream 51.3 19.2 193 ... 2007
724
+ # 3 Chinstrap Dream 45.4 18.7 188 ... 2007
725
+ # 4 Chinstrap Dream 52.7 19.8 197 ... 2007
726
+ # : : : : : : ... :
727
+ # 121 Gentoo Biscoe 51.1 16.3 220 ... 2008
728
+ # 122 Gentoo Biscoe 45.2 13.8 215 ... 2008
729
+ # 123 Gentoo Biscoe 45.2 16.4 223 ... 2008
213
730
  #
214
731
  def remove(*args, &block)
215
732
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
@@ -249,57 +766,93 @@ module RedAmber
249
766
  end
250
767
  end
251
768
 
769
+ # Remove records (rows) containing any nil.
770
+ #
771
+ # @return [DataFrame]
772
+ # removed DataFrame.
773
+ # @example
774
+ # penguins.remove_nil
775
+ # # =>
776
+ # #<RedAmber::DataFrame : 333 x 8 Vectors, 0x00000000000039d0>
777
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
778
+ # <string> <string> <double> <double> <uint8> ... <uint16>
779
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
780
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
781
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
782
+ # 3 Adelie Torgersen 36.7 19.3 193 ... 2007
783
+ # 4 Adelie Torgersen 39.3 20.6 190 ... 2007
784
+ # : : : : : : ... :
785
+ # 330 Gentoo Biscoe 50.4 15.7 222 ... 2009
786
+ # 331 Gentoo Biscoe 45.2 14.8 212 ... 2009
787
+ # 332 Gentoo Biscoe 49.9 16.1 213 ... 2009
788
+ #
252
789
  def remove_nil
253
790
  func = Arrow::Function.find(:drop_null)
254
791
  DataFrame.create(func.execute([table]).value)
255
792
  end
256
793
  alias_method :drop_nil, :remove_nil
257
794
 
795
+ # Select records from the top.
796
+ #
797
+ # @param n_obs [Integer]
798
+ # number of records to select.
799
+ # @return [DataFrame]
800
+ #
258
801
  def head(n_obs = 5)
259
802
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
260
803
 
261
804
  self[0...[n_obs, size].min]
262
805
  end
263
806
 
807
+ # Select records from the end.
808
+ #
809
+ # @param n_obs [Integer]
810
+ # number of records to select.
811
+ # @return [DataFrame]
812
+ #
264
813
  def tail(n_obs = 5)
265
814
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
266
815
 
267
816
  self[-[n_obs, size].min..]
268
817
  end
269
818
 
819
+ # Select records from the top.
820
+ #
821
+ # @param n_obs [Integer]
822
+ # number of records to select.
823
+ # @return [DataFrame]
824
+ #
270
825
  def first(n_obs = 1)
271
826
  head(n_obs)
272
827
  end
273
828
 
829
+ # Select records from the end.
830
+ #
831
+ # @param n_obs [Integer]
832
+ # number of records to select.
833
+ # @return [DataFrame]
834
+ #
274
835
  def last(n_obs = 1)
275
836
  tail(n_obs)
276
837
  end
277
838
 
839
+ # Select records by index Array to create a DataFrame.
840
+ #
841
+ # - TODO: support for option `boundscheck: true`
842
+ # - Supports indices in an Arrow::UInt8, UInt16, UInt32, UInt64 or an Array
843
+ # - Negative index is not supported.
844
+ # @param index_array [<Integer>, Arrow::Array]
845
+ # row indices to select.
846
+ # @return [DataFrame]
847
+ # selected variables as a DataFrame.
848
+ #
278
849
  # @api private
279
- # TODO: support for option `boundscheck: true`
280
- # Supports indices in an Arrow::UInt{8, 16, 32, 64} or an Array
281
- # Negative index is not supported.
850
+ #
282
851
  def take(index_array)
283
852
  DataFrame.create(@table.take(index_array))
284
853
  end
285
854
 
286
- # @api private
287
- # TODO: support for option `null_selection_behavior: :drop``
288
- def filter(*booleans)
289
- booleans.flatten!
290
- case booleans
291
- in []
292
- return remove_all_values
293
- in [Arrow::BooleanArray => b]
294
- filter_by_array(b)
295
- else
296
- unless booleans.booleans?
297
- raise DataFrameArgumentError, 'Argument is not a boolean.'
298
- end
299
-
300
- filter_by_array(Arrow::BooleanArray.new(booleans))
301
- end
302
- end
855
+ # rubocop:enable Layout/LineLength
303
856
 
304
857
  private
305
858