red_amber 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +114 -39
  3. data/CHANGELOG.md +203 -31
  4. data/Gemfile +5 -2
  5. data/README.md +62 -29
  6. data/benchmark/basic.yml +86 -0
  7. data/benchmark/combine.yml +62 -0
  8. data/benchmark/dataframe.yml +62 -0
  9. data/benchmark/drop_nil.yml +15 -3
  10. data/benchmark/group.yml +39 -0
  11. data/benchmark/reshape.yml +31 -0
  12. data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
  13. data/benchmark/rover/flights.yml +23 -0
  14. data/benchmark/rover/penguins.yml +23 -0
  15. data/benchmark/rover/planes.yml +23 -0
  16. data/benchmark/rover/weather.yml +23 -0
  17. data/benchmark/vector.yml +60 -0
  18. data/doc/DataFrame.md +335 -53
  19. data/doc/Vector.md +91 -0
  20. data/doc/image/dataframe/join.png +0 -0
  21. data/doc/image/dataframe/set_and_bind.png +0 -0
  22. data/doc/image/dataframe_model.png +0 -0
  23. data/lib/red_amber/data_frame.rb +167 -51
  24. data/lib/red_amber/data_frame_combinable.rb +486 -0
  25. data/lib/red_amber/data_frame_displayable.rb +6 -4
  26. data/lib/red_amber/data_frame_indexable.rb +2 -2
  27. data/lib/red_amber/data_frame_loadsave.rb +4 -1
  28. data/lib/red_amber/data_frame_reshaping.rb +35 -10
  29. data/lib/red_amber/data_frame_selectable.rb +221 -116
  30. data/lib/red_amber/data_frame_variable_operation.rb +146 -82
  31. data/lib/red_amber/group.rb +108 -18
  32. data/lib/red_amber/helper.rb +53 -43
  33. data/lib/red_amber/refinements.rb +199 -0
  34. data/lib/red_amber/vector.rb +56 -46
  35. data/lib/red_amber/vector_functions.rb +23 -83
  36. data/lib/red_amber/vector_selectable.rb +116 -69
  37. data/lib/red_amber/vector_updatable.rb +189 -65
  38. data/lib/red_amber/version.rb +1 -1
  39. data/lib/red_amber.rb +3 -0
  40. data/red_amber.gemspec +4 -3
  41. metadata +24 -10
@@ -3,53 +3,149 @@
3
3
  module RedAmber
4
4
  # mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
- # select columns: [symbol] or [string]
7
- # select rows: [array of index], [range]
6
+ # Array, Arrow::Array and Arrow::ChunkedArray are refined
7
+ using RefineArray
8
+ using RefineArrayLike
9
+
10
+ # Select variables or records.
11
+ #
12
+ # @overload [](key)
13
+ # select single variable and return as a Vector.
14
+ #
15
+ # @param key [Symbol, String] key name to select.
16
+ # @return [Vector] selected variable as a Vector.
17
+ # @note DataFrame.v(key) is faster to create Vector from a variable.
18
+ #
19
+ # @overload [](keys)
20
+ # select variables and return a DataFrame.
21
+ #
22
+ # @param keys [<Symbol, String>] key names to select.
23
+ # @return [DataFrame] selected variables as a DataFrame.
24
+ #
25
+ # @overload [](index)
26
+ # select records and return a DataFrame.
27
+ #
28
+ # @param index [Integer, Float, Range<Integer>, Vector, Arrow::Array]
29
+ # index of a row to select.
30
+ # @return [DataFrame] selected records as a DataFrame.
31
+ #
32
+ # @overload [](indices)
33
+ # select records and return a DataFrame.
34
+ #
35
+ # @param indices [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
36
+ # indices of rows to select.
37
+ # @return [DataFrame] selected records as a DataFrame.
38
+ #
8
39
  def [](*args)
9
- args.flatten!
10
- raise DataFrameArgumentError, 'Empty dataframe' if empty?
11
- return remove_all_values if args.empty? || args[0].nil?
40
+ raise DataFrameArgumentError, 'self is an empty dataframe' if empty?
41
+
42
+ case args
43
+ in [] | [nil]
44
+ return remove_all_values
45
+ in [(Symbol | String) => k] if key? k
46
+ return variables[k.to_sym]
47
+ in [Integer => i]
48
+ return take([i.negative? ? i + size : i])
49
+ in [Vector => v]
50
+ arrow_array = v.data
51
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
52
+ arrow_array = aa
53
+ else
54
+ a = parse_args(args, size)
55
+ return select_variables_by_keys(a) if a.symbols?
56
+ return take(normalize_indices(Arrow::Array.new(a))) if a.integers?
57
+ return remove_all_values if a.compact.empty?
58
+ return filter_by_array(Arrow::BooleanArray.new(a)) if a.booleans?
59
+
60
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
61
+ end
62
+
63
+ return take(normalize_indices(arrow_array)) if arrow_array.numeric?
64
+ return filter_by_array(arrow_array) if arrow_array.boolean?
12
65
 
13
- vector = parse_to_vector(args)
14
- if vector.boolean?
15
- return filter_by_vector(vector.data) if vector.size == size
66
+ a = arrow_array.to_a
67
+ return select_variables_by_keys(a) if a.symbols_or_strings?
16
68
 
17
- raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
69
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
70
+ end
71
+
72
+ # Select a variable by a key in String or Symbol
73
+ def v(key)
74
+ unless key.is_a?(Symbol) || key.is_a?(String)
75
+ raise DataFrameArgumentError, "Key is not a Symbol or a String: [#{key}]"
18
76
  end
19
- return take_by_array(vector) if vector.numeric?
20
- return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.type == :dictionary
77
+ raise DataFrameArgumentError, "Key does not exist: [#{key}]" unless key? key
21
78
 
22
- raise DataFrameArgumentError, "Invalid argument: #{args}"
79
+ variables[key.to_sym]
23
80
  end
24
81
 
25
- # slice and select rows to create sub DataFrame
82
+ # Select records to create a DataFrame.
83
+ #
84
+ # @overload slice(row)
85
+ # select a record and return a DataFrame.
86
+ #
87
+ # @param row [Integer, Float, Range<Integer>, Vector, Arrow::Array]
88
+ # a row index to select.
89
+ # @yield [self] gives self to the block.
90
+ # @note The block is evaluated within the context of self.
91
+ # It can access self's instance variables and private methods.
92
+ # @yieldreturn [Integer, Float, Range<Integer>, Vector, Arrow::Array]
93
+ # a row index to select.
94
+ # @return [DataFrame] selected records as a DataFrame.
95
+ #
96
+ # @overload slice(rows)
97
+ # select records and return a DataFrame.
98
+ # - Duplicated selection is acceptable. The same record will be returned.
99
+ # - The order of records will be the same as specified indices.
100
+ #
101
+ # @param rows [Integer, Float, Range<Integer>, Vector, Arrow::Array]
102
+ # row indices to select.
103
+ # @yield [self] gives self to the block.
104
+ # @note The block is evaluated within the context of self.
105
+ # It can access self's instance variables and private methods.
106
+ # @yieldreturn [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
107
+ # row indices to select.
108
+ # @return [DataFrame] selected records as a DataFrame.
109
+ #
26
110
  def slice(*args, &block)
27
- slicer = args
111
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
112
+
28
113
  if block
29
- raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
114
+ unless args.empty?
115
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
116
+ end
30
117
 
31
- slicer = [instance_eval(&block)]
118
+ args = [instance_eval(&block)]
32
119
  end
33
- slicer.flatten!
34
120
 
35
- raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
36
- return remove_all_values if slicer.empty? || slicer[0].nil?
37
-
38
- vector = parse_to_vector(slicer)
39
- if vector.boolean?
40
- return filter_by_vector(vector.data) if vector.size == size
121
+ arrow_array =
122
+ case args
123
+ in [] | [[]]
124
+ return remove_all_values
125
+ in [Vector => v]
126
+ v.data
127
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
128
+ aa
129
+ else
130
+ Arrow::Array.new(parse_args(args, size))
131
+ end
41
132
 
42
- raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}"
133
+ if arrow_array.numeric?
134
+ take(normalize_indices(arrow_array))
135
+ elsif arrow_array.boolean?
136
+ filter_by_array(arrow_array)
137
+ elsif arrow_array.to_a.compact.empty?
138
+ # Ruby 3.0.4 does not accept Arrow::Array#compact here. 2.7.6 and 3.1.2 are OK.
139
+ remove_all_values
140
+ else
141
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
43
142
  end
44
- return take_by_array(vector) if vector.numeric?
45
-
46
- raise DataFrameArgumentError, "Invalid argument #{slicer}"
47
143
  end
48
144
 
49
145
  def slice_by(key, keep_key: false, &block)
50
146
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
51
147
  raise DataFrameArgumentError, 'No block given' unless block
52
- raise DataFrameArgumentError, "#{key} is no a key of self" unless key?(key)
148
+ raise DataFrameArgumentError, "#{key} is not a key of self" unless key?(key)
53
149
  return self if key.nil?
54
150
 
55
151
  slicer = instance_eval(&block)
@@ -83,69 +179,82 @@ module RedAmber
83
179
  slicer = slicer.map { |x| x.is_a?(String) ? self[key].index(x) : x }
84
180
  end
85
181
 
86
- if keep_key
87
- take(slicer)
88
- else
89
- take(slicer).drop(key)
90
- end
182
+ taken = take(normalize_indices(Arrow::Array.new(slicer)))
183
+ keep_key ? taken : taken.drop(key)
91
184
  end
92
185
 
93
- # remove selected rows to create remainer DataFrame
186
+ # Select records and remove them to create a remainder DataFrame.
187
+ #
188
+ # @overload remove(row)
189
+ # select a record and remove it to create a remainder DataFrame.
190
+ # - The order of records in self will be preserved.
191
+ #
192
+ # @param row [Integer, Float, Range<Integer>, Vector, Arrow::Array]
193
+ # a row index to remove.
194
+ # @yield [self] gives self to the block.
195
+ # @note The block is evaluated within the context of self.
196
+ # It can access self's instance variables and private methods.
197
+ # @yieldreturn [Integer, Float, Range<Integer>, Vector, Arrow::Array]
198
+ # a row index to remove.
199
+ # @return [DataFrame] remaining records as a DataFrame.
200
+ #
201
+ # @overload remove(rows)
202
+ # select records and remove them to create a remainder DataFrame.
203
+ # - The order of records in self will be preserved.
204
+ #
205
+ # @param rows [Integer, Float, Range<Integer>, Vector, Arrow::Array]
206
+ # row indices to remove.
207
+ # @yield [self] gives self to the block.
208
+ # @note The block is evaluated within the context of self.
209
+ # It can access self's instance variables and private methods.
210
+ # @yieldreturn [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
211
+ # row indices to remove.
212
+ # @return [DataFrame] remaining records as a DataFrame.
213
+ #
94
214
  def remove(*args, &block)
95
- remover = args
96
- if block
97
- raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
98
-
99
- remover = [instance_eval(&block)]
100
- end
101
- remover.flatten!
102
-
103
- raise DataFrameArgumentError, 'Empty dataframe' if empty?
104
- return self if remover.empty? || remover[0].nil?
215
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
105
216
 
106
- vector = parse_to_vector(remover)
107
- if vector.boolean?
108
- return filter_by_vector(vector.primitive_invert.data) if vector.size == size
217
+ if block
218
+ unless args.empty?
219
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
220
+ end
109
221
 
110
- raise DataFrameArgumentError, "Size is not match in booleans: #{remover}"
222
+ args = [instance_eval(&block)]
111
223
  end
112
- if vector.numeric?
113
- raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
114
224
 
115
- normalized_indices = (vector < 0).if_else(vector + size, vector) # normalize index from tail
116
- if normalized_indices.max >= size
117
- raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}"
225
+ arrow_array =
226
+ case args
227
+ in [] | [[]] | [nil]
228
+ return self
229
+ in [Vector => v]
230
+ v.data
231
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
232
+ aa
233
+ else
234
+ Arrow::Array.new(parse_args(args, size))
118
235
  end
119
236
 
120
- normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array
121
- return remove_all_values if normalized_indices == indices
122
- return self if normalized_indices.empty?
237
+ if arrow_array.boolean?
238
+ filter_by_array(arrow_array.primitive_invert)
239
+ elsif arrow_array.numeric?
240
+ remover = normalize_indices(arrow_array).to_a
241
+ return self if remover.empty?
123
242
 
124
- index_array = indices - normalized_indices
243
+ slicer = indices.to_a - remover.map(&:to_i)
244
+ return remove_all_values if slicer.empty?
125
245
 
126
- datum = Arrow::Function.find(:take).execute([table, index_array])
127
- return DataFrame.new(datum.value)
246
+ take(slicer)
247
+ else
248
+ raise DataFrameArgumentError, "Invalid argument #{args}"
128
249
  end
129
-
130
- raise DataFrameArgumentError, "Invalid argument #{remover}"
131
250
  end
132
251
 
133
252
  def remove_nil
134
253
  func = Arrow::Function.find(:drop_null)
135
- DataFrame.new(func.execute([table]).value)
254
+ DataFrame.create(func.execute([table]).value)
136
255
  end
137
256
  alias_method :drop_nil, :remove_nil
138
257
 
139
- # Select a variable by a key in String or Symbol
140
- def v(key)
141
- unless key.is_a?(Symbol) || key.is_a?(String)
142
- raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
143
- end
144
- raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
145
-
146
- variables[key.to_sym]
147
- end
148
-
149
258
  def head(n_obs = 5)
150
259
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
151
260
 
@@ -166,77 +275,73 @@ module RedAmber
166
275
  tail(n_obs)
167
276
  end
168
277
 
169
- # Undocumented
170
- # TODO: support for option {boundscheck: true}
171
- def take(*indices)
172
- indices.flatten!
173
- return remove_all_values if indices.empty?
174
-
175
- indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
176
- indices = Vector.new(indices) unless indices.is_a?(Vector)
177
-
178
- take_by_array(indices)
278
+ # @api private
279
+ # TODO: support for option `boundscheck: true`
280
+ # Supports indices in an Arrow::UInt{8, 16, 32, 64} or an Array
281
+ # Negative index is not supported.
282
+ def take(index_array)
283
+ DataFrame.create(@table.take(index_array))
179
284
  end
180
285
 
181
- # Undocumented
182
- # TODO: support for option {null_selection_behavior: :drop}
286
+ # @api private
287
+ # TODO: support for option `null_selection_behavior: :drop`
183
288
  def filter(*booleans)
184
289
  booleans.flatten!
185
- return remove_all_values if booleans.empty?
186
-
187
- b = booleans[0]
188
- case b
189
- when Vector
190
- raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
191
-
192
- filter_by_vector(b.data)
193
- when Arrow::BooleanArray
194
- filter_by_vector(b)
290
+ case booleans
291
+ in []
292
+ return remove_all_values
293
+ in [Arrow::BooleanArray => b]
294
+ filter_by_array(b)
195
295
  else
196
- raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)
296
+ unless booleans.booleans?
297
+ raise DataFrameArgumentError, 'Argument is not a boolean.'
298
+ end
197
299
 
198
- filter_by_vector(Arrow::BooleanArray.new(booleans))
300
+ filter_by_array(Arrow::BooleanArray.new(booleans))
199
301
  end
200
302
  end
201
303
 
202
304
  private
203
305
 
204
- def select_vars_by_keys(keys)
306
+ def select_variables_by_keys(keys)
205
307
  if keys.one?
206
308
  key = keys[0].to_sym
207
- raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
309
+ raise DataFrameArgumentError, "Key does not exist: #{key}" unless key? key
208
310
 
209
311
  variables[key]
312
+ # Vector.new(@table.find_column(*key).data)
210
313
  else
211
- DataFrame.new(@table[keys])
314
+ check_duplicate_keys(keys)
315
+ DataFrame.create(@table.select_columns(*keys))
212
316
  end
213
317
  end
214
318
 
215
- # Accepts indices by numeric Vector
216
- def take_by_array(indices)
217
- raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric?
218
- raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
219
-
220
- normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
221
- raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
222
-
223
- index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
224
-
225
- datum = Arrow::Function.find(:take).execute([table, index_array])
226
- DataFrame.new(datum.value)
319
+ # Accepts indices by numeric arrow array and returns positive indices.
320
+ def normalize_indices(arrow_array)
321
+ b = Arrow::Function.find(:less).execute([arrow_array, 0])
322
+ a = Arrow::Function.find(:add).execute([arrow_array, size])
323
+ r = Arrow::Function.find(:if_else).execute([b, a, arrow_array]).value
324
+ if r.float?
325
+ r = Arrow::Function.find(:floor).execute([r]).value
326
+ Arrow::UInt64ArrayBuilder.build(r)
327
+ else
328
+ r
329
+ end
227
330
  end
228
331
 
229
- # Accepts booleans by Arrow::BooleanArray
230
- def filter_by_vector(boolean_array)
231
- raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
332
+ # Accepts booleans by an Arrow::BooleanArray or an Array
333
+ def filter_by_array(boolean_array)
334
+ unless boolean_array.length == size
335
+ raise DataFrameArgumentError, 'Booleans must be same size as self.'
336
+ end
232
337
 
233
338
  datum = Arrow::Function.find(:filter).execute([table, boolean_array])
234
- DataFrame.new(datum.value)
339
+ DataFrame.create(datum.value)
235
340
  end
236
341
 
237
342
  # return a DataFrame with same keys as self without values
238
343
  def remove_all_values
239
- filter_by_vector(Arrow::BooleanArray.new([false] * size))
344
+ filter_by_array(Arrow::BooleanArray.new([false] * size))
240
345
  end
241
346
  end
242
347
  end