red_amber 0.2.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -8,11 +8,14 @@ module RedAmber
8
8
  # @param key [Symbol] key of the index column
9
9
  # to transpose into keys.
10
10
  # If it is not specified, keys[0] is used.
11
- # @param new_key [Symbol] key name of transposed index column.
12
- # If it is not specified, :NAME is used. If it already exists, :NAME1 or :NAME1.succ is used.
11
+ # @param name [Symbol] key name of transposed index column.
12
+ # If it is not specified, :NAME is used.
13
+ # If it already exists, :NAME1 or :NAME1.succ is used.
13
14
  # @return [DataFrame] transposed DataFrame
14
15
  def transpose(key: keys.first, name: :NAME)
15
- raise DataFrameArgumentError, "Self does not include: #{key}" unless keys.include?(key)
16
+ unless keys.include?(key)
17
+ raise DataFrameArgumentError, "Self does not include: #{key}"
18
+ end
16
19
 
17
20
  # Find unused name
18
21
  new_keys = self[key].to_a.map { |e| e.to_s.to_sym }
@@ -35,14 +38,24 @@ module RedAmber
35
38
  # @param value [Symbol, String] key of the column which comes **from values**.
36
39
  # @return [DataFrame] long DataFrame.
37
40
  def to_long(*keep_keys, name: :NAME, value: :VALUE)
41
+ warn('[Info] No key to keep is specified.') if keep_keys.empty?
42
+
38
43
  not_included = keep_keys - keys
39
- raise DataFrameArgumentError, "Not have keys #{not_included}" unless not_included.empty?
44
+ unless not_included.empty?
45
+ raise DataFrameArgumentError, "Not have keys #{not_included}"
46
+ end
40
47
 
41
48
  name = name.to_sym
42
- raise DataFrameArgumentError, "Invalid key: #{name}" if keep_keys.include?(name)
49
+ if keep_keys.include?(name)
50
+ raise DataFrameArgumentError,
51
+ "Can't specify the key: #{name} for the column from keys."
52
+ end
43
53
 
44
54
  value = value.to_sym
45
- raise DataFrameArgumentError, "Invalid key: #{value}" if keep_keys.include?(value)
55
+ if keep_keys.include?(value)
56
+ raise DataFrameArgumentError,
57
+ "Can't specify the key: #{value} for the column from values."
58
+ end
46
59
 
47
60
  hash = Hash.new { |h, k| h[k] = [] }
48
61
  l = keys.size - keep_keys.size
@@ -62,15 +75,27 @@ module RedAmber
62
75
 
63
76
  # Reshape long DataFrame to a wide DataFrame.
64
77
  #
65
- # @param name [Symbol, String] key of the column which will be expanded **to key names**.
66
- # @param value [Symbol, String] key of the column which will be expanded **to values**.
78
+ # @param name [Symbol, String]
79
+ # key of the column which will be expanded **to key names**.
80
+ # @param value [Symbol, String]
81
+ # key of the column which will be expanded **to values**.
67
82
  # @return [DataFrame] wide DataFrame.
68
83
  def to_wide(name: :NAME, value: :VALUE)
69
84
  name = name.to_sym
70
- raise DataFrameArgumentError, "Invalid key: #{name}" unless keys.include?(name)
85
+ unless keys.include?(name)
86
+ raise DataFrameArgumentError,
87
+ "You are going to keep the key: #{name}. " \
88
+ 'You may need to specify the column name ' \
89
+ 'that gives the new keys by `:name` option.'
90
+ end
71
91
 
72
92
  value = value.to_sym
73
- raise DataFrameArgumentError, "Invalid key: #{value}" unless keys.include?(value)
93
+ unless keys.include?(value)
94
+ raise DataFrameArgumentError,
95
+ "You are going to keep the key: #{value}. " \
96
+ 'You may need to specify the column name ' \
97
+ 'that gives the new values by `:value` option.'
98
+ end
74
99
 
75
100
  hash = Hash.new { |h, k| h[k] = {} }
76
101
  keep_keys = keys - [name, value]
@@ -3,53 +3,149 @@
3
3
  module RedAmber
4
4
  # mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
- # select columns: [symbol] or [string]
7
- # select rows: [array of index], [range]
6
+ # Array, Arrow::Array and Arrow::ChunkedArray are refined
7
+ using RefineArray
8
+ using RefineArrayLike
9
+
10
+ # Select variables or records.
11
+ #
12
+ # @overload [](key)
13
+ # select single variable and return as a Vector.
14
+ #
15
+ # @param key [Symbol, String] key name to select.
16
+ # @return [Vector] selected variable as a Vector.
17
+ # @note DataFrame.v(key) is faster to create Vector from a variable.
18
+ #
19
+ # @overload [](keys)
20
+ # select variables and return a DataFrame.
21
+ #
22
+ # @param keys [<Symbol, String>] key names to select.
23
+ # @return [DataFrame] selected variables as a DataFrame.
24
+ #
25
+ # @overload [](index)
26
+ # select records and return a DataFrame.
27
+ #
28
+ # @param index [Integer, Float, Range<Integer>, Vector, Arrow::Array]
29
+ # index of a row to select.
30
+ # @return [DataFrame] selected variables as a DataFrame.
31
+ #
32
+ # @overload [](indices)
33
+ # select records and return a DataFrame.
34
+ #
35
+ # @param indices [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
36
+ # indices of rows to select.
37
+ # @return [DataFrame] selected variables as a DataFrame.
38
+ #
8
39
  def [](*args)
9
- args.flatten!
10
- raise DataFrameArgumentError, 'Empty dataframe' if empty?
11
- return remove_all_values if args.empty? || args[0].nil?
40
+ raise DataFrameArgumentError, 'self is an empty dataframe' if empty?
41
+
42
+ case args
43
+ in [] | [nil]
44
+ return remove_all_values
45
+ in [(Symbol | String) => k] if key? k
46
+ return variables[k.to_sym]
47
+ in [Integer => i]
48
+ return take([i.negative? ? i + size : i])
49
+ in [Vector => v]
50
+ arrow_array = v.data
51
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
52
+ arrow_array = aa
53
+ else
54
+ a = parse_args(args, size)
55
+ return select_variables_by_keys(a) if a.symbols?
56
+ return take(normalize_indices(Arrow::Array.new(a))) if a.integers?
57
+ return remove_all_values if a.compact.empty?
58
+ return filter_by_array(Arrow::BooleanArray.new(a)) if a.booleans?
59
+
60
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
61
+ end
62
+
63
+ return take(normalize_indices(arrow_array)) if arrow_array.numeric?
64
+ return filter_by_array(arrow_array) if arrow_array.boolean?
12
65
 
13
- vector = parse_to_vector(args)
14
- if vector.boolean?
15
- return filter_by_vector(vector.data) if vector.size == size
66
+ a = arrow_array.to_a
67
+ return select_variables_by_keys(a) if a.symbols_or_strings?
16
68
 
17
- raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
69
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
70
+ end
71
+
72
+ # Select a variable by a key in String or Symbol
73
+ def v(key)
74
+ unless key.is_a?(Symbol) || key.is_a?(String)
75
+ raise DataFrameArgumentError, "Key is not a Symbol or a String: [#{key}]"
18
76
  end
19
- return take_by_array(vector) if vector.numeric?
20
- return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.dictionary?
77
+ raise DataFrameArgumentError, "Key does not exist: [#{key}]" unless key? key
21
78
 
22
- raise DataFrameArgumentError, "Invalid argument: #{args}"
79
+ variables[key.to_sym]
23
80
  end
24
81
 
25
- # slice and select rows to create sub DataFrame
82
+ # Select records to create a DataFrame.
83
+ #
84
+ # @overload slice(row)
85
+ # select a record and return a DataFrame.
86
+ #
87
+ # @param row [Integer, Float, Range<Integer>, Vector, Arrow::Array]
88
+ # a row index to select.
89
+ # @yield [self] gives self to the block.
90
+ # @note The block is evaluated within the context of self.
91
+ # It has access to self's instance variables and private methods.
92
+ # @yieldreturn [Integer, Float, Range<Integer>, Vector, Arrow::Array]
93
+ # a row index to select.
94
+ # @return [DataFrame] selected variables as a DataFrame.
95
+ #
96
+ # @overload slice(rows)
97
+ # select records and return a DataFrame.
98
+ # - Duplicated selection is acceptable. The same record will be returned.
99
+ # - The order of records will be the same as specified indices.
100
+ #
101
+ # @param rows [Integer, Float, Range<Integer>, Vector, Arrow::Array]
102
+ # row indices to select.
103
+ # @yield [self] gives self to the block.
104
+ # @note The block is evaluated within the context of self.
105
+ # It has access to self's instance variables and private methods.
106
+ # @yieldreturn [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
107
+ # row indices to select.
108
+ # @return [DataFrame] selected variables as a DataFrame.
109
+ #
26
110
  def slice(*args, &block)
27
- slicer = args
111
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
112
+
28
113
  if block
29
- raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
114
+ unless args.empty?
115
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
116
+ end
30
117
 
31
- slicer = [instance_eval(&block)]
118
+ args = [instance_eval(&block)]
32
119
  end
33
- slicer.flatten!
34
120
 
35
- raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
36
- return remove_all_values if slicer.empty? || slicer[0].nil?
37
-
38
- vector = parse_to_vector(slicer)
39
- if vector.boolean?
40
- return filter_by_vector(vector.data) if vector.size == size
121
+ arrow_array =
122
+ case args
123
+ in [] | [[]]
124
+ return remove_all_values
125
+ in [Vector => v]
126
+ v.data
127
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
128
+ aa
129
+ else
130
+ Arrow::Array.new(parse_args(args, size))
131
+ end
41
132
 
42
- raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}"
133
+ if arrow_array.numeric?
134
+ take(normalize_indices(arrow_array))
135
+ elsif arrow_array.boolean?
136
+ filter_by_array(arrow_array)
137
+ elsif arrow_array.to_a.compact.empty?
138
+ # Ruby 3.0.4 does not accept Arrow::Array#compact here. 2.7.6 and 3.1.2 are OK.
139
+ remove_all_values
140
+ else
141
+ raise DataFrameArgumentError, "invalid arguments: #{args}"
43
142
  end
44
- return take_by_array(vector) if vector.numeric?
45
-
46
- raise DataFrameArgumentError, "Invalid argument #{slicer}"
47
143
  end
48
144
 
49
145
  def slice_by(key, keep_key: false, &block)
50
146
  raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
51
147
  raise DataFrameArgumentError, 'No block given' unless block
52
- raise DataFrameArgumentError, "#{key} is no a key of self" unless key?(key)
148
+ raise DataFrameArgumentError, "#{key} is not a key of self" unless key?(key)
53
149
  return self if key.nil?
54
150
 
55
151
  slicer = instance_eval(&block)
@@ -83,69 +179,82 @@ module RedAmber
83
179
  slicer = slicer.map { |x| x.is_a?(String) ? self[key].index(x) : x }
84
180
  end
85
181
 
86
- if keep_key
87
- take(slicer)
88
- else
89
- take(slicer).drop(key)
90
- end
182
+ taken = take(normalize_indices(Arrow::Array.new(slicer)))
183
+ keep_key ? taken : taken.drop(key)
91
184
  end
92
185
 
93
- # remove selected rows to create remainer DataFrame
186
+ # Select records and remove them to create a remainder DataFrame.
187
+ #
188
+ # @overload remove(row)
189
+ # select a record and remove it to create a remainder DataFrame.
190
+ # - The order of records in self will be preserved.
191
+ #
192
+ # @param row [Integer, Float, Range<Integer>, Vector, Arrow::Array]
193
+ # a row index to remove.
194
+ # @yield [self] gives self to the block.
195
+ # @note The block is evaluated within the context of self.
196
+ # It has access to self's instance variables and private methods.
197
+ # @yieldreturn [Integer, Float, Range<Integer>, Vector, Arrow::Array]
198
+ # a row index to remove.
199
+ # @return [DataFrame] remaining variables as a DataFrame.
200
+ #
201
+ # @overload remove(rows)
202
+ # select records and remove them to create a remainder DataFrame.
203
+ # - The order of records in self will be preserved.
204
+ #
205
+ # @param rows [Integer, Float, Range<Integer>, Vector, Arrow::Array]
206
+ # row indices to remove.
207
+ # @yield [self] gives self to the block.
208
+ # @note The block is evaluated within the context of self.
209
+ # It has access to self's instance variables and private methods.
210
+ # @yieldreturn [<Integer, Float, Range<Integer>, Vector, Arrow::Array>]
211
+ # row indices to remove.
212
+ # @return [DataFrame] remaining variables as a DataFrame.
213
+ #
94
214
  def remove(*args, &block)
95
- remover = args
96
- if block
97
- raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
98
-
99
- remover = [instance_eval(&block)]
100
- end
101
- remover.flatten!
102
-
103
- raise DataFrameArgumentError, 'Empty dataframe' if empty?
104
- return self if remover.empty? || remover[0].nil?
215
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
105
216
 
106
- vector = parse_to_vector(remover)
107
- if vector.boolean?
108
- return filter_by_vector(vector.primitive_invert.data) if vector.size == size
217
+ if block
218
+ unless args.empty?
219
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.'
220
+ end
109
221
 
110
- raise DataFrameArgumentError, "Size is not match in booleans: #{remover}"
222
+ args = [instance_eval(&block)]
111
223
  end
112
- if vector.numeric?
113
- raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
114
224
 
115
- normalized_indices = (vector < 0).if_else(vector + size, vector) # normalize index from tail
116
- if normalized_indices.max >= size
117
- raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}"
225
+ arrow_array =
226
+ case args
227
+ in [] | [[]] | [nil]
228
+ return self
229
+ in [Vector => v]
230
+ v.data
231
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
232
+ aa
233
+ else
234
+ Arrow::Array.new(parse_args(args, size))
118
235
  end
119
236
 
120
- normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array
121
- return remove_all_values if normalized_indices == indices.to_a
122
- return self if normalized_indices.empty?
237
+ if arrow_array.boolean?
238
+ filter_by_array(arrow_array.primitive_invert)
239
+ elsif arrow_array.numeric?
240
+ remover = normalize_indices(arrow_array).to_a
241
+ return self if remover.empty?
123
242
 
124
- index_array = indices.to_a - normalized_indices
243
+ slicer = indices.to_a - remover.map(&:to_i)
244
+ return remove_all_values if slicer.empty?
125
245
 
126
- datum = Arrow::Function.find(:take).execute([table, index_array])
127
- return DataFrame.new(datum.value)
246
+ take(slicer)
247
+ else
248
+ raise DataFrameArgumentError, "Invalid argument #{args}"
128
249
  end
129
-
130
- raise DataFrameArgumentError, "Invalid argument #{remover}"
131
250
  end
132
251
 
133
252
  def remove_nil
134
253
  func = Arrow::Function.find(:drop_null)
135
- DataFrame.new(func.execute([table]).value)
254
+ DataFrame.create(func.execute([table]).value)
136
255
  end
137
256
  alias_method :drop_nil, :remove_nil
138
257
 
139
- # Select a variable by a key in String or Symbol
140
- def v(key)
141
- unless key.is_a?(Symbol) || key.is_a?(String)
142
- raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
143
- end
144
- raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
145
-
146
- variables[key.to_sym]
147
- end
148
-
149
258
  def head(n_obs = 5)
150
259
  raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
151
260
 
@@ -166,77 +275,73 @@ module RedAmber
166
275
  tail(n_obs)
167
276
  end
168
277
 
169
- # Undocumented
170
- # TODO: support for option {boundscheck: true}
171
- def take(*arg_indices)
172
- arg_indices.flatten!
173
- return remove_all_values if arg_indices.empty?
174
-
175
- arg_indices = arg_indices[0] if arg_indices.one? && !arg_indices[0].is_a?(Numeric)
176
- arg_indices = Vector.new(arg_indices) unless arg_indices.is_a?(Vector)
177
-
178
- take_by_array(arg_indices)
278
+ # @api private
279
+ # TODO: support for option `boundscheck: true`
280
+ # Supports indices in an Arrow::UInt{8, 16, 32, 64} or an Array
281
+ # Negative index is not supported.
282
+ def take(index_array)
283
+ DataFrame.create(@table.take(index_array))
179
284
  end
180
285
 
181
- # Undocumented
182
- # TODO: support for option {null_selection_behavior: :drop}
286
+ # @api private
287
+ # TODO: support for option `null_selection_behavior: :drop`
183
288
  def filter(*booleans)
184
289
  booleans.flatten!
185
- return remove_all_values if booleans.empty?
186
-
187
- b = booleans[0]
188
- case b
189
- when Vector
190
- raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
191
-
192
- filter_by_vector(b.data)
193
- when Arrow::BooleanArray
194
- filter_by_vector(b)
290
+ case booleans
291
+ in []
292
+ return remove_all_values
293
+ in [Arrow::BooleanArray => b]
294
+ filter_by_array(b)
195
295
  else
196
- raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)
296
+ unless booleans.booleans?
297
+ raise DataFrameArgumentError, 'Argument is not a boolean.'
298
+ end
197
299
 
198
- filter_by_vector(Arrow::BooleanArray.new(booleans))
300
+ filter_by_array(Arrow::BooleanArray.new(booleans))
199
301
  end
200
302
  end
201
303
 
202
304
  private
203
305
 
204
- def select_vars_by_keys(keys)
306
+ def select_variables_by_keys(keys)
205
307
  if keys.one?
206
308
  key = keys[0].to_sym
207
- raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
309
+ raise DataFrameArgumentError, "Key does not exist: #{key}" unless key? key
208
310
 
209
311
  variables[key]
312
+ # Vector.new(@table.find_column(*key).data)
210
313
  else
211
- DataFrame.new(@table[keys])
314
+ check_duplicate_keys(keys)
315
+ DataFrame.create(@table.select_columns(*keys))
212
316
  end
213
317
  end
214
318
 
215
- # Accepts indices by numeric Vector
216
- def take_by_array(indices)
217
- raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric?
218
- raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
219
-
220
- normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
221
- raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
222
-
223
- index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
224
-
225
- datum = Arrow::Function.find(:take).execute([table, index_array])
226
- DataFrame.new(datum.value)
319
+ # Accepts indices by numeric arrow array and returns positive indices.
320
+ def normalize_indices(arrow_array)
321
+ b = Arrow::Function.find(:less).execute([arrow_array, 0])
322
+ a = Arrow::Function.find(:add).execute([arrow_array, size])
323
+ r = Arrow::Function.find(:if_else).execute([b, a, arrow_array]).value
324
+ if r.float?
325
+ r = Arrow::Function.find(:floor).execute([r]).value
326
+ Arrow::UInt64ArrayBuilder.build(r)
327
+ else
328
+ r
329
+ end
227
330
  end
228
331
 
229
- # Accepts booleans by Arrow::BooleanArray
230
- def filter_by_vector(boolean_array)
231
- raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
332
+ # Accepts booleans by an Arrow::BooleanArray or an Array
333
+ def filter_by_array(boolean_array)
334
+ unless boolean_array.length == size
335
+ raise DataFrameArgumentError, 'Booleans must be same size as self.'
336
+ end
232
337
 
233
338
  datum = Arrow::Function.find(:filter).execute([table, boolean_array])
234
- DataFrame.new(datum.value)
339
+ DataFrame.create(datum.value)
235
340
  end
236
341
 
237
342
  # return a DataFrame with same keys as self without values
238
343
  def remove_all_values
239
- filter_by_vector(Arrow::BooleanArray.new([false] * size))
344
+ filter_by_array(Arrow::BooleanArray.new([false] * size))
240
345
  end
241
346
  end
242
347
  end