red_amber 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,35 +3,94 @@
3
3
  module RedAmber
4
4
  # mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
- # select columns: [symbol] or [string]
7
- # select rows: [array of index], [range]
6
+ # select variables: [symbol] or [string]
7
+ # select observations: [array of index], [range]
8
8
  def [](*args)
9
+ args.flatten!
9
10
  raise DataFrameArgumentError, 'Empty dataframe' if empty?
10
- raise DataFrameArgumentError, 'Empty argument' if args.empty?
11
-
12
- if args.one?
13
- case args[0]
14
- when Vector
15
- return select_obs_by_boolean(Arrow::BooleanArray.new(args[0].data))
16
- when Arrow::BooleanArray
17
- return select_obs_by_boolean(args[0])
18
- when Array
19
- return select_obs_by_boolean(Arrow::BooleanArray.new(args[0]))
20
-
21
- # when Hash
22
- # specify conditions to select by a Hash
23
- end
11
+ return remove_all_values if args.empty? || args[0].nil?
12
+
13
+ vector = parse_to_vector(args)
14
+ if vector.boolean?
15
+ return filter_by_vector(vector.data) if vector.size == size
16
+
17
+ raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
18
+ end
19
+ return take_by_array(vector) if vector.numeric?
20
+ return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.type == :dictionary
21
+
22
+ raise DataFrameArgumentError, "Invalid argument: #{args}"
23
+ end
24
+
25
+ # slice and select some observations to create sub DataFrame
26
+ def slice(*args, &block)
27
+ slicer = args
28
+ if block
29
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
30
+
31
+ slicer = instance_eval(&block)
32
+ end
33
+ slicer = [slicer].flatten
34
+
35
+ raise DataFrameArgumentError, 'Empty dataframe' if empty?
36
+ return remove_all_values if slicer.empty? || slicer[0].nil?
37
+
38
+ vector = parse_to_vector(slicer)
39
+ if vector.boolean?
40
+ return filter_by_vector(vector.data) if vector.size == size
41
+
42
+ raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}"
24
43
  end
44
+ return take_by_array(vector) if vector.numeric?
45
+
46
+ raise DataFrameArgumentError, "Invalid argument #{slicer}"
47
+ end
48
+
49
+ # remove selected observations to create sub DataFrame
50
+ def remove(*args, &block)
51
+ remover = args
52
+ if block
53
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
54
+
55
+ remover = instance_eval(&block)
56
+ end
57
+ remover = [remover].flatten
58
+
59
+ raise DataFrameArgumentError, 'Empty dataframe' if empty?
60
+ return self if remover.empty? || remover[0].nil?
61
+
62
+ vector = parse_to_vector(remover)
63
+ if vector.boolean?
64
+ return filter_by_vector(vector.primitive_invert.data) if vector.size == size
65
+
66
+ raise DataFrameArgumentError, "Size is not match in booleans: #{remover}"
67
+ end
68
+ if vector.numeric?
69
+ raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
70
+
71
+ normalized_indices = (vector < 0).if_else(vector + size, vector) # normalize index from tail
72
+ if normalized_indices.max >= size
73
+ raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}"
74
+ end
25
75
 
26
- return select_obs_by_boolean(args) if booleans?(args)
76
+ normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array
77
+ return remove_all_values if normalized_indices == indices
78
+ return self if normalized_indices.empty?
27
79
 
28
- # expand Range like [1..3, 4] to [1, 2, 3, 4]
29
- expanded = expand_range(args)
30
- return map_indices(*expanded) if integers?(expanded)
31
- return select_vars_by_keys(expanded.map(&:to_sym)) if sym_or_str?(expanded)
80
+ index_array = indices - normalized_indices
32
81
 
33
- raise DataFrameArgumentError, "Invalid argument #{args}"
82
+ datum = Arrow::Function.find(:take).execute([table, index_array])
83
+ return DataFrame.new(datum.value)
84
+ end
85
+
86
+ raise DataFrameArgumentError, "Invalid argument #{remover}"
87
+ end
88
+
89
+ def remove_nil
90
+ func = Arrow::Function.find(:drop_null)
91
+ DataFrame.new(func.execute([table]).value)
34
92
  end
93
+ alias_method :drop_nil, :remove_nil
35
94
 
36
95
  # Select a variable by a key in String or Symbol
37
96
  def v(key)
@@ -43,24 +102,57 @@ module RedAmber
43
102
  variables[key.to_sym]
44
103
  end
45
104
 
46
- def head(n_rows = 5)
47
- raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
105
+ def head(n_obs = 5)
106
+ raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
48
107
 
49
- self[0...[n_rows, size].min]
108
+ self[0...[n_obs, size].min]
50
109
  end
51
110
 
52
- def tail(n_rows = 5)
53
- raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
111
+ def tail(n_obs = 5)
112
+ raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
54
113
 
55
- self[-[n_rows, size].min..]
114
+ self[-[n_obs, size].min..]
56
115
  end
57
116
 
58
- def first(n_rows = 1)
59
- head(n_rows)
117
+ def first(n_obs = 1)
118
+ head(n_obs)
60
119
  end
61
120
 
62
- def last(n_rows = 1)
63
- tail(n_rows)
121
+ def last(n_obs = 1)
122
+ tail(n_obs)
123
+ end
124
+
125
+ # Undocumented
126
+ # TODO: support for option {boundscheck: true}
127
+ def take(*indices)
128
+ indices.flatten!
129
+ return remove_all_values if indices.empty?
130
+
131
+ indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
132
+ indices = Vector.new(indices) unless indices.is_a?(Vector)
133
+
134
+ take_by_array(indices)
135
+ end
136
+
137
+ # Undocumented
138
+ # TODO: support for option {null_selection_behavior: :drop}
139
+ def filter(*booleans)
140
+ booleans.flatten!
141
+ return remove_all_values if booleans.empty?
142
+
143
+ b = booleans[0]
144
+ case b
145
+ when Vector
146
+ raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
147
+
148
+ filter_by_vector(b.data)
149
+ when Arrow::BooleanArray
150
+ filter_by_vector(b)
151
+ else
152
+ raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)
153
+
154
+ filter_by_vector(Arrow::BooleanArray.new(booleans))
155
+ end
64
156
  end
65
157
 
66
158
  private
@@ -75,5 +167,32 @@ module RedAmber
75
167
  DataFrame.new(@table[keys])
76
168
  end
77
169
  end
170
+
171
+ # Accepts indices by numeric Vector
172
+ def take_by_array(indices)
173
+ raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric?
174
+ raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
175
+
176
+ normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
177
+ raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
178
+
179
+ index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
180
+
181
+ datum = Arrow::Function.find(:take).execute([table, index_array])
182
+ DataFrame.new(datum.value)
183
+ end
184
+
185
+ # Accepts booleans by Arrow::BooleanArray
186
+ def filter_by_vector(boolean_array)
187
+ raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
188
+
189
+ datum = Arrow::Function.find(:filter).execute([table, boolean_array])
190
+ DataFrame.new(datum.value)
191
+ end
192
+
193
+ # return a DataFrame with same keys as self without values
194
+ def remove_all_values
195
+ filter_by_vector(Arrow::BooleanArray.new([false] * size))
196
+ end
78
197
  end
79
198
  end
@@ -129,5 +129,9 @@ module RedAmber
129
129
  arrays << Arrow::ChunkedArray.new([a])
130
130
  end
131
131
  end
132
+
133
+ def keys_by_booleans(booleans)
134
+ keys.select.with_index { |_, i| booleans[i] }
135
+ end
132
136
  end
133
137
  end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-in for the class DataFrame
5
+ module Helper
6
+ private
7
+
8
+ def pl(num)
9
+ num > 1 ? 's' : ''
10
+ end
11
+
12
+ def out_of_range?(indeces)
13
+ indeces.max >= size || indeces.min < -size
14
+ end
15
+
16
+ def integers?(enum)
17
+ enum.all?(Integer)
18
+ end
19
+
20
+ def sym_or_str?(enum)
21
+ enum.all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
22
+ end
23
+
24
+ def booleans?(enum)
25
+ enum.all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
26
+ end
27
+
28
+ def create_dataframe_from_vector(key, vector)
29
+ DataFrame.new(key => vector.data)
30
+ end
31
+
32
+ def parse_to_vector(args)
33
+ a = args.reduce([]) do |accum, elem|
34
+ accum.concat(normalize_element(elem))
35
+ end
36
+ Vector.new(a)
37
+ end
38
+
39
+ def normalize_element(elem)
40
+ case elem
41
+ when Numeric, String, Symbol, TrueClass, FalseClass, NilClass
42
+ [elem]
43
+ when Range
44
+ both_end = [elem.begin, elem.end]
45
+ both_end[1] -= 1 if elem.exclude_end? && elem.end.is_a?(Integer)
46
+
47
+ if both_end.any?(Integer) || both_end.all?(&:nil?)
48
+ if both_end.any? { |e| e&.>=(size) || e&.<(-size) }
49
+ raise DataFrameArgumentError, "Index out of range: #{elem} for 0..#{size - 1}"
50
+ end
51
+
52
+ (0...size).to_a[elem]
53
+ else
54
+ elem.to_a
55
+ end
56
+ else
57
+ Array(elem)
58
+ end
59
+ end
60
+ end
61
+ end
@@ -1,25 +1,37 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # Columnar data object
4
+ # Values in variable (columnar) data object
5
5
  # @data : holds Arrow::ChunkedArray
6
6
  class Vector
7
7
  # mix-in
8
- include VectorCompensable
9
8
  include VectorFunctions
9
+ include VectorUpdatable
10
+ include VectorSelectable
11
+ include Helper
10
12
 
11
- # chunked_array may come from column.data
12
- def initialize(array)
13
+ def initialize(*array)
13
14
  @key = nil # default is 'headless'
14
- case array
15
- when Vector
16
- @data = array.data
17
- when Arrow::Array, Arrow::ChunkedArray
18
- @data = array
19
- when Array
20
- @data = Arrow::Array.new(array)
15
+ if array.empty? || array[0].nil?
16
+ Vector.new([])
21
17
  else
22
- raise VectorArgumentError, 'Unknown array in argument'
18
+ array.flatten!
19
+ case array[0]
20
+ when Vector
21
+ @data = array[0].data
22
+ return
23
+ when Arrow::Array, Arrow::ChunkedArray
24
+ @data = array[0]
25
+ return
26
+ when Range
27
+ @data = Arrow::Array.new(Array(array[0]))
28
+ return
29
+ end
30
+ begin
31
+ @data = Arrow::Array.new(Array(array))
32
+ rescue Error
33
+ raise VectorArgumentError, "Invalid argument: #{array}"
34
+ end
23
35
  end
24
36
  end
25
37
 
@@ -52,6 +64,16 @@ module RedAmber
52
64
  alias_method :to_a, :values
53
65
  alias_method :entries, :values
54
66
 
67
+ def indices
68
+ (0...size).to_a
69
+ end
70
+ alias_method :indexes, :indices
71
+ alias_method :indeces, :indices
72
+
73
+ def to_ary
74
+ to_a
75
+ end
76
+
55
77
  def size
56
78
  # only defined :length in Arrow?
57
79
  @data.length
@@ -60,6 +82,10 @@ module RedAmber
60
82
  alias_method :n_rows, :size
61
83
  alias_method :nrow, :size
62
84
 
85
+ def empty?
86
+ size.zero?
87
+ end
88
+
63
89
  def type
64
90
  @data.value_type.nick.to_sym
65
91
  end
@@ -124,5 +150,9 @@ module RedAmber
124
150
  def n_nans
125
151
  numeric? ? is_nan.to_a.count(true) : 0
126
152
  end
153
+
154
+ def has_nil?
155
+ is_nil.any
156
+ end
127
157
  end
128
158
  end
@@ -16,11 +16,13 @@ module RedAmber
16
16
  unary_aggregations.each do |function|
17
17
  define_method(function) do |opts: nil|
18
18
  datum = exec_func_unary(function, options: opts)
19
- take_out_scalar(datum)
19
+ get_scalar(datum)
20
20
  end
21
21
  end
22
22
  alias_method :median, :approximate_median
23
23
  alias_method :count_uniq, :count_distinct
24
+ alias_method :all?, :all
25
+ alias_method :any?, :any
24
26
 
25
27
  def unbiased_variance
26
28
  variance(opts: { ddof: 1 })
@@ -47,7 +49,7 @@ module RedAmber
47
49
  unary_element_wise.each do |function|
48
50
  define_method(function) do |opts: nil|
49
51
  datum = exec_func_unary(function, options: opts)
50
- take_out_element_wise(datum)
52
+ Vector.new(datum.value)
51
53
  end
52
54
  end
53
55
  alias_method :is_nil, :is_null
@@ -72,12 +74,12 @@ module RedAmber
72
74
  unary_element_wise_op.each do |function, operator|
73
75
  define_method(function) do |opts: nil|
74
76
  datum = exec_func_unary(function, options: opts)
75
- take_out_element_wise(datum)
77
+ Vector.new(datum.value)
76
78
  end
77
79
 
78
80
  define_method(operator) do |opts: nil|
79
81
  datum = exec_func_unary(function, options: opts)
80
- take_out_element_wise(datum)
82
+ Vector.new(datum.value)
81
83
  end
82
84
  end
83
85
  alias_method :not, :invert
@@ -95,7 +97,7 @@ module RedAmber
95
97
  binary_element_wise.each do |function|
96
98
  define_method(function) do |other, opts: nil|
97
99
  datum = exec_func_binary(function, other, options: opts)
98
- take_out_element_wise(datum)
100
+ Vector.new(datum.value)
99
101
  end
100
102
  end
101
103
 
@@ -111,7 +113,7 @@ module RedAmber
111
113
  logical_binary_element_wise.each do |method, function|
112
114
  define_method(method) do |other, opts: nil|
113
115
  datum = exec_func_binary(function, other, options: opts)
114
- take_out_element_wise(datum)
116
+ Vector.new(datum.value)
115
117
  end
116
118
  end
117
119
 
@@ -144,12 +146,12 @@ module RedAmber
144
146
  binary_element_wise_op.each do |function, operator|
145
147
  define_method(function) do |other, opts: nil|
146
148
  datum = exec_func_binary(function, other, options: opts)
147
- take_out_element_wise(datum)
149
+ Vector.new(datum.value)
148
150
  end
149
151
 
150
152
  define_method(operator) do |other, opts: nil|
151
153
  datum = exec_func_binary(function, other, options: opts)
152
- take_out_element_wise(datum)
154
+ Vector.new(datum.value)
153
155
  end
154
156
  end
155
157
  alias_method :eq, :equal
@@ -159,8 +161,17 @@ module RedAmber
159
161
  alias_method :lt, :less
160
162
  alias_method :ne, :not_equal
161
163
 
164
+ def coerce(other)
165
+ case other
166
+ when Vector, Array, Arrow::Array
167
+ raise VectorArgumentError, "Size unmatch: #{size} != #{other.length}" unless size == other.length
168
+
169
+ [Vector.new(Array(other)), self]
170
+ end
171
+ [Vector.new(Array(other) * size), self]
172
+ end
173
+
162
174
  # (array functions)
163
- # array_filter, array_take
164
175
  # dictionary_encode,
165
176
  # partition_nth_indices,
166
177
  # quarter, quarters_between,
@@ -192,17 +203,17 @@ module RedAmber
192
203
  # strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
193
204
 
194
205
  # (onditional)
195
- # case_when, cast, if_else
206
+ # case_when, cast,
196
207
 
197
208
  # (indices)
198
209
  # choose, index_in, index_in_meta_binary, indices_nonzero
199
210
 
200
211
  # (others)
201
- # coalesce, drop_null,
202
- # filter, is_in, is_in_meta_binary,
212
+ # coalesce,
213
+ # is_in_meta_binary,
203
214
  # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
204
215
  # max_element_wise, min_element_wise, random, select_k_unstable,
205
- # sort_indices, struct_field, take
216
+ # struct_field,
206
217
 
207
218
  private # =======
208
219
 
@@ -221,7 +232,7 @@ module RedAmber
221
232
  end
222
233
  end
223
234
 
224
- def take_out_scalar(datum)
235
+ def get_scalar(datum)
225
236
  output = datum.value
226
237
  case output
227
238
  when Arrow::StringScalar then output.to_s
@@ -232,10 +243,6 @@ module RedAmber
232
243
  end
233
244
  end
234
245
 
235
- def take_out_element_wise(datum)
236
- Vector.new(datum.value)
237
- end
238
-
239
246
  module_function # ======
240
247
 
241
248
  def find(function_name)
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
4
+ # reference: https://arrow.apache.org/docs/cpp/compute.html
5
+
6
+ module RedAmber
7
+ # mix-ins for class Vector
8
+ # Functions to select some data.
9
+ module VectorSelectable
10
+ def drop_nil
11
+ datum = find(:drop_null).execute([data])
12
+ Vector.new(datum.value)
13
+ end
14
+
15
+ # vector calculation version of selection by indices
16
+ # TODO: support for option {boundscheck: true}
17
+ def take(*indices)
18
+ indices.flatten!
19
+ return Vector.new([]) if indices.empty?
20
+
21
+ indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
22
+ indices = Vector.new(indices) unless indices.is_a?(Vector)
23
+
24
+ take_by_vector(indices) # returns sub Vector
25
+ end
26
+
27
+ # TODO: support for option {null_selection_behavior: :drop}
28
+ def filter(*booleans)
29
+ booleans.flatten!
30
+ return Vector.new([]) if booleans.empty?
31
+
32
+ b = booleans[0]
33
+ boolean_array =
34
+ case b
35
+ when Vector
36
+ raise VectorTypeError, 'Argument is not a boolean.' unless b.boolean?
37
+
38
+ b.data
39
+ when Arrow::BooleanArray
40
+ b
41
+ else
42
+ raise VectorTypeError, 'Argument is not a boolean.' unless booleans?(booleans)
43
+
44
+ Arrow::BooleanArray.new(booleans)
45
+ end
46
+
47
+ filter_by_array(boolean_array) # returns sub Vector
48
+ end
49
+
50
+ # @param indices
51
+ # @param booleans
52
+ def [](*args)
53
+ args.flatten!
54
+ return Vector.new([]) if args.empty?
55
+
56
+ arg = args[0]
57
+ case arg
58
+ when Vector
59
+ return take_by_vector(arg) if arg.numeric?
60
+ return filter_by_array(arg.data) if arg.boolean?
61
+
62
+ raise VectorTypeError, "Argument must be numeric or boolean: #{arg}"
63
+ when Arrow::BooleanArray
64
+ return filter_by_array(arg)
65
+ when Arrow::Array
66
+ array = arg
67
+ else
68
+ unless arg.is_a?(Numeric) || booleans?([arg])
69
+ raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
70
+ end
71
+ end
72
+ array ||= Arrow::Array.new(args)
73
+ return filter_by_array(array) if array.is_a?(Arrow::BooleanArray)
74
+
75
+ vector = Vector.new(array)
76
+ return take_by_vector(vector) if vector.numeric?
77
+
78
+ raise VectorArgumentError, "Invalid argument: #{args}"
79
+ end
80
+
81
+ # @param values [Array, Arrow::Array, Vector]
82
+ def is_in(*values)
83
+ values.flatten!
84
+ array =
85
+ case values[0]
86
+ when Vector
87
+ values[0].data
88
+ when Arrow::Array
89
+ values[0]
90
+ end
91
+ array ||= data.class.new(values)
92
+ Vector.new(data.is_in(array))
93
+ end
94
+
95
+ # Arrow's support required
96
+ def index(element)
97
+ to_a.index(element)
98
+ end
99
+
100
+ private
101
+
102
+ # Accepts indices by numeric Vector
103
+ def take_by_vector(indices)
104
+ raise VectorTypeError, "Indices must be numeric Vector: #{indices}" unless indices.numeric?
105
+ raise VectorArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
106
+
107
+ normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
108
+ raise VectorArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
109
+
110
+ index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
111
+
112
+ datum = find(:array_take).execute([data, index_array])
113
+ Vector.new(datum.value)
114
+ end
115
+
116
+ # Accepts booleans by Arrow::BooleanArray
117
+ def filter_by_array(boolean_array)
118
+ raise VectorArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
119
+
120
+ datum = find(:array_filter).execute([data, boolean_array])
121
+ Vector.new(datum.value)
122
+ end
123
+ end
124
+ end