red_amber 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,39 +10,37 @@ module RedAmber
10
10
  include VectorSelectable
11
11
  include Helper
12
12
 
13
+ using RefineArrayLike
14
+
15
+ # Quicker constructor of Vector.
16
+ #
17
+ def self.create(arrow_array)
18
+ instance = allocate
19
+ instance.instance_variable_set(:@data, arrow_array)
20
+ instance
21
+ end
22
+
23
+ # Create a Vector.
24
+ #
25
+ # @note default is headless Vector and '@key == nil'
13
26
  def initialize(*array)
14
- @key = nil # default is 'headless' Vector
15
- if array.empty? || array.first.nil?
16
- Vector.new([])
17
- else
18
- array.flatten!
19
- @data =
20
- case array
21
- in [Vector => v]
22
- v.data
23
- in [Arrow::Array => a]
24
- a
25
- in [Arrow::ChunkedArray => ca]
26
- ca
27
- in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
28
- arrow_array_like.to_arrow_array
29
- in [Range => r]
30
- Arrow::Array.new(Array(r))
31
- else
32
- begin
33
- Arrow::Array.new(Array(array))
34
- rescue Error
35
- raise VectorArgumentError, "Invalid argument: #{array}"
36
- end
37
- end
38
- end
27
+ @data =
28
+ case array
29
+ in [Vector => v]
30
+ v.data
31
+ in [Range => r]
32
+ Arrow::Array.new(Array(r))
33
+ in [Arrow::Array | Arrow::ChunkedArray]
34
+ array[0]
35
+ in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
36
+ arrow_array_like.to_arrow_array
37
+ else
38
+ Arrow::Array.new(array.flatten)
39
+ end
39
40
  end
40
41
 
41
42
  attr_reader :data
42
-
43
- def to_arrow_array
44
- @data
45
- end
43
+ alias_method :to_arrow_array, :data
46
44
 
47
45
  attr_accessor :key
48
46
 
@@ -52,45 +50,46 @@ module RedAmber
52
50
 
53
51
  def inspect(limit: 80)
54
52
  if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table').casecmp('MINIMUM').zero?
55
- # Better performance than `.upcase == 'MINIMUM'``
53
+ # Better performance than `.upcase == 'MINIMUM'`
56
54
  "#{self.class}(:#{type}, size=#{size})"
57
55
  else
58
56
  sio = StringIO.new << '['
59
- to_a.each_with_object(sio).with_index do |(e, s), i|
60
- next_str = "#{s.size > 1 ? ', ' : ''}#{e.inspect}"
61
- if (s.size + next_str.size) < limit
62
- s << next_str
57
+ each.with_index do |e, i|
58
+ next_str = "#{sio.size > 1 ? ', ' : ''}#{e.inspect}"
59
+ if (sio.size + next_str.size) < limit
60
+ sio << next_str
63
61
  else
64
- s << ', ... ' if i < size
62
+ sio << ', ... ' if i < size
65
63
  break
66
64
  end
67
65
  end
68
66
  sio << ']'
69
67
 
70
- format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n", object_id, sio.string
68
+ format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n",
69
+ object_id, sio.string
71
70
  end
72
71
  end
73
72
 
74
- def values
73
+ def to_ary
75
74
  @data.values
76
75
  end
77
- alias_method :to_a, :values
78
- alias_method :entries, :values
76
+
77
+ alias_method :to_a, :to_ary
78
+ alias_method :values, :to_ary
79
+ alias_method :entries, :to_ary
79
80
 
80
81
  def indices
81
82
  (0...size).to_a
82
83
  end
84
+
83
85
  alias_method :indexes, :indices
84
86
  alias_method :indeces, :indices
85
87
 
86
- def to_ary
87
- values
88
- end
89
-
90
88
  def size
91
89
  # Arrow arrays appear to define only :length (not :size)?
92
90
  @data.length
93
91
  end
92
+
94
93
  alias_method :length, :size
95
94
  alias_method :n_rows, :size
96
95
  alias_method :nrow, :size
@@ -100,39 +99,43 @@ module RedAmber
100
99
  end
101
100
 
102
101
  def type
103
- @data.value_type.nick.to_sym
102
+ list? ? :list : @data.value_type.nick.to_sym
104
103
  end
105
104
 
106
105
  def boolean?
107
- type_class == Arrow::BooleanDataType
106
+ @data.boolean?
108
107
  end
109
108
 
110
109
  def numeric?
111
- type_class < Arrow::NumericDataType
110
+ @data.numeric?
112
111
  end
113
112
 
114
113
  def float?
115
- type_class < Arrow::FloatingPointDataType
114
+ @data.float?
116
115
  end
117
116
 
118
117
  def integer?
119
- type_class < Arrow::IntegerDataType
118
+ @data.integer?
120
119
  end
121
120
 
122
121
  def string?
123
- type_class == Arrow::StringDataType
122
+ @data.string?
124
123
  end
125
124
 
126
125
  def dictionary?
127
- type_class == Arrow::DictionaryDataType
126
+ @data.dictionary?
128
127
  end
129
128
 
130
129
  def temporal?
131
- type_class < Arrow::TemporalDataType
130
+ @data.temporal?
131
+ end
132
+
133
+ def list?
134
+ @data.list?
132
135
  end
133
136
 
134
137
  def type_class
135
- @data.value_data_type.class
138
+ @data.type_class
136
139
  end
137
140
 
138
141
  def each
@@ -12,7 +12,8 @@ module RedAmber
12
12
  module VectorFunctions
13
13
  # [Unary aggregations]: vector.func => scalar
14
14
  unary_aggregations =
15
- %i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
15
+ %i[all any approximate_median count count_distinct max mean min min_max
16
+ product stddev sum variance]
16
17
  unary_aggregations.each do |function|
17
18
  define_method(function) do |**options|
18
19
  datum = exec_func_unary(function, options)
@@ -54,7 +55,10 @@ module RedAmber
54
55
  # @param min_count [Integer] min count.
55
56
  # @return [Float] quantile.
56
57
  def quantile(prob = 0.5, interpolation: :linear, skip_nils: true, min_count: 0)
57
- raise VectorArgumentError, "Invalid: probability #{prob} must be between 0 and 1" unless (0..1).cover? prob
58
+ unless (0..1).cover? prob
59
+ raise VectorArgumentError,
60
+ "Invalid: probability #{prob} must be between 0 and 1"
61
+ end
58
62
 
59
63
  datum = find(:quantile).execute([data],
60
64
  q: prob,
@@ -66,7 +70,8 @@ module RedAmber
66
70
 
67
71
  # Return quantiles in a DataFrame
68
72
  #
69
- def quantiles(probs = [1.0, 0.75, 0.5, 0.25, 0.0], interpolation: :linear, skip_nils: true, min_count: 0)
73
+ def quantiles(probs = [1.0, 0.75, 0.5, 0.25, 0.0],
74
+ interpolation: :linear, skip_nils: true, min_count: 0)
70
75
  if probs.empty? || !probs.all? { |q| (0..1).cover?(q) }
71
76
  raise VectorArgumentError, "Invarid probavilities #{probs}"
72
77
  end
@@ -74,20 +79,23 @@ module RedAmber
74
79
  DataFrame.new(
75
80
  probs: probs,
76
81
  quantiles: probs.map do |q|
77
- quantile(q, interpolation: interpolation, skip_nils: skip_nils, min_count: min_count)
82
+ quantile(q,
83
+ interpolation: interpolation, skip_nils: skip_nils,
84
+ min_count: min_count)
78
85
  end
79
86
  )
80
87
  end
81
88
 
82
89
  # [Unary element-wise]: vector.func => vector
83
90
  unary_element_wise =
84
- %i[abs acos asin array_sort_indices atan bit_wise_not ceil cos fill_null_backward \
85
- fill_null_forward floor is_finite is_inf is_nan is_null is_valid ln log10 log1p log2 \
91
+ %i[abs acos asin array_sort_indices atan bit_wise_not ceil cos
92
+ fill_null_backward fill_null_forward floor
93
+ is_finite is_inf is_nan is_null is_valid ln log10 log1p log2
86
94
  round round_to_multiple sign sin tan trunc unique]
87
95
  unary_element_wise.each do |function|
88
96
  define_method(function) do |**options|
89
97
  datum = exec_func_unary(function, options)
90
- Vector.new(datum.value)
98
+ Vector.create(datum.value)
91
99
  end
92
100
  end
93
101
  alias_method :is_nil, :is_null
@@ -113,12 +121,12 @@ module RedAmber
113
121
  unary_element_wise_op.each do |function, operator|
114
122
  define_method(function) do |**options|
115
123
  datum = exec_func_unary(function, options)
116
- Vector.new(datum.value)
124
+ Vector.create(datum.value)
117
125
  end
118
126
 
119
127
  define_method(operator) do |**options|
120
128
  datum = exec_func_unary(function, options)
121
- Vector.new(datum.value)
129
+ Vector.create(datum.value)
122
130
  end
123
131
  end
124
132
  alias_method :not, :invert
@@ -129,7 +137,7 @@ module RedAmber
129
137
  binary_element_wise.each do |function|
130
138
  define_method(function) do |other, **options|
131
139
  datum = exec_func_binary(function, other, options)
132
- Vector.new(datum.value)
140
+ Vector.create(datum.value)
133
141
  end
134
142
  end
135
143
 
@@ -145,7 +153,7 @@ module RedAmber
145
153
  logical_binary_element_wise.each do |method, function|
146
154
  define_method(method) do |other, **options|
147
155
  datum = exec_func_binary(function, other, options)
148
- Vector.new(datum.value)
156
+ Vector.create(datum.value)
149
157
  end
150
158
  end
151
159
 
@@ -171,12 +179,12 @@ module RedAmber
171
179
  binary_element_wise_op.each do |function, operator|
172
180
  define_method(function) do |other, **options|
173
181
  datum = exec_func_binary(function, other, options)
174
- Vector.new(datum.value)
182
+ Vector.create(datum.value)
175
183
  end
176
184
 
177
185
  define_method(operator) do |other, **options|
178
186
  datum = exec_func_binary(function, other, options)
179
- Vector.new(datum.value)
187
+ Vector.create(datum.value)
180
188
  end
181
189
  end
182
190
  alias_method :eq, :equal
@@ -190,67 +198,6 @@ module RedAmber
190
198
  [Vector.new(Array(other) * size), self]
191
199
  end
192
200
 
193
- # < Not implimented yet > ---
194
-
195
- # option(s) required
196
- # - index
197
-
198
- # Returns other than value
199
- # - mode
200
- # - tdigest
201
-
202
- # Functions with numerical range check (unary)
203
- # - abs_checked acos_checked asin_checked cos_checked ln_checked
204
- # log10_checked log1p_checked log2_checked sin_checked tan_checked
205
-
206
- # Functions with numerical range check (binary)
207
- # - add_checked divide_checked logb_checked multiply_checked power_checked subtract_checked
208
- # shift_left_checked shift_right_checked
209
-
210
- # (array functions)
211
- # dictionary_encode,
212
- # partition_nth_indices,
213
- # quarter, quarters_between,
214
-
215
- # (strings)
216
- # ascii_capitalize, ascii_center, ascii_is_alnum, ascii_is_alpha, ascii_is_decimal,
217
- # ascii_is_lower, ascii_is_printable, ascii_is_space, ascii_is_title, ascii_is_upper,
218
- # ascii_lower, ascii_lpad, ascii_ltrim, ascii_ltrim_whitespace, ascii_reverse,
219
- # ascii_rpad, ascii_rtrim, ascii_rtrim_whitespace, ascii_split_whitespace,
220
- # ascii_swapcase, ascii_title, ascii_trim, ascii_trim_whitespace, ascii_upper,
221
- # binary_join, binary_join_element_wise, binary_length, binary_repeat,
222
- # binary_replace_slice, binary_reverse, count_substring, count_substring_regex,
223
- # ends_with, extract_regex, find_substring, find_substring_regex,
224
- # match_like, match_substring, match_substring_regex, replace_substring,
225
- # replace_substring_regex, split_pattern, split_pattern_regex, starts_with,
226
- # string_is_ascii, utf8_capitalize, utf8_center, utf8_is_alnum, utf8_is_alpha,
227
- # utf8_is_decimal, utf8_is_digit, utf8_is_lower, utf8_is_numeric, utf8_is_printable,
228
- # utf8_is_space, utf8_is_title, utf8_is_upper, utf8_length, utf8_lower, utf8_lpad,
229
- # utf8_ltrim, utf8_ltrim_whitespace, utf8_normalize, utf8_replace_slice, utf8_reverse,
230
- # utf8_rpad, utf8_rtrim, utf8_rtrim_whitespace, utf8_slice_codeunits, utf8_split_whitespace,
231
- # utf8_swapcase, utf8_title, utf8_trim, utf8_trim_whitespace, utf8_upper
232
-
233
- # (temporal)
234
- # assume_timezone, ceil_temporal, day, day_of_week, day_of_year, day_time_interval_between,
235
- # days_between, floor_temporal, hour, hours_between, iso_calendar, iso_week, iso_year,
236
- # microsecond, microseconds_between, millisecond, milliseconds_between, minute,
237
- # minutes_between, month, month_day_nano_interval_between, month_interval_between,
238
- # nanosecond, nanoseconds_between, round_temporal, second, seconds_between, strftime,
239
- # strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
240
-
241
- # (onditional)
242
- # case_when, cast,
243
-
244
- # (indices)
245
- # choose, index_in, index_in_meta_binary, indices_nonzero
246
-
247
- # (others)
248
- # coalesce,
249
- # is_in_meta_binary,
250
- # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
251
- # max_element_wise, min_element_wise, random, select_k_unstable,
252
- # struct_field,
253
-
254
201
  private # =======
255
202
 
256
203
  def exec_func_unary(function, options)
@@ -263,7 +210,8 @@ module RedAmber
263
210
  case other
264
211
  when Vector
265
212
  find(function).execute([data, other.data], options)
266
- when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
213
+ when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar,
214
+ Array, Numeric, String, TrueClass, FalseClass
267
215
  find(function).execute([data, other], options)
268
216
  end
269
217
  end
@@ -4,91 +4,122 @@
4
4
  # reference: https://arrow.apache.org/docs/cpp/compute.html
5
5
 
6
6
  module RedAmber
7
- # mix-ins for class Vector
8
- # Functions to select some data.
7
+ # mix-in for class Vector
8
+ # Functions to select some data.
9
9
  module VectorSelectable
10
- def drop_nil
11
- datum = find(:drop_null).execute([data])
12
- Vector.new(datum.value)
13
- end
10
+ using RefineArray
11
+ using RefineArrayLike
12
+
13
+ # Select elements in the self by indices.
14
+ #
15
+ # @param indices [Array<Numeric>, Vector] indices.
16
+ # @yield [Array<Numeric>, Vector] indices.
17
+ # @return [Vector] Vector by selected elements.
18
+ #
19
+ # TODO: support for the option `boundscheck: true`
20
+ def take(*indices, &block)
21
+ if block
22
+ unless indices.empty?
23
+ raise VectorArgumentError, 'Must not specify both arguments and block.'
24
+ end
14
25
 
15
- # vector calculation version of selection by indices
16
- # TODO: support for option {boundscheck: true}
17
- def take(*indices)
18
- indices.flatten!
19
- return Vector.new([]) if indices.empty?
26
+ indices = [yield]
27
+ end
20
28
 
21
- indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
22
- indices = Vector.new(indices) unless indices.is_a?(Vector)
29
+ vector =
30
+ case indices
31
+ in [Vector => v] if v.numeric?
32
+ return Vector.create(take_by_vector(v))
33
+ in []
34
+ return Vector.new
35
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
36
+ Vector.create(aa)
37
+ else
38
+ Vector.new(indices.flatten)
39
+ end
23
40
 
24
- take_by_vector(indices) # returns sub Vector
41
+ unless vector.numeric?
42
+ raise VectorArgumentError, "argument must be a integers: #{indices}"
43
+ end
44
+
45
+ Vector.create(take_by_vector(vector))
25
46
  end
26
47
 
27
- # TODO: support for option {null_selection_behavior: :drop}
48
+ # Select elements in the self by booleans.
49
+ #
50
+ # @param booleans [Array<true, false, nil>, Vector] booleans.
51
+ # @yield [Array<true, false, nil>, Vector] booleans.
52
+ # @return [Vector] Vector by selected elements.
53
+ #
54
+ # TODO: support for the option `null_selection_behavior: :drop`
28
55
  def filter(*booleans, &block)
29
56
  if block
30
- raise VectorArgumentError, 'Must not specify both arguments and block.' unless booleans.empty?
57
+ unless booleans.empty?
58
+ raise VectorArgumentError, 'Must not specify both arguments and block.'
59
+ end
31
60
 
32
61
  booleans = [yield]
33
62
  end
34
63
 
35
- booleans.flatten!
36
- return Vector.new([]) if booleans.empty?
37
-
38
- b = booleans[0]
39
- boolean_array =
40
- case b
41
- when Vector
42
- raise VectorTypeError, 'Argument is not a boolean.' unless b.boolean?
64
+ case booleans
65
+ in [Vector => v]
66
+ raise VectorTypeError, 'Argument is not a boolean.' unless v.boolean?
43
67
 
44
- b.data
45
- when Arrow::BooleanArray
46
- b
68
+ Vector.create(filter_by_array(v.data))
69
+ in [Arrow::BooleanArray => ba]
70
+ Vector.create(filter_by_array(ba))
71
+ in []
72
+ Vector.new
73
+ else
74
+ booleans.flatten!
75
+ a = Arrow::Array.new(booleans)
76
+ if a.boolean?
77
+ Vector.create(filter_by_array(a))
78
+ elsif booleans.compact.empty? # [nil, nil] becomes string array
79
+ Vector.new
47
80
  else
48
- raise VectorTypeError, 'Argument is not a boolean.' unless booleans?(booleans)
49
-
50
- Arrow::BooleanArray.new(booleans)
81
+ raise VectorTypeError, "Argument is not a boolean: #{booleans}"
51
82
  end
52
-
53
- filter_by_array(boolean_array) # returns sub Vector
83
+ end
54
84
  end
55
85
  alias_method :select, :filter
56
86
  alias_method :find_all, :filter
57
87
 
58
- # @param indices
59
- # @param booleans
88
+ # Select elements in the self by indices or booleans.
89
+ #
90
+ # @param args [Array<Numeric, true, false, nil>, Vector] specifier.
91
+ # @yield [Array<Numeric, true, false, nil>, Vector] specifier.
92
+ # @return [scalar, Array] returns scalar or array.
93
+ #
60
94
  def [](*args)
61
- args.flatten!
62
- return Vector.new([]) if args.empty?
63
-
64
- arg = args[0]
65
- case arg
66
- when Vector
67
- return take_by_vector(arg) if arg.numeric?
68
- return filter_by_array(arg.data) if arg.boolean?
69
-
70
- raise VectorTypeError, "Argument must be numeric or boolean: #{arg}"
71
- when Arrow::BooleanArray
72
- return filter_by_array(arg)
73
- when Arrow::Array
74
- array = arg
75
- when Range
76
- array = normalize_element(arg)
77
- else
78
- unless arg.is_a?(Numeric) || booleans?([arg])
79
- raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
95
+ array =
96
+ case args
97
+ in [Vector => v]
98
+ return scalar_or_array(take_by_vector(v)) if v.numeric?
99
+ return scalar_or_array(filter_by_array(v.data)) if v.boolean?
100
+
101
+ raise VectorTypeError, "Argument must be numeric or boolean: #{args}"
102
+ in [Arrow::BooleanArray => ba]
103
+ return scalar_or_array(filter_by_array(ba))
104
+ in []
105
+ return nil
106
+ in [Arrow::Array => arrow_array]
107
+ arrow_array
108
+ in [Range => r]
109
+ Arrow::Array.new(parse_range(r, size))
110
+ else
111
+ Arrow::Array.new(args.flatten)
80
112
  end
81
- end
82
- array ||= Arrow::Array.new(args)
83
- return filter_by_array(array) if array.is_a?(Arrow::BooleanArray)
113
+
114
+ return scalar_or_array(filter_by_array(array)) if array.boolean?
84
115
 
85
116
  vector = Vector.new(array)
86
- return take_by_vector(vector) if vector.numeric?
117
+ return scalar_or_array(take_by_vector(vector)) if vector.numeric?
87
118
 
88
119
  raise VectorArgumentError, "Invalid argument: #{args}"
89
120
  end
90
121
 
91
- # @param values [Array, Arrow::Array, Vector]
122
+ # @param values [Array, Arrow::Array, Vector]
92
123
  def is_in(*values)
93
124
  self_data = chunked? ? data.pack : data
94
125
 
@@ -100,7 +131,7 @@ module RedAmber
100
131
  Array(values).flatten
101
132
  end
102
133
 
103
- Vector.new(self_data.is_in(array))
134
+ Vector.create(self_data.is_in(array))
104
135
  end
105
136
 
106
137
  # Native support in Arrow is required (currently searches via to_a)
@@ -108,28 +139,44 @@ module RedAmber
108
139
  to_a.index(element)
109
140
  end
110
141
 
142
+ def drop_nil
143
+ datum = find(:drop_null).execute([data])
144
+ Vector.create(datum.value)
145
+ end
146
+
111
147
  private
112
148
 
113
149
  # Accepts indices by numeric Vector
114
150
  def take_by_vector(indices)
115
- raise VectorTypeError, "Indices must be numeric Vector: #{indices}" unless indices.numeric?
116
- raise VectorArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
151
+ indices = (indices < 0).if_else(indices + size, indices) if (indices < 0).any?
117
152
 
118
- normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
119
- raise VectorArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
153
+ min, max = indices.min_max
154
+ raise VectorArgumentError, "Index out of range: #{min}" if min < 0
155
+ raise VectorArgumentError, "Index out of range: #{max}" if max >= size
120
156
 
121
- index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
157
+ index_array =
158
+ if indices.float?
159
+ Arrow::UInt64ArrayBuilder.build(indices.data)
160
+ else
161
+ indices.data
162
+ end
122
163
 
123
- datum = find(:take).execute([data, index_array]) # :array_take will fail with ChunkedArray
124
- Vector.new(datum.value)
164
+ # :array_take will fail with ChunkedArray
165
+ find(:take).execute([data, index_array]).value
125
166
  end
126
167
 
127
168
  # Accepts booleans by Arrow::BooleanArray
128
169
  def filter_by_array(boolean_array)
129
- raise VectorArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
170
+ unless boolean_array.length == size
171
+ raise VectorArgumentError, 'Booleans must be same size as self.'
172
+ end
173
+
174
+ find(:array_filter).execute([data, boolean_array]).value
175
+ end
130
176
 
131
- datum = find(:array_filter).execute([data, boolean_array])
132
- Vector.new(datum.value)
177
+ def scalar_or_array(arrow_array)
178
+ a = arrow_array.to_a
179
+ a.size > 1 ? a : a[0]
133
180
  end
134
181
  end
135
182
  end