red_amber 0.2.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -10,39 +10,37 @@ module RedAmber
10
10
  include VectorSelectable
11
11
  include Helper
12
12
 
13
+ using RefineArrayLike
14
+
15
+ # Quicker constructor of Vector.
16
+ #
17
+ def self.create(arrow_array)
18
+ instance = allocate
19
+ instance.instance_variable_set(:@data, arrow_array)
20
+ instance
21
+ end
22
+
23
+ # Create a Vector.
24
+ #
25
+ # @note default is headless Vector and '@key == nil'
13
26
  def initialize(*array)
14
- @key = nil # default is 'headless' Vector
15
- if array.empty? || array.first.nil?
16
- Vector.new([])
17
- else
18
- array.flatten!
19
- @data =
20
- case array
21
- in [Vector => v]
22
- v.data
23
- in [Arrow::Array => a]
24
- a
25
- in [Arrow::ChunkedArray => ca]
26
- ca
27
- in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
28
- arrow_array_like.to_arrow_array
29
- in [Range => r]
30
- Arrow::Array.new(Array(r))
31
- else
32
- begin
33
- Arrow::Array.new(Array(array))
34
- rescue Error
35
- raise VectorArgumentError, "Invalid argument: #{array}"
36
- end
37
- end
38
- end
27
+ @data =
28
+ case array
29
+ in [Vector => v]
30
+ v.data
31
+ in [Range => r]
32
+ Arrow::Array.new(Array(r))
33
+ in [Arrow::Array | Arrow::ChunkedArray]
34
+ array[0]
35
+ in [arrow_array_like] if arrow_array_like.respond_to?(:to_arrow_array)
36
+ arrow_array_like.to_arrow_array
37
+ else
38
+ Arrow::Array.new(array.flatten)
39
+ end
39
40
  end
40
41
 
41
42
  attr_reader :data
42
-
43
- def to_arrow_array
44
- @data
45
- end
43
+ alias_method :to_arrow_array, :data
46
44
 
47
45
  attr_accessor :key
48
46
 
@@ -52,45 +50,46 @@ module RedAmber
52
50
 
53
51
  def inspect(limit: 80)
54
52
  if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table').casecmp('MINIMUM').zero?
55
- # Better performance than `.upcase == 'MINIMUM'``
53
+ # Better performance than `.upcase == 'MINIMUM'`
56
54
  "#{self.class}(:#{type}, size=#{size})"
57
55
  else
58
56
  sio = StringIO.new << '['
59
- to_a.each_with_object(sio).with_index do |(e, s), i|
60
- next_str = "#{s.size > 1 ? ', ' : ''}#{e.inspect}"
61
- if (s.size + next_str.size) < limit
62
- s << next_str
57
+ each.with_index do |e, i|
58
+ next_str = "#{sio.size > 1 ? ', ' : ''}#{e.inspect}"
59
+ if (sio.size + next_str.size) < limit
60
+ sio << next_str
63
61
  else
64
- s << ', ... ' if i < size
62
+ sio << ', ... ' if i < size
65
63
  break
66
64
  end
67
65
  end
68
66
  sio << ']'
69
67
 
70
- format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n", object_id, sio.string
68
+ format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n",
69
+ object_id, sio.string
71
70
  end
72
71
  end
73
72
 
74
- def values
73
+ def to_ary
75
74
  @data.values
76
75
  end
77
- alias_method :to_a, :values
78
- alias_method :entries, :values
76
+
77
+ alias_method :to_a, :to_ary
78
+ alias_method :values, :to_ary
79
+ alias_method :entries, :to_ary
79
80
 
80
81
  def indices
81
82
  (0...size).to_a
82
83
  end
84
+
83
85
  alias_method :indexes, :indices
84
86
  alias_method :indeces, :indices
85
87
 
86
- def to_ary
87
- values
88
- end
89
-
90
88
  def size
91
89
  # only defined :length in Arrow?
92
90
  @data.length
93
91
  end
92
+
94
93
  alias_method :length, :size
95
94
  alias_method :n_rows, :size
96
95
  alias_method :nrow, :size
@@ -100,39 +99,43 @@ module RedAmber
100
99
  end
101
100
 
102
101
  def type
103
- @data.value_type.nick.to_sym
102
+ list? ? :list : @data.value_type.nick.to_sym
104
103
  end
105
104
 
106
105
  def boolean?
107
- type_class == Arrow::BooleanDataType
106
+ @data.boolean?
108
107
  end
109
108
 
110
109
  def numeric?
111
- type_class < Arrow::NumericDataType
110
+ @data.numeric?
112
111
  end
113
112
 
114
113
  def float?
115
- type_class < Arrow::FloatingPointDataType
114
+ @data.float?
116
115
  end
117
116
 
118
117
  def integer?
119
- type_class < Arrow::IntegerDataType
118
+ @data.integer?
120
119
  end
121
120
 
122
121
  def string?
123
- type_class == Arrow::StringDataType
122
+ @data.string?
124
123
  end
125
124
 
126
125
  def dictionary?
127
- type_class == Arrow::DictionaryDataType
126
+ @data.dictionary?
128
127
  end
129
128
 
130
129
  def temporal?
131
- type_class < Arrow::TemporalDataType
130
+ @data.temporal?
131
+ end
132
+
133
+ def list?
134
+ @data.list?
132
135
  end
133
136
 
134
137
  def type_class
135
- @data.value_data_type.class
138
+ @data.type_class
136
139
  end
137
140
 
138
141
  def each
@@ -12,7 +12,8 @@ module RedAmber
12
12
  module VectorFunctions
13
13
  # [Unary aggregations]: vector.func => scalar
14
14
  unary_aggregations =
15
- %i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
15
+ %i[all any approximate_median count count_distinct max mean min min_max
16
+ product stddev sum variance]
16
17
  unary_aggregations.each do |function|
17
18
  define_method(function) do |**options|
18
19
  datum = exec_func_unary(function, options)
@@ -54,7 +55,10 @@ module RedAmber
54
55
  # @param min_count [Integer] min count.
55
56
  # @return [Float] quantile.
56
57
  def quantile(prob = 0.5, interpolation: :linear, skip_nils: true, min_count: 0)
57
- raise VectorArgumentError, "Invalid: probability #{prob} must be between 0 and 1" unless (0..1).cover? prob
58
+ unless (0..1).cover? prob
59
+ raise VectorArgumentError,
60
+ "Invalid: probability #{prob} must be between 0 and 1"
61
+ end
58
62
 
59
63
  datum = find(:quantile).execute([data],
60
64
  q: prob,
@@ -66,7 +70,8 @@ module RedAmber
66
70
 
67
71
  # Return quantiles in a DataFrame
68
72
  #
69
- def quantiles(probs = [1.0, 0.75, 0.5, 0.25, 0.0], interpolation: :linear, skip_nils: true, min_count: 0)
73
+ def quantiles(probs = [1.0, 0.75, 0.5, 0.25, 0.0],
74
+ interpolation: :linear, skip_nils: true, min_count: 0)
70
75
  if probs.empty? || !probs.all? { |q| (0..1).cover?(q) }
71
76
  raise VectorArgumentError, "Invarid probavilities #{probs}"
72
77
  end
@@ -74,20 +79,23 @@ module RedAmber
74
79
  DataFrame.new(
75
80
  probs: probs,
76
81
  quantiles: probs.map do |q|
77
- quantile(q, interpolation: interpolation, skip_nils: skip_nils, min_count: min_count)
82
+ quantile(q,
83
+ interpolation: interpolation, skip_nils: skip_nils,
84
+ min_count: min_count)
78
85
  end
79
86
  )
80
87
  end
81
88
 
82
89
  # [Unary element-wise]: vector.func => vector
83
90
  unary_element_wise =
84
- %i[abs acos asin array_sort_indices atan bit_wise_not ceil cos fill_null_backward \
85
- fill_null_forward floor is_finite is_inf is_nan is_null is_valid ln log10 log1p log2 \
91
+ %i[abs acos asin array_sort_indices atan bit_wise_not ceil cos
92
+ fill_null_backward fill_null_forward floor
93
+ is_finite is_inf is_nan is_null is_valid ln log10 log1p log2
86
94
  round round_to_multiple sign sin tan trunc unique]
87
95
  unary_element_wise.each do |function|
88
96
  define_method(function) do |**options|
89
97
  datum = exec_func_unary(function, options)
90
- Vector.new(datum.value)
98
+ Vector.create(datum.value)
91
99
  end
92
100
  end
93
101
  alias_method :is_nil, :is_null
@@ -113,12 +121,12 @@ module RedAmber
113
121
  unary_element_wise_op.each do |function, operator|
114
122
  define_method(function) do |**options|
115
123
  datum = exec_func_unary(function, options)
116
- Vector.new(datum.value)
124
+ Vector.create(datum.value)
117
125
  end
118
126
 
119
127
  define_method(operator) do |**options|
120
128
  datum = exec_func_unary(function, options)
121
- Vector.new(datum.value)
129
+ Vector.create(datum.value)
122
130
  end
123
131
  end
124
132
  alias_method :not, :invert
@@ -129,7 +137,7 @@ module RedAmber
129
137
  binary_element_wise.each do |function|
130
138
  define_method(function) do |other, **options|
131
139
  datum = exec_func_binary(function, other, options)
132
- Vector.new(datum.value)
140
+ Vector.create(datum.value)
133
141
  end
134
142
  end
135
143
 
@@ -145,7 +153,7 @@ module RedAmber
145
153
  logical_binary_element_wise.each do |method, function|
146
154
  define_method(method) do |other, **options|
147
155
  datum = exec_func_binary(function, other, options)
148
- Vector.new(datum.value)
156
+ Vector.create(datum.value)
149
157
  end
150
158
  end
151
159
 
@@ -171,12 +179,12 @@ module RedAmber
171
179
  binary_element_wise_op.each do |function, operator|
172
180
  define_method(function) do |other, **options|
173
181
  datum = exec_func_binary(function, other, options)
174
- Vector.new(datum.value)
182
+ Vector.create(datum.value)
175
183
  end
176
184
 
177
185
  define_method(operator) do |other, **options|
178
186
  datum = exec_func_binary(function, other, options)
179
- Vector.new(datum.value)
187
+ Vector.create(datum.value)
180
188
  end
181
189
  end
182
190
  alias_method :eq, :equal
@@ -190,67 +198,6 @@ module RedAmber
190
198
  [Vector.new(Array(other) * size), self]
191
199
  end
192
200
 
193
- # < Not implimented yet > ---
194
-
195
- # option(s) required
196
- # - index
197
-
198
- # Returns other than value
199
- # - mode
200
- # - tdigest
201
-
202
- # Functions with numerical range check (unary)
203
- # - abs_checked acos_checked asin_checked cos_checked ln_checked
204
- # log10_checked log1p_checked log2_checked sin_checked tan_checked
205
-
206
- # Functions with numerical range check (binary)
207
- # - add_checked divide_checked logb_checked multiply_checked power_checked subtract_checked
208
- # shift_left_checked shift_right_checked
209
-
210
- # (array functions)
211
- # dictionary_encode,
212
- # partition_nth_indices,
213
- # quarter, quarters_between,
214
-
215
- # (strings)
216
- # ascii_capitalize, ascii_center, ascii_is_alnum, ascii_is_alpha, ascii_is_decimal,
217
- # ascii_is_lower, ascii_is_printable, ascii_is_space, ascii_is_title, ascii_is_upper,
218
- # ascii_lower, ascii_lpad, ascii_ltrim, ascii_ltrim_whitespace, ascii_reverse,
219
- # ascii_rpad, ascii_rtrim, ascii_rtrim_whitespace, ascii_split_whitespace,
220
- # ascii_swapcase, ascii_title, ascii_trim, ascii_trim_whitespace, ascii_upper,
221
- # binary_join, binary_join_element_wise, binary_length, binary_repeat,
222
- # binary_replace_slice, binary_reverse, count_substring, count_substring_regex,
223
- # ends_with, extract_regex, find_substring, find_substring_regex,
224
- # match_like, match_substring, match_substring_regex, replace_substring,
225
- # replace_substring_regex, split_pattern, split_pattern_regex, starts_with,
226
- # string_is_ascii, utf8_capitalize, utf8_center, utf8_is_alnum, utf8_is_alpha,
227
- # utf8_is_decimal, utf8_is_digit, utf8_is_lower, utf8_is_numeric, utf8_is_printable,
228
- # utf8_is_space, utf8_is_title, utf8_is_upper, utf8_length, utf8_lower, utf8_lpad,
229
- # utf8_ltrim, utf8_ltrim_whitespace, utf8_normalize, utf8_replace_slice, utf8_reverse,
230
- # utf8_rpad, utf8_rtrim, utf8_rtrim_whitespace, utf8_slice_codeunits, utf8_split_whitespace,
231
- # utf8_swapcase, utf8_title, utf8_trim, utf8_trim_whitespace, utf8_upper
232
-
233
- # (temporal)
234
- # assume_timezone, ceil_temporal, day, day_of_week, day_of_year, day_time_interval_between,
235
- # days_between, floor_temporal, hour, hours_between, iso_calendar, iso_week, iso_year,
236
- # microsecond, microseconds_between, millisecond, milliseconds_between, minute,
237
- # minutes_between, month, month_day_nano_interval_between, month_interval_between,
238
- # nanosecond, nanoseconds_between, round_temporal, second, seconds_between, strftime,
239
- # strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
240
-
241
- # (onditional)
242
- # case_when, cast,
243
-
244
- # (indices)
245
- # choose, index_in, index_in_meta_binary, indices_nonzero
246
-
247
- # (others)
248
- # coalesce,
249
- # is_in_meta_binary,
250
- # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
251
- # max_element_wise, min_element_wise, random, select_k_unstable,
252
- # struct_field,
253
-
254
201
  private # =======
255
202
 
256
203
  def exec_func_unary(function, options)
@@ -263,7 +210,8 @@ module RedAmber
263
210
  case other
264
211
  when Vector
265
212
  find(function).execute([data, other.data], options)
266
- when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
213
+ when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar,
214
+ Array, Numeric, String, TrueClass, FalseClass
267
215
  find(function).execute([data, other], options)
268
216
  end
269
217
  end
@@ -4,91 +4,122 @@
4
4
  # reference: https://arrow.apache.org/docs/cpp/compute.html
5
5
 
6
6
  module RedAmber
7
- # mix-ins for class Vector
8
- # Functions to select some data.
7
+ # mix-in for class Vector
8
+ # Functions to select some data.
9
9
  module VectorSelectable
10
- def drop_nil
11
- datum = find(:drop_null).execute([data])
12
- Vector.new(datum.value)
13
- end
10
+ using RefineArray
11
+ using RefineArrayLike
12
+
13
+ # Select elements in the self by indices.
14
+ #
15
+ # @param indices [Array<Numeric>, Vector] indices.
16
+ # @yield [Array<Numeric>, Vector] indices.
17
+ # @return [Vector] Vector by selected elements.
18
+ #
19
+ # TODO: support for the option `boundscheck: true`
20
+ def take(*indices, &block)
21
+ if block
22
+ unless indices.empty?
23
+ raise VectorArgumentError, 'Must not specify both arguments and block.'
24
+ end
14
25
 
15
- # vector calculation version of selection by indices
16
- # TODO: support for option {boundscheck: true}
17
- def take(*indices)
18
- indices.flatten!
19
- return Vector.new([]) if indices.empty?
26
+ indices = [yield]
27
+ end
20
28
 
21
- indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
22
- indices = Vector.new(indices) unless indices.is_a?(Vector)
29
+ vector =
30
+ case indices
31
+ in [Vector => v] if v.numeric?
32
+ return Vector.create(take_by_vector(v))
33
+ in []
34
+ return Vector.new
35
+ in [(Arrow::Array | Arrow::ChunkedArray) => aa]
36
+ Vector.create(aa)
37
+ else
38
+ Vector.new(indices.flatten)
39
+ end
23
40
 
24
- take_by_vector(indices) # returns sub Vector
41
+ unless vector.numeric?
42
+ raise VectorArgumentError, "argument must be a integers: #{indices}"
43
+ end
44
+
45
+ Vector.create(take_by_vector(vector))
25
46
  end
26
47
 
27
- # TODO: support for option {null_selection_behavior: :drop}
48
+ # Select elements in the self by booleans.
49
+ #
50
+ # @param booleans [Array<true, false, nil>, Vector] booleans.
51
+ # @yield [Array<true, false, nil>, Vector] booleans.
52
+ # @return [Vector] Vector by selected elements.
53
+ #
54
+ # TODO: support for the option `null_selection_behavior: :drop`
28
55
  def filter(*booleans, &block)
29
56
  if block
30
- raise VectorArgumentError, 'Must not specify both arguments and block.' unless booleans.empty?
57
+ unless booleans.empty?
58
+ raise VectorArgumentError, 'Must not specify both arguments and block.'
59
+ end
31
60
 
32
61
  booleans = [yield]
33
62
  end
34
63
 
35
- booleans.flatten!
36
- return Vector.new([]) if booleans.empty?
37
-
38
- b = booleans[0]
39
- boolean_array =
40
- case b
41
- when Vector
42
- raise VectorTypeError, 'Argument is not a boolean.' unless b.boolean?
64
+ case booleans
65
+ in [Vector => v]
66
+ raise VectorTypeError, 'Argument is not a boolean.' unless v.boolean?
43
67
 
44
- b.data
45
- when Arrow::BooleanArray
46
- b
68
+ Vector.create(filter_by_array(v.data))
69
+ in [Arrow::BooleanArray => ba]
70
+ Vector.create(filter_by_array(ba))
71
+ in []
72
+ Vector.new
73
+ else
74
+ booleans.flatten!
75
+ a = Arrow::Array.new(booleans)
76
+ if a.boolean?
77
+ Vector.create(filter_by_array(a))
78
+ elsif booleans.compact.empty? # [nil, nil] becomes string array
79
+ Vector.new
47
80
  else
48
- raise VectorTypeError, 'Argument is not a boolean.' unless booleans?(booleans)
49
-
50
- Arrow::BooleanArray.new(booleans)
81
+ raise VectorTypeError, "Argument is not a boolean: #{booleans}"
51
82
  end
52
-
53
- filter_by_array(boolean_array) # returns sub Vector
83
+ end
54
84
  end
55
85
  alias_method :select, :filter
56
86
  alias_method :find_all, :filter
57
87
 
58
- # @param indices
59
- # @param booleans
88
+ # Select elements in the self by indices or booleans.
89
+ #
90
+ # @param args [Array<Numeric, true, false, nil>, Vector] specifier.
91
+ # @yield [Array<Numeric, true, false, nil>, Vector] specifier.
92
+ # @return [scalar, Array] returns scalar or array.
93
+ #
60
94
  def [](*args)
61
- args.flatten!
62
- return Vector.new([]) if args.empty?
63
-
64
- arg = args[0]
65
- case arg
66
- when Vector
67
- return take_by_vector(arg) if arg.numeric?
68
- return filter_by_array(arg.data) if arg.boolean?
69
-
70
- raise VectorTypeError, "Argument must be numeric or boolean: #{arg}"
71
- when Arrow::BooleanArray
72
- return filter_by_array(arg)
73
- when Arrow::Array
74
- array = arg
75
- when Range
76
- array = normalize_element(arg)
77
- else
78
- unless arg.is_a?(Numeric) || booleans?([arg])
79
- raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
95
+ array =
96
+ case args
97
+ in [Vector => v]
98
+ return scalar_or_array(take_by_vector(v)) if v.numeric?
99
+ return scalar_or_array(filter_by_array(v.data)) if v.boolean?
100
+
101
+ raise VectorTypeError, "Argument must be numeric or boolean: #{args}"
102
+ in [Arrow::BooleanArray => ba]
103
+ return scalar_or_array(filter_by_array(ba))
104
+ in []
105
+ return nil
106
+ in [Arrow::Array => arrow_array]
107
+ arrow_array
108
+ in [Range => r]
109
+ Arrow::Array.new(parse_range(r, size))
110
+ else
111
+ Arrow::Array.new(args.flatten)
80
112
  end
81
- end
82
- array ||= Arrow::Array.new(args)
83
- return filter_by_array(array) if array.is_a?(Arrow::BooleanArray)
113
+
114
+ return scalar_or_array(filter_by_array(array)) if array.boolean?
84
115
 
85
116
  vector = Vector.new(array)
86
- return take_by_vector(vector) if vector.numeric?
117
+ return scalar_or_array(take_by_vector(vector)) if vector.numeric?
87
118
 
88
119
  raise VectorArgumentError, "Invalid argument: #{args}"
89
120
  end
90
121
 
91
- # @param values [Array, Arrow::Array, Vector]
122
+ # @param values [Array, Arrow::Array, Vector]
92
123
  def is_in(*values)
93
124
  self_data = chunked? ? data.pack : data
94
125
 
@@ -100,7 +131,7 @@ module RedAmber
100
131
  Array(values).flatten
101
132
  end
102
133
 
103
- Vector.new(self_data.is_in(array))
134
+ Vector.create(self_data.is_in(array))
104
135
  end
105
136
 
106
137
  # Arrow's support required
@@ -108,28 +139,44 @@ module RedAmber
108
139
  to_a.index(element)
109
140
  end
110
141
 
142
+ def drop_nil
143
+ datum = find(:drop_null).execute([data])
144
+ Vector.create(datum.value)
145
+ end
146
+
111
147
  private
112
148
 
113
149
  # Accepts indices by numeric Vector
114
150
  def take_by_vector(indices)
115
- raise VectorTypeError, "Indices must be numeric Vector: #{indices}" unless indices.numeric?
116
- raise VectorArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
151
+ indices = (indices < 0).if_else(indices + size, indices) if (indices < 0).any?
117
152
 
118
- normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
119
- raise VectorArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
153
+ min, max = indices.min_max
154
+ raise VectorArgumentError, "Index out of range: #{min}" if min < 0
155
+ raise VectorArgumentError, "Index out of range: #{max}" if max >= size
120
156
 
121
- index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
157
+ index_array =
158
+ if indices.float?
159
+ Arrow::UInt64ArrayBuilder.build(indices.data)
160
+ else
161
+ indices.data
162
+ end
122
163
 
123
- datum = find(:take).execute([data, index_array]) # :array_take will fail with ChunkedArray
124
- Vector.new(datum.value)
164
+ # :array_take will fail with ChunkedArray
165
+ find(:take).execute([data, index_array]).value
125
166
  end
126
167
 
127
168
  # Accepts booleans by Arrow::BooleanArray
128
169
  def filter_by_array(boolean_array)
129
- raise VectorArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
170
+ unless boolean_array.length == size
171
+ raise VectorArgumentError, 'Booleans must be same size as self.'
172
+ end
173
+
174
+ find(:array_filter).execute([data, boolean_array]).value
175
+ end
130
176
 
131
- datum = find(:array_filter).execute([data, boolean_array])
132
- Vector.new(datum.value)
177
+ def scalar_or_array(arrow_array)
178
+ a = arrow_array.to_a
179
+ a.size > 1 ? a : a[0]
133
180
  end
134
181
  end
135
182
  end