red_amber 0.1.3 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +31 -7
  3. data/CHANGELOG.md +214 -10
  4. data/Gemfile +4 -0
  5. data/README.md +117 -342
  6. data/benchmark/csv_load_penguins.yml +15 -0
  7. data/benchmark/drop_nil.yml +11 -0
  8. data/doc/DataFrame.md +854 -0
  9. data/doc/Vector.md +449 -0
  10. data/doc/image/arrow_table_new.png +0 -0
  11. data/doc/image/dataframe/assign.png +0 -0
  12. data/doc/image/dataframe/drop.png +0 -0
  13. data/doc/image/dataframe/pick.png +0 -0
  14. data/doc/image/dataframe/remove.png +0 -0
  15. data/doc/image/dataframe/rename.png +0 -0
  16. data/doc/image/dataframe/slice.png +0 -0
  17. data/doc/image/dataframe_model.png +0 -0
  18. data/doc/image/example_in_red_arrow.png +0 -0
  19. data/doc/image/tdr.png +0 -0
  20. data/doc/image/tdr_and_table.png +0 -0
  21. data/doc/image/tidy_data_in_TDR.png +0 -0
  22. data/doc/image/vector/binary_element_wise.png +0 -0
  23. data/doc/image/vector/unary_aggregation.png +0 -0
  24. data/doc/image/vector/unary_aggregation_w_option.png +0 -0
  25. data/doc/image/vector/unary_element_wise.png +0 -0
  26. data/doc/tdr.md +56 -0
  27. data/doc/tdr_ja.md +56 -0
  28. data/lib/red-amber.rb +27 -0
  29. data/lib/red_amber/data_frame.rb +91 -37
  30. data/lib/red_amber/{data_frame_output.rb → data_frame_displayable.rb} +49 -41
  31. data/lib/red_amber/data_frame_indexable.rb +38 -0
  32. data/lib/red_amber/data_frame_observation_operation.rb +11 -0
  33. data/lib/red_amber/data_frame_selectable.rb +155 -48
  34. data/lib/red_amber/data_frame_variable_operation.rb +137 -0
  35. data/lib/red_amber/helper.rb +61 -0
  36. data/lib/red_amber/vector.rb +69 -16
  37. data/lib/red_amber/vector_functions.rb +80 -45
  38. data/lib/red_amber/vector_selectable.rb +124 -0
  39. data/lib/red_amber/vector_updatable.rb +104 -0
  40. data/lib/red_amber/version.rb +1 -1
  41. data/lib/red_amber.rb +1 -16
  42. data/red_amber.gemspec +3 -6
  43. metadata +38 -9
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-ins for the class DataFrame
5
+ module DataFrameVariableOperation
6
# Picks up some variables (columns) to create a sub DataFrame.
#
# The variables are specified either by arguments or by the value of a block
# evaluated with instance_eval, never both. A specification is either
# variable keys (Symbols or Strings) or booleans; booleans are matched
# against `keys` by position and select the keys at truthy positions.
#
# Returns a new DataFrame. It is an empty DataFrame when nothing is selected.
# Raises DataFrameArgumentError when both arguments and a block are given,
# or when the specification is neither keys nor booleans.
def pick(*args, &block)
  picker = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

    picker = instance_eval(&block)
  end
  picker = [picker].flatten
  return DataFrame.new if picker.empty? || picker == [nil]

  if booleans?(picker)
    picker = keys_by_booleans(picker)
    # A mask without any truthy position selects nothing; answer with an
    # empty DataFrame (as #drop does) instead of asking the table for no keys.
    return DataFrame.new if picker.empty?
  end

  # DataFrame#[] creates a Vector when a single key is specified,
  # whereas DataFrame#pick creates a DataFrame even with a single key.
  return DataFrame.new(@table[picker]) if sym_or_str?(picker)

  # Report the value that was actually rejected; `args` is always empty
  # when the specification came from a block.
  raise DataFrameArgumentError, "Invalid argument #{block ? picker : args}"
end
25
+
26
# Drops some variables (columns) to create a DataFrame of the remainder.
#
# The variables are specified either by arguments or by the value of a block
# evaluated with instance_eval, never both. A specification is either
# variable keys (Symbols or Strings) or booleans; booleans are matched
# against `keys` by position and drop the keys at truthy positions.
#
# Returns a new DataFrame. It is an empty DataFrame when every variable is dropped.
# Raises DataFrameArgumentError when both arguments and a block are given,
# or when the specification is neither keys nor booleans.
def drop(*args, &block)
  dropper = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

    dropper = instance_eval(&block)
  end
  dropper = [dropper].flatten
  dropper = keys_by_booleans(dropper) if booleans?(dropper)

  # Validate before subtracting: an invalid element would otherwise be
  # ignored by `keys - dropper` and the whole DataFrame returned silently.
  raise DataFrameArgumentError, "Invalid argument #{block ? dropper : args}" unless sym_or_str?(dropper)

  # Variable keys are assumed to be Symbols (this module creates fields with
  # `to_sym` names), so String keys are converted to make 'a' drop :a.
  picker = keys - dropper.map(&:to_sym)
  return DataFrame.new if picker.empty?

  # DataFrame#[] creates a Vector when a single key is specified,
  # whereas DataFrame#drop creates a DataFrame even with a single key.
  DataFrame.new(@table[picker])
end
46
+
47
# Renames variables to create a new DataFrame.
#
# Accepted forms (by arguments, or as the value of a block evaluated with
# instance_eval, never both):
#   rename(from, to)        # a pair of Symbols/Strings
#   rename({from => to})    # a Hash of key pairs
#
# Returns self when nothing is specified.
# Raises DataFrameArgumentError for any other form.
def rename(*args, &block)
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?

    renamer = [instance_eval(&block)].flatten
  else
    renamer = args.flatten
  end
  return self if renamer.empty?

  # rename(from, to)
  pair_given = renamer.size == 2 && sym_or_str?(renamer)
  return rename_by_hash([renamer].to_h) if pair_given

  # rename({from => to})
  head = renamer[0]
  return rename_by_hash(head) if head.is_a?(Hash) && renamer.one?

  raise DataFrameArgumentError, "Invalid argument #{args}"
end
63
+
64
# Assigns variables to create a new DataFrame.
#
# The assignment is a single Hash of `key => values`, given either as an
# argument or as the value of a block evaluated with instance_eval, never
# both. Keys that already exist in `keys` update that variable in place;
# other keys are appended as new variables.
#
# Returns self when nothing (or nil) is specified.
# Raises DataFrameArgumentError when both arguments and a block are given,
# or when the specification is not a single Hash.
def assign(*args, &block)
  assigner = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?

    assigner = instance_eval(&block)
  end
  assigner = [assigner].flatten
  return self if assigner.empty? || assigner == [nil]

  unless assigner.one? && assigner[0].is_a?(Hash)
    raise DataFrameArgumentError, "Invalid argument #{block ? assigner : args}"
  end

  # Normalize keys to Symbols before splitting. Otherwise a String key that
  # names an existing variable is not found in `keys` and gets appended as a
  # duplicate column instead of updating the existing one.
  normalized = assigner[0].transform_keys(&:to_sym)
  updater, appender = normalized.partition { |key, _| keys.include?(key) }.map(&:to_h)

  fields, arrays = update_fields_and_arrays(updater)
  append_to_fields_and_arrays(appender, fields, arrays) unless appender.empty?

  DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), arrays))
end
91
+
92
+ private
93
+
94
# Builds a new DataFrame whose variables are renamed according to key_pairs.
#
# key_pairs is a Hash of `current_key => new_key`. Variables not mentioned
# keep their existing schema field. The column data is reused unchanged;
# only the schema is rebuilt.
def rename_by_hash(key_pairs)
  # Variable keys are looked up as they appear in `keys` (assumed Symbols),
  # so String source keys are converted; otherwise rename('a', 'b') would
  # match nothing and silently return an unchanged copy.
  key_pairs = key_pairs.transform_keys { |key| key.is_a?(String) ? key.to_sym : key }

  fields = keys.map do |key|
    new_key = key_pairs[key]
    if new_key
      Arrow::Field.new(new_key.to_sym, @table[key].data_type)
    else
      @table.schema[key]
    end
  end
  DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), @table.columns))
end
106
+
107
# Collects the fields and data of every column of @table, replacing the
# ones named in updater.
#
# updater is a Hash of `existing_key => new_values`; values may be a Vector
# or anything Arrow::Array.new accepts. Entries whose value is nil/false are
# left untouched.
#
# Returns [fields, arrays] in column order.
# Raises DataFrameArgumentError when new values differ in size from self.
def update_fields_and_arrays(updater)
  fields = @table.columns.map { |column| column.field }
  arrays = @table.columns.map { |column| column.data } # chunked_arrays

  keys.each.with_index do |name, position|
    replacement = updater[name]
    if replacement
      unless replacement.size == size
        raise DataFrameArgumentError, "Data size mismatch (#{replacement.size} != #{size})"
      end

      values = replacement.is_a?(Vector) ? replacement.to_a : replacement
      arrow_array = Arrow::Array.new(values)
      fields[position] = Arrow::Field.new(name, arrow_array.value_data_type)
      arrays[position] = Arrow::ChunkedArray.new([arrow_array])
    end
  end

  [fields, arrays]
end
122
+
123
# Appends a new field and chunked array to `fields` and `arrays` (both
# mutated in place) for every `key => values` entry of appender.
#
# Values may be a Vector or anything Arrow::Array.new accepts.
# Raises DataFrameArgumentError when values differ in size from self.
def append_to_fields_and_arrays(appender, fields, arrays)
  appender.each do |name, addition|
    unless addition.size == size
      raise DataFrameArgumentError, "Data size mismatch (#{addition.size} != #{size})"
    end

    values = addition.is_a?(Vector) ? addition.to_a : addition
    arrow_array = Arrow::Array.new(values)
    fields.push(Arrow::Field.new(name.to_sym, arrow_array.value_data_type))
    arrays.push(Arrow::ChunkedArray.new([arrow_array]))
  end
end
132
+
133
# Returns the keys whose position holds a truthy value in booleans.
# Positions beyond the end of booleans count as not selected.
def keys_by_booleans(booleans)
  selected = keys.zip(booleans).select { |_key, flag| flag }
  selected.map { |key, _flag| key }
end
136
+ end
137
+ end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-in for the class DataFrame
5
+ module Helper
6
+ private
7
+
8
# Returns the plural suffix for a count: 's' when num is greater than 1,
# otherwise an empty string.
def pl(num)
  return 's' if num > 1

  ''
end
11
+
12
# Tells whether any index falls outside of -size...size.
#
# indeces is a collection responding to #max and #min. An empty collection
# has nothing out of range and returns false (comparing its nil maximum
# against size would otherwise raise NoMethodError).
def out_of_range?(indeces)
  largest = indeces.max
  return false if largest.nil?

  largest >= size || indeces.min < -size
end
15
+
16
# True when every element of enum is an Integer (also true when enum is empty).
def integers?(enum)
  enum.all? { |element| element.is_a?(Integer) }
end
19
+
20
# True when every element of enum is a Symbol or a String
# (also true when enum is empty).
def sym_or_str?(enum)
  enum.all? do |element|
    case element
    when Symbol, String then true
    else false
    end
  end
end
23
+
24
# True when every element of enum is true, false or nil
# (also true when enum is empty).
def booleans?(enum)
  enum.all? do |element|
    case element
    when TrueClass, FalseClass, NilClass then true
    else false
    end
  end
end
27
+
28
# Creates a DataFrame with a single variable `key` holding the data of vector.
def create_dataframe_from_vector(key, vector)
  column_data = vector.data
  DataFrame.new(key => column_data)
end
31
+
32
# Normalizes every element of args with normalize_element, concatenates the
# results in order and wraps them in a Vector.
def parse_to_vector(args)
  elements = args.each_with_object([]) do |elem, collected|
    collected.concat(normalize_element(elem))
  end
  Vector.new(elements)
end
38
+
39
# Converts one selector element to an Array.
#
# - Numeric, String, Symbol, true, false and nil are wrapped as [elem].
# - A Range with an Integer bound, or with both bounds nil, is treated as a
#   range of row positions and expanded against 0...size (negative bounds
#   count from the tail). A bound outside of -size...size raises
#   DataFrameArgumentError.
# - Any other Range is expanded with Range#to_a.
# - Everything else goes through Kernel#Array.
def normalize_element(elem)
  scalar_classes = [Numeric, String, Symbol, TrueClass, FalseClass, NilClass]
  return [elem] if scalar_classes.any? { |klass| elem.is_a?(klass) }
  return Array(elem) unless elem.is_a?(Range)

  first = elem.begin
  last = elem.end
  # make the upper bound inclusive so it can be range-checked like the lower one
  last -= 1 if elem.exclude_end? && last.is_a?(Integer)
  bounds = [first, last]

  positional = bounds.any? { |bound| bound.is_a?(Integer) } || bounds.compact.empty?
  return elem.to_a unless positional

  if bounds.any? { |bound| bound && (bound >= size || bound < -size) }
    raise DataFrameArgumentError, "Index out of range: #{elem} for 0..#{size - 1}"
  end

  (0...size).to_a[elem]
end
60
+ end
61
+ end
@@ -1,27 +1,42 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # Columnar data object
4
+ # Values in variable (columnar) data object
5
5
  # @data : holds Arrow::ChunkedArray
6
6
  class Vector
7
7
  # mix-in
8
8
  include VectorFunctions
9
-
10
- # chunked_array may come from column.data
11
- def initialize(array)
12
- case array
13
- when Vector
14
- @data = array.data
15
- when Arrow::Array, Arrow::ChunkedArray
16
- @data = array
17
- when Array
18
- @data = Arrow::Array.new(array)
9
+ include VectorUpdatable
10
+ include VectorSelectable
11
+ include Helper
12
+
13
+ def initialize(*array)
14
+ @key = nil # default is 'headless'
15
+ if array.empty? || array[0].nil?
16
+ Vector.new([])
19
17
  else
20
- raise ArgumentError, 'Unknown array in argument'
18
+ array.flatten!
19
+ case array[0]
20
+ when Vector
21
+ @data = array[0].data
22
+ return
23
+ when Arrow::Array, Arrow::ChunkedArray
24
+ @data = array[0]
25
+ return
26
+ when Range
27
+ @data = Arrow::Array.new(Array(array[0]))
28
+ return
29
+ end
30
+ begin
31
+ @data = Arrow::Array.new(Array(array))
32
+ rescue Error
33
+ raise VectorArgumentError, "Invalid argument: #{array}"
34
+ end
21
35
  end
22
36
  end
23
37
 
24
38
  attr_reader :data
39
+ attr_accessor :key
25
40
 
26
41
  def to_s
27
42
  @data.to_a.inspect
@@ -49,6 +64,16 @@ module RedAmber
49
64
  alias_method :to_a, :values
50
65
  alias_method :entries, :values
51
66
 
67
+ def indices
68
+ (0...size).to_a
69
+ end
70
+ alias_method :indexes, :indices
71
+ alias_method :indeces, :indices
72
+
73
+ def to_ary
74
+ to_a
75
+ end
76
+
52
77
  def size
53
78
  # only defined :length in Arrow?
54
79
  @data.length
@@ -57,6 +82,10 @@ module RedAmber
57
82
  alias_method :n_rows, :size
58
83
  alias_method :nrow, :size
59
84
 
85
+ def empty?
86
+ size.zero?
87
+ end
88
+
60
89
  def type
61
90
  @data.value_type.nick.to_sym
62
91
  end
@@ -66,15 +95,19 @@ module RedAmber
66
95
  end
67
96
 
68
97
  def numeric?
69
- %i[int8 uint8 int16 uint16 int32 uint32 int64 uint64 float double].member? type
98
+ type_class < Arrow::NumericDataType
70
99
  end
71
100
 
72
101
  def string?
73
102
  type == :string
74
103
  end
75
104
 
76
- def data_type
77
- @data.value_type
105
+ def temporal?
106
+ type_class < Arrow::TemporalDataType
107
+ end
108
+
109
+ def type_class
110
+ @data.value_data_type.class
78
111
  end
79
112
 
80
113
  # def each() end
@@ -90,7 +123,23 @@ module RedAmber
90
123
  # def each_chunk() end
91
124
 
92
125
  def tally
93
- values.tally
126
+ hash = values.tally
127
+ if (type_class < Arrow::FloatingPointDataType) && is_nan.any
128
+ a = 0
129
+ hash.each do |key, value|
130
+ if key.is_a?(Float) && key.nan?
131
+ hash.delete(key)
132
+ a += value
133
+ end
134
+ end
135
+ hash[Float::NAN] = a
136
+ end
137
+ hash
138
+ end
139
+
140
+ def value_counts
141
+ values, counts = Arrow::Function.find(:value_counts).execute([data]).value.fields
142
+ values.zip(counts).to_h
94
143
  end
95
144
 
96
145
  def n_nulls
@@ -101,5 +150,9 @@ module RedAmber
101
150
  def n_nans
102
151
  numeric? ? is_nan.to_a.count(true) : 0
103
152
  end
153
+
154
+ def has_nil?
155
+ is_nil.any
156
+ end
104
157
  end
105
158
  end
@@ -12,32 +12,44 @@ module RedAmber
12
12
  module VectorFunctions
13
13
  # [Unary aggregations]: vector.func => scalar
14
14
  unary_aggregations =
15
- %i[all any approximate_median count count_distinct max mean min product stddev sum variance]
15
+ %i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
16
16
  unary_aggregations.each do |function|
17
17
  define_method(function) do |opts: nil|
18
- output = exec_func_unary(function, options: opts)
19
- take_out_scalar(output)
18
+ datum = exec_func_unary(function, options: opts)
19
+ get_scalar(datum)
20
20
  end
21
21
  end
22
22
  alias_method :median, :approximate_median
23
23
  alias_method :count_uniq, :count_distinct
24
+ alias_method :all?, :all
25
+ alias_method :any?, :any
26
+
27
+ def unbiased_variance
28
+ variance(opts: { ddof: 1 })
29
+ end
30
+ alias_method :var, :unbiased_variance
31
+
32
+ def sd
33
+ stddev(opts: { ddof: 1 })
34
+ end
35
+ alias_method :std, :sd
24
36
 
25
37
  # option(s) required
26
38
  # - index
27
39
 
28
40
  # Returns other than value
29
- # - min_max
30
41
  # - mode
31
42
  # - quantile
32
43
  # - tdigest
33
44
 
34
45
  # [Unary element-wise]: vector.func => vector
35
46
  unary_element_wise =
36
- %i[abs atan bit_wise_not ceil cos floor is_finite is_inf is_nan is_null is_valid sign sin tan trunc]
47
+ %i[abs array_sort_indices atan bit_wise_not ceil cos fill_null_backward fill_null_forward floor is_finite
48
+ is_inf is_nan is_null is_valid round round_to_multiple sign sin tan trunc unique]
37
49
  unary_element_wise.each do |function|
38
50
  define_method(function) do |opts: nil|
39
- output = exec_func_unary(function, options: opts)
40
- take_out_element_wise(output)
51
+ datum = exec_func_unary(function, options: opts)
52
+ Vector.new(datum.value)
41
53
  end
42
54
  end
43
55
  alias_method :is_nil, :is_null
@@ -46,6 +58,14 @@ module RedAmber
46
58
  numeric? ? (is_nil | is_nan) : is_nil
47
59
  end
48
60
 
61
+ alias_method :fill_nil_backward, :fill_null_backward
62
+ alias_method :fill_nil_forward, :fill_null_forward
63
+
64
+ alias_method :sort_indexes, :array_sort_indices
65
+ alias_method :sort_indices, :array_sort_indices
66
+
67
+ alias_method :uniq, :unique
68
+
49
69
  # [Unary element-wise with operator]: vector.func => vector, op vector
50
70
  unary_element_wise_op = {
51
71
  invert: '!',
@@ -53,20 +73,17 @@ module RedAmber
53
73
  }
54
74
  unary_element_wise_op.each do |function, operator|
55
75
  define_method(function) do |opts: nil|
56
- output = exec_func_unary(function, options: opts)
57
- take_out_element_wise(output)
76
+ datum = exec_func_unary(function, options: opts)
77
+ Vector.new(datum.value)
58
78
  end
59
79
 
60
80
  define_method(operator) do |opts: nil|
61
- output = exec_func_unary(function, options: opts)
62
- take_out_element_wise(output)
81
+ datum = exec_func_unary(function, options: opts)
82
+ Vector.new(datum.value)
63
83
  end
64
84
  end
65
85
  alias_method :not, :invert
66
86
 
67
- # option(s) required
68
- # - round, round_to_multiple
69
-
70
87
  # NaN support needed
71
88
  # - acos asin ln log10 log1p log2
72
89
 
@@ -79,8 +96,8 @@ module RedAmber
79
96
  %i[atan2 and_not and_not_kleene bit_wise_and bit_wise_or bit_wise_xor]
80
97
  binary_element_wise.each do |function|
81
98
  define_method(function) do |other, opts: nil|
82
- output = exec_func_binary(function, other, options: opts)
83
- take_out_element_wise(output)
99
+ datum = exec_func_binary(function, other, options: opts)
100
+ Vector.new(datum.value)
84
101
  end
85
102
  end
86
103
 
@@ -95,8 +112,8 @@ module RedAmber
95
112
  }
96
113
  logical_binary_element_wise.each do |method, function|
97
114
  define_method(method) do |other, opts: nil|
98
- output = exec_func_binary(function, other, options: opts)
99
- take_out_element_wise(output)
115
+ datum = exec_func_binary(function, other, options: opts)
116
+ Vector.new(datum.value)
100
117
  end
101
118
  end
102
119
 
@@ -128,13 +145,13 @@ module RedAmber
128
145
  }
129
146
  binary_element_wise_op.each do |function, operator|
130
147
  define_method(function) do |other, opts: nil|
131
- output = exec_func_binary(function, other, options: opts)
132
- take_out_element_wise(output)
148
+ datum = exec_func_binary(function, other, options: opts)
149
+ Vector.new(datum.value)
133
150
  end
134
151
 
135
152
  define_method(operator) do |other, opts: nil|
136
- output = exec_func_binary(function, other, options: opts)
137
- take_out_element_wise(output)
153
+ datum = exec_func_binary(function, other, options: opts)
154
+ Vector.new(datum.value)
138
155
  end
139
156
  end
140
157
  alias_method :eq, :equal
@@ -144,14 +161,20 @@ module RedAmber
144
161
  alias_method :lt, :less
145
162
  alias_method :ne, :not_equal
146
163
 
164
# Coercion hook used when a non-Vector appears on the left of a binary
# operator (e.g. `2 * vector`): returns [other as a Vector, self].
#
# - A Vector, Array or Arrow::Array must have the same length as self and is
#   converted element by element.
# - Any other value is repeated `size` times.
#
# Raises VectorArgumentError when a collection differs in length from self.
def coerce(other)
  case other
  when Vector, Array, Arrow::Array
    raise VectorArgumentError, "Size unmatch: #{size} != #{other.length}" unless size == other.length

    [Vector.new(Array(other)), self]
  else
    # Only scalars are broadcast. Without this `else`, the collection branch
    # above was discarded and its elements were repeated `size` times.
    [Vector.new(Array(other) * size), self]
  end
end
173
+
147
174
  # (array functions)
148
- # array_filter, array_sort_indices, array_take
149
- # dictionary_encode, hash_all, hash_any, hash_approximate_median,
150
- # hash_count, hash_count_distinct, hash_distinct, hash_max, hash_mean, hash_min,
151
- # hash_min_max, hash_product, hash_stddev, hash_sum, hash_tdigest, hash_variance,
175
+ # dictionary_encode,
152
176
  # partition_nth_indices,
153
- # quarter, quarters_between, unique,
154
- # value_counts
177
+ # quarter, quarters_between,
155
178
 
156
179
  # (strings)
157
180
  # ascii_capitalize, ascii_center, ascii_is_alnum, ascii_is_alpha, ascii_is_decimal,
@@ -180,44 +203,56 @@ module RedAmber
180
203
  # strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
181
204
 
182
205
  # (conditional)
183
- # case_when, cast, if_else
206
+ # case_when, cast,
184
207
 
185
208
  # (indices)
186
209
  # choose, index_in, index_in_meta_binary, indices_nonzero
187
210
 
188
211
  # (others)
189
- # coalesce, drop_null, fill_null_backward, fill_null_forward,
190
- # filter, is_in, is_in_meta_binary,
212
+ # coalesce,
213
+ # is_in_meta_binary,
191
214
  # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
192
- # max_element_wise, min_element_wise, random, replace_with_mask, select_k_unstable,
193
- # sort_indices, struct_field, take
215
+ # max_element_wise, min_element_wise, random, select_k_unstable,
216
+ # struct_field,
194
217
 
195
218
  private # =======
196
219
 
197
220
  def exec_func_unary(function, options: nil)
198
- func = Arrow::Function.find(function)
199
- func.execute([data], options)
221
+ find(function).execute([data], options)
200
222
  end
201
223
 
202
224
  def exec_func_binary(function, other, options: nil)
203
- func = Arrow::Function.find(function)
204
225
  case other
205
226
  when Vector
206
- func.execute([data, other.data], options)
207
- when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric
208
- func.execute([data, other], options)
227
+ find(function).execute([data, other.data], options)
228
+ when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
229
+ find(function).execute([data, other], options)
209
230
  else
210
- raise ArgumentError, "Operand is not supported: #{other.class}"
231
+ raise VectorArgumentError, "Operand is not supported: #{other.class}"
211
232
  end
212
233
  end
213
234
 
214
- def take_out_scalar(output)
215
- output = output.value
216
- output.is_a?(Arrow::StringScalar) ? output.to_s : output.value
235
+ def get_scalar(datum)
236
+ output = datum.value
237
+ case output
238
+ when Arrow::StringScalar then output.to_s
239
+ when Arrow::StructScalar
240
+ output.value.map { |s| s.is_a?(Arrow::StringScalar) ? s.to_s : s.value }
241
+ else
242
+ output.value
243
+ end
244
+ end
245
+
246
+ module_function # ======
247
+
248
+ def find(function_name)
249
+ Arrow::Function.find(function_name)
217
250
  end
218
251
 
219
- def take_out_element_wise(output)
220
- Vector.new(output.value)
252
+ # temporary API until RedAmber document prepared.
253
+ def arrow_doc(function_name)
254
+ f = find(function_name)
255
+ "#{f}\n#{'-' * function_name.size}\n#{f.doc.description}"
221
256
  end
222
257
  end
223
258
  end
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
4
+ # reference: https://arrow.apache.org/docs/cpp/compute.html
5
+
6
+ module RedAmber
7
+ # mix-ins for class Vector
8
+ # Functions to select some data.
9
+ module VectorSelectable
10
# Returns a new Vector built from the result of Arrow's :drop_null function
# applied to this vector's data.
def drop_nil
  Vector.new(find(:drop_null).execute([data]).value)
end
14
+
15
# Vector calculation version of selection by indices.
#
# Accepts indices as separate arguments or nested Arrays (flattened first).
# A single non-numeric argument is used as is; anything that is not already
# a Vector is wrapped in one before delegating to take_by_vector.
# Returns an empty Vector when no index is given.
# TODO: support for option {boundscheck: true}
def take(*indices)
  indices.flatten!
  return Vector.new([]) if indices.empty?

  selector = indices
  selector = indices.first if indices.one? && !indices.first.is_a?(Numeric)
  selector = Vector.new(selector) unless selector.is_a?(Vector)

  take_by_vector(selector) # returns sub Vector
end
26
+
27
# Selects elements by booleans.
#
# Accepts a boolean Vector, an Arrow::BooleanArray, or true/false/nil values
# given as separate arguments or nested Arrays (flattened first).
# Returns an empty Vector when nothing is given.
# Raises VectorTypeError when the argument is not boolean.
# TODO: support for option {null_selection_behavior: :drop}
def filter(*booleans)
  booleans.flatten!
  return Vector.new([]) if booleans.empty?

  head = booleans.first
  mask =
    if head.is_a?(Vector)
      raise VectorTypeError, 'Argument is not a boolean.' unless head.boolean?

      head.data
    elsif head.is_a?(Arrow::BooleanArray)
      head
    elsif booleans?(booleans)
      Arrow::BooleanArray.new(booleans)
    else
      raise VectorTypeError, 'Argument is not a boolean.'
    end

  filter_by_array(mask) # returns sub Vector
end
49
+
50
# Selects elements by indices or by booleans.
#
# @param args numeric indices or booleans, given as a Vector, an Arrow
#   array, or plain values (nested Arrays are flattened first)
#
# Numeric selectors go to take_by_vector and boolean selectors to
# filter_by_array. Returns an empty Vector when nothing is given.
# Raises VectorTypeError for a Vector that is neither numeric nor boolean,
# and VectorArgumentError for any other unsupported argument.
def [](*args)
  args.flatten!
  return Vector.new([]) if args.empty?

  head = args[0]
  if head.is_a?(Vector)
    return take_by_vector(head) if head.numeric?
    return filter_by_array(head.data) if head.boolean?

    raise VectorTypeError, "Argument must be numeric or boolean: #{head}"
  end
  return filter_by_array(head) if head.is_a?(Arrow::BooleanArray)

  selector =
    if head.is_a?(Arrow::Array)
      head
    elsif head.is_a?(Numeric) || booleans?([head])
      # plain values: let Arrow decide the array type from all arguments
      Arrow::Array.new(args)
    else
      raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
    end
  return filter_by_array(selector) if selector.is_a?(Arrow::BooleanArray)

  candidate = Vector.new(selector)
  return take_by_vector(candidate) if candidate.numeric?

  raise VectorArgumentError, "Invalid argument: #{args}"
end
80
+
81
# Returns a Vector wrapping the result of `data.is_in` for the given values.
#
# @param values [Array, Arrow::Array, Vector] candidate values; plain values
#   (nested Arrays are flattened first) are converted with the class of
#   this vector's data
def is_in(*values)
  values.flatten!
  head = values.first
  candidates =
    if head.is_a?(Vector)
      head.data
    elsif head.is_a?(Arrow::Array)
      head
    end
  candidates = data.class.new(values) unless candidates

  Vector.new(data.is_in(candidates))
end
94
+
95
# Returns the position of the first occurrence of element, or nil when it
# is absent. Implemented on the Ruby Array from #to_a; Arrow's support is
# required to do this natively.
def index(element)
  to_a.find_index(element)
end
99
+
100
+ private
101
+
102
# Selects elements by a numeric Vector of indices; negative indices count
# from the tail.
#
# Raises VectorTypeError when indices is not numeric, and
# VectorArgumentError when an index is outside of -size...size.
def take_by_vector(indices)
  raise VectorTypeError, "Indices must be numeric Vector: #{indices}" unless indices.numeric?

  lowest = indices.min
  raise VectorArgumentError, "Index out of range: #{lowest}" if lowest <= -size - 1

  # normalize index from tail
  from_head = (indices < 0).if_else(indices + size, indices)
  highest = from_head.max
  raise VectorArgumentError, "Index out of range: #{highest}" if highest >= size

  # round to integer array
  positions = Arrow::UInt64ArrayBuilder.build(from_head.data)

  Vector.new(find(:array_take).execute([data, positions]).value)
end
115
+
116
# Selects elements by an Arrow::BooleanArray via Arrow's :array_filter function.
#
# Raises VectorArgumentError when boolean_array differs in length from self.
def filter_by_array(boolean_array)
  if boolean_array.length != size
    raise VectorArgumentError, 'Booleans must be same size as self.'
  end

  Vector.new(find(:array_filter).execute([data, boolean_array]).value)
end
123
+ end
124
+ end