red_amber 0.1.3 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +31 -7
  3. data/CHANGELOG.md +214 -10
  4. data/Gemfile +4 -0
  5. data/README.md +117 -342
  6. data/benchmark/csv_load_penguins.yml +15 -0
  7. data/benchmark/drop_nil.yml +11 -0
  8. data/doc/DataFrame.md +854 -0
  9. data/doc/Vector.md +449 -0
  10. data/doc/image/arrow_table_new.png +0 -0
  11. data/doc/image/dataframe/assign.png +0 -0
  12. data/doc/image/dataframe/drop.png +0 -0
  13. data/doc/image/dataframe/pick.png +0 -0
  14. data/doc/image/dataframe/remove.png +0 -0
  15. data/doc/image/dataframe/rename.png +0 -0
  16. data/doc/image/dataframe/slice.png +0 -0
  17. data/doc/image/dataframe_model.png +0 -0
  18. data/doc/image/example_in_red_arrow.png +0 -0
  19. data/doc/image/tdr.png +0 -0
  20. data/doc/image/tdr_and_table.png +0 -0
  21. data/doc/image/tidy_data_in_TDR.png +0 -0
  22. data/doc/image/vector/binary_element_wise.png +0 -0
  23. data/doc/image/vector/unary_aggregation.png +0 -0
  24. data/doc/image/vector/unary_aggregation_w_option.png +0 -0
  25. data/doc/image/vector/unary_element_wise.png +0 -0
  26. data/doc/tdr.md +56 -0
  27. data/doc/tdr_ja.md +56 -0
  28. data/lib/red-amber.rb +27 -0
  29. data/lib/red_amber/data_frame.rb +91 -37
  30. data/lib/red_amber/{data_frame_output.rb → data_frame_displayable.rb} +49 -41
  31. data/lib/red_amber/data_frame_indexable.rb +38 -0
  32. data/lib/red_amber/data_frame_observation_operation.rb +11 -0
  33. data/lib/red_amber/data_frame_selectable.rb +155 -48
  34. data/lib/red_amber/data_frame_variable_operation.rb +137 -0
  35. data/lib/red_amber/helper.rb +61 -0
  36. data/lib/red_amber/vector.rb +69 -16
  37. data/lib/red_amber/vector_functions.rb +80 -45
  38. data/lib/red_amber/vector_selectable.rb +124 -0
  39. data/lib/red_amber/vector_updatable.rb +104 -0
  40. data/lib/red_amber/version.rb +1 -1
  41. data/lib/red_amber.rb +1 -16
  42. data/red_amber.gemspec +3 -6
  43. metadata +38 -9
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
module RedAmber
  # Mix-in for the class DataFrame: operations that select, remove, rename
  # or add variables (columns). Every public method returns a DataFrame.
  #
  # The including class is expected to provide `keys`, `size`, `@table` and
  # the predicates `sym_or_str?` / `booleans?`
  # (NOTE(review): presumably supplied by RedAmber::Helper -- confirm).
  module DataFrameVariableOperation
    # Pick up some variables to create a sub DataFrame.
    #
    # The selector is either given as arguments or returned from a block that
    # is evaluated in the context of self. It may be keys (Symbol or String)
    # or booleans (true/false/nil) matched against `keys` by position.
    #
    # @return [DataFrame] an empty DataFrame when the selector is empty or nil.
    # @raise [DataFrameArgumentError] if both arguments and a block are given,
    #   or if the selector is neither all keys nor all booleans.
    def pick(*args, &block)
      picker = args
      if block
        raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

        picker = instance_eval(&block)
      end
      picker = [picker].flatten
      return DataFrame.new if picker.empty? || picker == [nil]

      picker = keys_by_booleans(picker) if booleans?(picker)

      # DataFrame#[] creates a Vector when a single key is specified.
      # DataFrame#pick creates a DataFrame even with a single key.
      return DataFrame.new(@table[picker]) if sym_or_str?(picker)

      raise DataFrameArgumentError, "Invalid argument #{args}"
    end

    # Drop some variables to create the remainder sub DataFrame.
    #
    # Takes the same selector forms as #pick. Keys are compared by name, so
    # a String selector drops the variable just like the equivalent Symbol.
    #
    # @return [DataFrame] an empty DataFrame when every variable is dropped.
    # @raise [DataFrameArgumentError] if both arguments and a block are given,
    #   or if the selector is neither all keys nor all booleans.
    def drop(*args, &block)
      dropper = args
      if block
        raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

        dropper = instance_eval(&block)
      end
      dropper = [dropper].flatten
      dropper = keys_by_booleans(dropper) if booleans?(dropper)

      # Validate before subtracting: afterwards the remainder is always a
      # subset of `keys`, so an invalid selector could never be detected.
      raise DataFrameArgumentError, "Invalid argument #{args}" unless sym_or_str?(dropper)

      dropped_names = dropper.map(&:to_s)
      picker = keys.reject { |key| dropped_names.include?(key.to_s) }
      return DataFrame.new if picker.empty?

      # DataFrame#[] creates a Vector when a single key is specified.
      # DataFrame#drop creates a DataFrame even with a single key.
      DataFrame.new(@table[picker])
    end

    # Rename variables to create a new DataFrame.
    #
    # Accepts either `rename(from, to)` or `rename({ from => to, ... })`,
    # as arguments or as the value of a block evaluated in the context of self.
    #
    # @return [DataFrame] self when nothing is specified.
    # @raise [DataFrameArgumentError] if both arguments and a block are given,
    #   or if the argument has neither of the two accepted forms.
    def rename(*args, &block)
      renamer = args
      if block
        raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?

        renamer = instance_eval(&block)
      end
      renamer = [renamer].flatten
      return self if renamer.empty?

      return rename_by_hash([renamer].to_h) if renamer.size == 2 && sym_or_str?(renamer) # rename(from, to)
      return rename_by_hash(renamer[0]) if renamer.one? && renamer[0].is_a?(Hash) # rename({from => to})

      raise DataFrameArgumentError, "Invalid argument #{args}"
    end

    # Assign variables to create a new DataFrame.
    #
    # Takes a single Hash of `key => data` (as argument or block value).
    # A key naming an existing variable replaces that variable's data in
    # place; any other key is appended as a new variable.
    #
    # @return [DataFrame] self when nothing (or nil) is specified.
    # @raise [DataFrameArgumentError] if both arguments and a block are given,
    #   if the argument is not a single Hash, or on a data size mismatch.
    def assign(*args, &block)
      assigner = args
      if block
        raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?

        assigner = instance_eval(&block)
      end
      assigner = [assigner].flatten
      return self if assigner.empty? || assigner == [nil]

      raise DataFrameArgumentError, "Invalid argument #{args}" unless assigner.one? && assigner[0].is_a?(Hash)

      updater = {}
      appender = {}
      assigner[0].each do |key, value|
        # Match by name so that 'x' updates :x instead of appending a
        # second variable with the same name.
        existing = existing_key(key)
        if existing
          updater[existing] = value
        else
          appender[key] = value
        end
      end
      fields, arrays = update_fields_and_arrays(updater)
      append_to_fields_and_arrays(appender, fields, arrays) unless appender.empty?

      DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), arrays))
    end

    private

    # Build a DataFrame whose fields are renamed according to +key_pairs+
    # ({ from => to }); variables not mentioned keep their schema field.
    def rename_by_hash(key_pairs)
      # Re-key the request by the actual elements of `keys` so that the
      # lookup below also succeeds for String `from` names.
      renames = key_pairs.each_with_object({}) do |(from, to), table|
        table[existing_key(from) || from] = to
      end
      fields = keys.map do |key|
        new_key = renames[key]
        if new_key
          Arrow::Field.new(new_key.to_sym, @table[key].data_type)
        else
          @table.schema[key]
        end
      end
      schema = Arrow::Schema.new(fields)
      DataFrame.new(Arrow::Table.new(schema, @table.columns))
    end

    # Returns [fields, arrays] of the current table with the variables
    # listed in +updater+ ({ key => data }) replaced. Falsy data is skipped.
    def update_fields_and_arrays(updater)
      fields = @table.columns.map(&:field)
      arrays = @table.columns.map(&:data) # chunked_arrays
      keys.each_with_index do |key, i|
        data = updater[key]
        next unless data

        array = arrow_array_of(data)
        fields[i] = Arrow::Field.new(key, array.value_data_type)
        arrays[i] = Arrow::ChunkedArray.new([array])
      end
      [fields, arrays]
    end

    # Appends a field and a chunked array for each entry of +appender+
    # ({ key => data }) to +fields+ and +arrays+ (both modified in place).
    def append_to_fields_and_arrays(appender, fields, arrays)
      appender.each do |key, data|
        array = arrow_array_of(data)
        fields << Arrow::Field.new(key.to_sym, array.value_data_type)
        arrays << Arrow::ChunkedArray.new([array])
      end
    end

    # Converts +data+ (a Vector or array-like) to an Arrow::Array after
    # checking that it has as many elements as self.
    def arrow_array_of(data)
      raise DataFrameArgumentError, "Data size mismatch (#{data.size} != #{size})" if data.size != size

      Arrow::Array.new(data.is_a?(Vector) ? data.to_a : data)
    end

    # Element of `keys` whose name equals +name+ (Symbol or String),
    # or nil when there is none or +name+ is of another type.
    def existing_key(name)
      return unless name.is_a?(Symbol) || name.is_a?(String)

      keys.find { |key| key.to_s == name.to_s }
    end

    # Keys at the positions where +booleans+ holds a truthy value.
    def keys_by_booleans(booleans)
      keys.select.with_index { |_, i| booleans[i] }
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
module RedAmber
  # Mix-in of private helper methods.
  # Several helpers call `size`, which the including class must provide.
  module Helper
    private

    # Plural suffix: 's' when +num+ is greater than 1, '' otherwise.
    def pl(num)
      num > 1 ? 's' : ''
    end

    # True if any index lies outside -size..(size - 1).
    # An empty collection has no out-of-range index, so it returns false
    # (previously `nil >= size` raised NoMethodError).
    def out_of_range?(indices)
      largest = indices.max
      return false if largest.nil?

      largest >= size || indices.min < -size
    end

    # True if every element is an Integer.
    def integers?(enum)
      enum.all?(Integer)
    end

    # True if every element is a Symbol or a String.
    def sym_or_str?(enum)
      enum.all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
    end

    # True if every element is true, false or nil.
    def booleans?(enum)
      enum.all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
    end

    # DataFrame with the single variable +key+ holding +vector+'s data.
    def create_dataframe_from_vector(key, vector)
      DataFrame.new(key => vector.data)
    end

    # Normalizes every element of +args+ (see #normalize_element) and
    # builds one Vector from the concatenated results.
    def parse_to_vector(args)
      Vector.new(args.flat_map { |elem| normalize_element(elem) })
    end

    # Expands one selector element to an Array.
    #
    # * Numeric, String, Symbol, true, false, nil: wrapped in an Array.
    # * Range with an Integer end, or with both ends nil: resolved against
    #   0...size, so negative and open ends behave as in Array#[].
    # * Any other Range: expanded with Range#to_a.
    # * Anything else: converted with Kernel#Array.
    #
    # @raise [DataFrameArgumentError] if an end of an index range is out of range.
    def normalize_element(elem)
      case elem
      when Numeric, String, Symbol, TrueClass, FalseClass, NilClass
        [elem]
      when Range
        both_end = [elem.begin, elem.end]
        # Compare the last index that is actually included.
        both_end[1] -= 1 if elem.exclude_end? && elem.end.is_a?(Integer)

        if both_end.any?(Integer) || both_end.all?(&:nil?)
          if both_end.any? { |e| !e.nil? && (e >= size || e < -size) }
            raise DataFrameArgumentError, "Index out of range: #{elem} for 0..#{size - 1}"
          end

          (0...size).to_a[elem]
        else
          elem.to_a
        end
      else
        Array(elem)
      end
    end
  end
end
@@ -1,27 +1,42 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # Columnar data object
4
+ # Values in variable (columnar) data object
5
5
  # @data : holds Arrow::ChunkedArray
6
6
  class Vector
7
7
  # mix-in
8
8
  include VectorFunctions
9
-
10
- # chunked_array may come from column.data
11
- def initialize(array)
12
- case array
13
- when Vector
14
- @data = array.data
15
- when Arrow::Array, Arrow::ChunkedArray
16
- @data = array
17
- when Array
18
- @data = Arrow::Array.new(array)
9
+ include VectorUpdatable
10
+ include VectorSelectable
11
+ include Helper
12
+
13
# Creates a Vector.
#
# With no argument, or when the first argument is nil, the Vector is empty.
# Otherwise the arguments are flattened and the first element decides:
# a Vector shares its data, an Arrow::Array / Arrow::ChunkedArray is used
# as is, a Range is expanded; anything else is handed to Arrow::Array.new
# as one array of values.
#
# @raise [VectorArgumentError] if building the Arrow array raises Error.
def initialize(*array)
  @key = nil # default is 'headless'
  if array.empty? || array[0].nil?
    # The original called `Vector.new([])` here and discarded the result,
    # which left @data as nil. Build the empty array directly instead.
    @data = Arrow::Array.new([])
    return
  end

  array.flatten!
  case array[0]
  when Vector
    @data = array[0].data
    return
  when Arrow::Array, Arrow::ChunkedArray
    @data = array[0]
    return
  when Range
    @data = Arrow::Array.new(Array(array[0]))
    return
  end
  begin
    @data = Arrow::Array.new(Array(array))
  rescue Error
    # NOTE(review): `Error` is resolved from the enclosing namespace, which
    # is not visible here -- confirm it covers what Arrow::Array.new raises.
    raise VectorArgumentError, "Invalid argument: #{array}"
  end
end
23
37
 
24
38
  attr_reader :data
39
+ attr_accessor :key
25
40
 
26
41
  def to_s
27
42
  @data.to_a.inspect
@@ -49,6 +64,16 @@ module RedAmber
49
64
  alias_method :to_a, :values
50
65
  alias_method :entries, :values
51
66
 
67
# Array of every position in self: 0 up to size - 1.
def indices
  Array(0...size)
end
70
+ alias_method :indexes, :indices
71
+ alias_method :indeces, :indices
72
+
73
# Implicit Array conversion; returns the same result as #to_a.
def to_ary
  to_a
end
76
+
52
77
  def size
53
78
  # only defined :length in Arrow?
54
79
  @data.length
@@ -57,6 +82,10 @@ module RedAmber
57
82
  alias_method :n_rows, :size
58
83
  alias_method :nrow, :size
59
84
 
85
# True when self holds no elements.
def empty?
  size == 0 # rubocop:disable Style/NumericPredicate
end
88
+
60
89
# Nickname of the value type of the underlying data, as a Symbol.
def type
  value_type = @data.value_type
  value_type.nick.to_sym
end
@@ -66,15 +95,19 @@ module RedAmber
66
95
  end
67
96
 
68
97
# Truthy only when the data type class is a subclass of
# Arrow::NumericDataType (Module#< gives false or nil otherwise).
def numeric?
  klass = type_class
  klass < Arrow::NumericDataType
end
71
100
 
72
101
# True when #type is :string.
def string?
  type == :string
end
75
104
 
76
- def data_type
77
- @data.value_type
105
# Truthy only when the data type class is a subclass of
# Arrow::TemporalDataType (Module#< gives false or nil otherwise).
def temporal?
  klass = type_class
  klass < Arrow::TemporalDataType
end
108
+
109
# Class of the value data type object of the underlying data.
def type_class
  data_type = @data.value_data_type
  data_type.class
end
79
112
 
80
113
  # def each() end
@@ -90,7 +123,23 @@ module RedAmber
90
123
  # def each_chunk() end
91
124
 
92
125
# Counts occurrences of each value and returns them as a Hash.
#
# For floating point data containing NaN, the counts of all Float NaN keys
# are summed into one entry keyed by Float::NAN, placed last in the Hash.
def tally
  counts = values.tally
  return counts unless (type_class < Arrow::FloatingPointDataType) && is_nan.any

  nan_total = 0
  counts.delete_if do |value, count|
    nan = value.is_a?(Float) && value.nan?
    nan_total += count if nan
    nan
  end
  counts[Float::NAN] = nan_total
  counts
end
139
+
140
# Runs Arrow's :value_counts function on the data and returns a Hash that
# pairs the first result field (values) with the second (counts).
def value_counts
  result = Arrow::Function.find(:value_counts).execute([data]).value
  distinct, counts = result.fields
  distinct.zip(counts).to_h
end
95
144
 
96
145
  def n_nulls
@@ -101,5 +150,9 @@ module RedAmber
101
150
# Number of NaN elements; 0 when the Vector is not numeric.
def n_nans
  return 0 unless numeric?

  is_nan.to_a.count(true)
end
153
+
154
# Whether any element is nil: the `any` aggregation of #is_nil.
def has_nil?
  nil_flags = is_nil
  nil_flags.any
end
104
157
  end
105
158
  end
@@ -12,32 +12,44 @@ module RedAmber
12
12
  module VectorFunctions
13
13
  # [Unary aggregations]: vector.func => scalar
14
14
  unary_aggregations =
15
- %i[all any approximate_median count count_distinct max mean min product stddev sum variance]
15
+ %i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
16
16
  unary_aggregations.each do |function|
17
17
  define_method(function) do |opts: nil|
18
- output = exec_func_unary(function, options: opts)
19
- take_out_scalar(output)
18
+ datum = exec_func_unary(function, options: opts)
19
+ get_scalar(datum)
20
20
  end
21
21
  end
22
22
  alias_method :median, :approximate_median
23
23
  alias_method :count_uniq, :count_distinct
24
+ alias_method :all?, :all
25
+ alias_method :any?, :any
26
+
27
# Variance computed with the option ddof: 1.
def unbiased_variance
  options = { ddof: 1 }
  variance(opts: options)
end
30
+ alias_method :var, :unbiased_variance
31
+
32
# Standard deviation computed with the option ddof: 1.
def sd
  options = { ddof: 1 }
  stddev(opts: options)
end
35
+ alias_method :std, :sd
24
36
 
25
37
  # option(s) required
26
38
  # - index
27
39
 
28
40
  # Returns other than value
29
- # - min_max
30
41
  # - mode
31
42
  # - quantile
32
43
  # - tdigest
33
44
 
34
45
  # [Unary element-wise]: vector.func => vector
35
46
  unary_element_wise =
36
- %i[abs atan bit_wise_not ceil cos floor is_finite is_inf is_nan is_null is_valid sign sin tan trunc]
47
+ %i[abs array_sort_indices atan bit_wise_not ceil cos fill_null_backward fill_null_forward floor is_finite
48
+ is_inf is_nan is_null is_valid round round_to_multiple sign sin tan trunc unique]
37
49
  unary_element_wise.each do |function|
38
50
  define_method(function) do |opts: nil|
39
- output = exec_func_unary(function, options: opts)
40
- take_out_element_wise(output)
51
+ datum = exec_func_unary(function, options: opts)
52
+ Vector.new(datum.value)
41
53
  end
42
54
  end
43
55
  alias_method :is_nil, :is_null
@@ -46,6 +58,14 @@ module RedAmber
46
58
  numeric? ? (is_nil | is_nan) : is_nil
47
59
  end
48
60
 
61
+ alias_method :fill_nil_backward, :fill_null_backward
62
+ alias_method :fill_nil_forward, :fill_null_forward
63
+
64
+ alias_method :sort_indexes, :array_sort_indices
65
+ alias_method :sort_indices, :array_sort_indices
66
+
67
+ alias_method :uniq, :unique
68
+
49
69
  # [Unary element-wise with operator]: vector.func => vector, op vector
50
70
  unary_element_wise_op = {
51
71
  invert: '!',
@@ -53,20 +73,17 @@ module RedAmber
53
73
  }
54
74
  unary_element_wise_op.each do |function, operator|
55
75
  define_method(function) do |opts: nil|
56
- output = exec_func_unary(function, options: opts)
57
- take_out_element_wise(output)
76
+ datum = exec_func_unary(function, options: opts)
77
+ Vector.new(datum.value)
58
78
  end
59
79
 
60
80
  define_method(operator) do |opts: nil|
61
- output = exec_func_unary(function, options: opts)
62
- take_out_element_wise(output)
81
+ datum = exec_func_unary(function, options: opts)
82
+ Vector.new(datum.value)
63
83
  end
64
84
  end
65
85
  alias_method :not, :invert
66
86
 
67
- # option(s) required
68
- # - round, round_to_multiple
69
-
70
87
  # NaN support needed
71
88
  # - acos asin ln log10 log1p log2
72
89
 
@@ -79,8 +96,8 @@ module RedAmber
79
96
  %i[atan2 and_not and_not_kleene bit_wise_and bit_wise_or bit_wise_xor]
80
97
  binary_element_wise.each do |function|
81
98
  define_method(function) do |other, opts: nil|
82
- output = exec_func_binary(function, other, options: opts)
83
- take_out_element_wise(output)
99
+ datum = exec_func_binary(function, other, options: opts)
100
+ Vector.new(datum.value)
84
101
  end
85
102
  end
86
103
 
@@ -95,8 +112,8 @@ module RedAmber
95
112
  }
96
113
  logical_binary_element_wise.each do |method, function|
97
114
  define_method(method) do |other, opts: nil|
98
- output = exec_func_binary(function, other, options: opts)
99
- take_out_element_wise(output)
115
+ datum = exec_func_binary(function, other, options: opts)
116
+ Vector.new(datum.value)
100
117
  end
101
118
  end
102
119
 
@@ -128,13 +145,13 @@ module RedAmber
128
145
  }
129
146
  binary_element_wise_op.each do |function, operator|
130
147
  define_method(function) do |other, opts: nil|
131
- output = exec_func_binary(function, other, options: opts)
132
- take_out_element_wise(output)
148
+ datum = exec_func_binary(function, other, options: opts)
149
+ Vector.new(datum.value)
133
150
  end
134
151
 
135
152
  define_method(operator) do |other, opts: nil|
136
- output = exec_func_binary(function, other, options: opts)
137
- take_out_element_wise(output)
153
+ datum = exec_func_binary(function, other, options: opts)
154
+ Vector.new(datum.value)
138
155
  end
139
156
  end
140
157
  alias_method :eq, :equal
@@ -144,14 +161,20 @@ module RedAmber
144
161
  alias_method :lt, :less
145
162
  alias_method :ne, :not_equal
146
163
 
164
# Coercion hook used when self is the right-hand operand (e.g. `1 + vector`).
#
# * Vector, Array or Arrow::Array: must have the same length as self and is
#   converted to a Vector as is.
# * Anything else: treated as a scalar and repeated `size` times.
#
# The original discarded the result of the first branch and always fell
# through to the scalar case, repeating array-like operands `size` times.
#
# @return [Array(Vector, Vector)] the converted operand and self.
# @raise [VectorArgumentError] if an array-like operand differs in length.
def coerce(other)
  case other
  when Vector, Array, Arrow::Array
    raise VectorArgumentError, "Size unmatch: #{size} != #{other.length}" unless size == other.length

    [Vector.new(Array(other)), self]
  else
    [Vector.new(Array(other) * size), self]
  end
end
173
+
147
174
  # (array functions)
148
- # array_filter, array_sort_indices, array_take
149
- # dictionary_encode, hash_all, hash_any, hash_approximate_median,
150
- # hash_count, hash_count_distinct, hash_distinct, hash_max, hash_mean, hash_min,
151
- # hash_min_max, hash_product, hash_stddev, hash_sum, hash_tdigest, hash_variance,
175
+ # dictionary_encode,
152
176
  # partition_nth_indices,
153
- # quarter, quarters_between, unique,
154
- # value_counts
177
+ # quarter, quarters_between,
155
178
 
156
179
  # (strings)
157
180
  # ascii_capitalize, ascii_center, ascii_is_alnum, ascii_is_alpha, ascii_is_decimal,
@@ -180,44 +203,56 @@ module RedAmber
180
203
  # strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
181
204
 
182
205
  # (conditional)
183
- # case_when, cast, if_else
206
+ # case_when, cast,
184
207
 
185
208
  # (indices)
186
209
  # choose, index_in, index_in_meta_binary, indices_nonzero
187
210
 
188
211
  # (others)
189
- # coalesce, drop_null, fill_null_backward, fill_null_forward,
190
- # filter, is_in, is_in_meta_binary,
212
+ # coalesce,
213
+ # is_in_meta_binary,
191
214
  # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
192
- # max_element_wise, min_element_wise, random, replace_with_mask, select_k_unstable,
193
- # sort_indices, struct_field, take
215
+ # max_element_wise, min_element_wise, random, select_k_unstable,
216
+ # struct_field,
194
217
 
195
218
  private # =======
196
219
 
197
220
# Looks up the Arrow function +function+ and executes it with the data of
# self as its only argument, passing +options+ through.
def exec_func_unary(function, options: nil)
  arrow_function = find(function)
  arrow_function.execute([data], options)
end
201
223
 
202
224
# Looks up the Arrow function +function+ and executes it with the data of
# self and +other+ as arguments, passing +options+ through.
# A Vector operand contributes its data; the other accepted operand
# classes are handed over unchanged.
#
# @raise [VectorArgumentError] if +other+ is of an unsupported class.
def exec_func_binary(function, other, options: nil)
  operand =
    case other
    when Vector
      other.data
    when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
      other
    else
      raise VectorArgumentError, "Operand is not supported: #{other.class}"
    end
  find(function).execute([data, operand], options)
end
213
234
 
214
- def take_out_scalar(output)
215
- output = output.value
216
- output.is_a?(Arrow::StringScalar) ? output.to_s : output.value
235
# Unwraps the scalar held by +datum+ into a Ruby value.
# A string scalar becomes a String, a struct scalar becomes an Array of its
# unwrapped members, and any other scalar yields its `value`.
def get_scalar(datum)
  scalar = datum.value
  return scalar.to_s if scalar.is_a?(Arrow::StringScalar)
  return scalar.value unless scalar.is_a?(Arrow::StructScalar)

  scalar.value.map do |member|
    member.is_a?(Arrow::StringScalar) ? member.to_s : member.value
  end
end
245
+
246
+ module_function # ======
247
+
248
# Arrow compute function registered under +function_name+.
def find(function_name)
  Arrow::Function.find(function_name)
end
218
251
 
219
- def take_out_element_wise(output)
220
- Vector.new(output.value)
252
+ # temporary API until RedAmber document prepared.
253
# Three lines of text for the Arrow function +function_name+: the function
# itself, a dashed rule as long as +function_name+, and its description.
def arrow_doc(function_name)
  function = find(function_name)
  rule = '-' * function_name.size
  "#{function}\n#{rule}\n#{function.doc.description}"
end
222
257
  end
223
258
  end
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
4
+ # reference: https://arrow.apache.org/docs/cpp/compute.html
5
+
6
module RedAmber
  # Mix-in for the class Vector: functions that select some of the data.
  #
  # The including class is expected to provide `data`, `size`, `to_a`,
  # `find` and `booleans?`.
  module VectorSelectable
    # Vector with the null elements removed (Arrow function :drop_null).
    def drop_nil
      Vector.new(find(:drop_null).execute([data]).value)
    end

    # Selection by indices, computed as a vector operation.
    # Accepts numbers, an Array of numbers or a Vector; returns a sub Vector.
    # TODO: support for option {boundscheck: true}
    def take(*indices)
      indices.flatten!
      return Vector.new([]) if indices.empty?

      # A single non-numeric argument (e.g. a Vector) is used as is.
      selector = indices.one? && !indices[0].is_a?(Numeric) ? indices[0] : indices
      selector = Vector.new(selector) unless selector.is_a?(Vector)

      take_by_vector(selector)
    end

    # Selection by booleans: a boolean Vector, an Arrow::BooleanArray or
    # plain true/false/nil values; returns a sub Vector.
    # TODO: support for option {null_selection_behavior: :drop}
    #
    # @raise [VectorTypeError] if the argument is not boolean.
    def filter(*booleans)
      booleans.flatten!
      return Vector.new([]) if booleans.empty?

      head = booleans[0]
      mask =
        if head.is_a?(Vector)
          raise VectorTypeError, 'Argument is not a boolean.' unless head.boolean?

          head.data
        elsif head.is_a?(Arrow::BooleanArray)
          head
        else
          raise VectorTypeError, 'Argument is not a boolean.' unless booleans?(booleans)

          Arrow::BooleanArray.new(booleans)
        end

      filter_by_array(mask)
    end

    # Selection by either indices or booleans, decided from the first
    # argument.
    #
    # @raise [VectorTypeError] if a Vector argument is neither numeric nor boolean.
    # @raise [VectorArgumentError] for any other unsupported argument.
    def [](*args)
      args.flatten!
      return Vector.new([]) if args.empty?

      head = args[0]
      if head.is_a?(Vector)
        return take_by_vector(head) if head.numeric?
        return filter_by_array(head.data) if head.boolean?

        raise VectorTypeError, "Argument must be numeric or boolean: #{head}"
      end
      return filter_by_array(head) if head.is_a?(Arrow::BooleanArray)

      array =
        if head.is_a?(Arrow::Array)
          head
        elsif head.is_a?(Numeric) || booleans?([head])
          Arrow::Array.new(args)
        else
          raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
        end
      return filter_by_array(array) if array.is_a?(Arrow::BooleanArray)

      candidate = Vector.new(array)
      return take_by_vector(candidate) if candidate.numeric?

      raise VectorArgumentError, "Invalid argument: #{args}"
    end

    # Boolean Vector produced by `data.is_in` for the given values.
    # @param values [Array, Arrow::Array, Vector]
    def is_in(*values)
      values.flatten!
      head = values[0]
      given =
        if head.is_a?(Vector)
          head.data
        elsif head.is_a?(Arrow::Array)
          head
        end
      # Plain values are wrapped in an array of the same class as the data.
      Vector.new(data.is_in(given || data.class.new(values)))
    end

    # Position of the first element equal to +element+, or nil.
    # Arrow's support required
    def index(element)
      to_a.index(element)
    end

    private

    # Accepts indices by numeric Vector; negative indices count from the tail.
    def take_by_vector(indices)
      raise VectorTypeError, "Indices must be numeric Vector: #{indices}" unless indices.numeric?
      raise VectorArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1

      from_head = (indices < 0).if_else(indices + size, indices) # normalize index from tail
      raise VectorArgumentError, "Index out of range: #{from_head.max}" if from_head.max >= size

      positions = Arrow::UInt64ArrayBuilder.build(from_head.data) # round to integer array

      Vector.new(find(:array_take).execute([data, positions]).value)
    end

    # Accepts booleans by Arrow::BooleanArray, which must be as long as self.
    def filter_by_array(boolean_array)
      raise VectorArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size

      Vector.new(find(:array_filter).execute([data, boolean_array]).value)
    end
  end
end