red_amber 0.1.3 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +31 -7
- data/CHANGELOG.md +214 -10
- data/Gemfile +4 -0
- data/README.md +117 -342
- data/benchmark/csv_load_penguins.yml +15 -0
- data/benchmark/drop_nil.yml +11 -0
- data/doc/DataFrame.md +854 -0
- data/doc/Vector.md +449 -0
- data/doc/image/arrow_table_new.png +0 -0
- data/doc/image/dataframe/assign.png +0 -0
- data/doc/image/dataframe/drop.png +0 -0
- data/doc/image/dataframe/pick.png +0 -0
- data/doc/image/dataframe/remove.png +0 -0
- data/doc/image/dataframe/rename.png +0 -0
- data/doc/image/dataframe/slice.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/doc/image/example_in_red_arrow.png +0 -0
- data/doc/image/tdr.png +0 -0
- data/doc/image/tdr_and_table.png +0 -0
- data/doc/image/tidy_data_in_TDR.png +0 -0
- data/doc/image/vector/binary_element_wise.png +0 -0
- data/doc/image/vector/unary_aggregation.png +0 -0
- data/doc/image/vector/unary_aggregation_w_option.png +0 -0
- data/doc/image/vector/unary_element_wise.png +0 -0
- data/doc/tdr.md +56 -0
- data/doc/tdr_ja.md +56 -0
- data/lib/red-amber.rb +27 -0
- data/lib/red_amber/data_frame.rb +91 -37
- data/lib/red_amber/{data_frame_output.rb → data_frame_displayable.rb} +49 -41
- data/lib/red_amber/data_frame_indexable.rb +38 -0
- data/lib/red_amber/data_frame_observation_operation.rb +11 -0
- data/lib/red_amber/data_frame_selectable.rb +155 -48
- data/lib/red_amber/data_frame_variable_operation.rb +137 -0
- data/lib/red_amber/helper.rb +61 -0
- data/lib/red_amber/vector.rb +69 -16
- data/lib/red_amber/vector_functions.rb +80 -45
- data/lib/red_amber/vector_selectable.rb +124 -0
- data/lib/red_amber/vector_updatable.rb +104 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +1 -16
- data/red_amber.gemspec +3 -6
- metadata +38 -9
@@ -0,0 +1,137 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-ins for the class DataFrame
|
5
|
+
module DataFrameVariableOperation
|
6
|
+
# pick up some variables to create sub DataFrame
|
7
|
+
# Pick up some variables (columns) to create a sub DataFrame.
#
# The picker is given either as arguments or as the return value of a
# block evaluated in the context of self (never both). It may be keys
# (Symbol or String) or booleans (one flag per variable).
#
# @param args [Array] keys or booleans; nested arrays are flattened
# @return [DataFrame] the picked variables; an empty DataFrame when
#   nothing is picked
# @raise [DataFrameArgumentError] if both arguments and a block are given,
#   or if the picker is neither keys nor booleans
def pick(*args, &block)
  picker = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

    picker = instance_eval(&block)
  end
  picker = [picker].flatten
  return DataFrame.new if picker.empty? || picker == [nil]

  picker = keys_by_booleans(picker) if booleans?(picker)
  # Booleans that select nothing (e.g. all false) must also give an empty
  # DataFrame instead of passing an empty key list to the table.
  return DataFrame.new if picker.empty?

  # DataFrame#[] creates a Vector when a single key is specified.
  # DataFrame#pick creates a DataFrame even with a single key.
  return DataFrame.new(@table[picker]) if sym_or_str?(picker)

  raise DataFrameArgumentError, "Invalid argument #{args}"
end
|
25
|
+
|
26
|
+
# drop some variables to create a sub DataFrame of the remaining variables
|
27
|
+
# Drop some variables (columns) to create a sub DataFrame of the rest.
#
# The dropper is given either as arguments or as the return value of a
# block evaluated in the context of self (never both). It may be keys
# (Symbol or String) or booleans (one flag per variable).
#
# @param args [Array] keys or booleans; nested arrays are flattened
# @return [DataFrame] the remaining variables; an empty DataFrame when
#   every variable is dropped
# @raise [DataFrameArgumentError] if both arguments and a block are given,
#   or if the dropper is neither keys nor booleans
def drop(*args, &block)
  dropper = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

    dropper = instance_eval(&block)
  end
  dropper = [dropper].flatten
  dropper = keys_by_booleans(dropper) if booleans?(dropper)
  # Validate the dropper itself; previously invalid arguments were
  # silently ignored because only the surviving keys were checked.
  raise DataFrameArgumentError, "Invalid argument #{args}" unless sym_or_str?(dropper)

  # Compare by name so that 'a' and :a drop the same variable,
  # in line with DataFrame#pick accepting both Symbols and Strings.
  names = dropper.map(&:to_s)
  picker = keys.reject { |key| names.include?(key.to_s) }
  return DataFrame.new if picker.empty?

  # DataFrame#[] creates a Vector when a single key is specified.
  # DataFrame#drop creates a DataFrame even with a single key.
  DataFrame.new(@table[picker])
end
|
46
|
+
|
47
|
+
# rename variables to create new DataFrame
|
48
|
+
# Rename variables (columns) to create a new DataFrame.
#
# Accepted forms (as arguments, or as the value returned by a block
# evaluated in the context of self):
#   rename(from, to)
#   rename({from => to, ...})
#
# @return [DataFrame] renamed DataFrame, or self when nothing is given
# @raise [DataFrameArgumentError] if both arguments and a block are given,
#   or if the renamer matches neither accepted form
def rename(*args, &block)
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?

    renamer = [instance_eval(&block)].flatten
  else
    renamer = args.flatten
  end
  return self if renamer.empty?

  if renamer.size == 2 && sym_or_str?(renamer) # rename(from, to)
    from, to = renamer
    return rename_by_hash({ from => to })
  end

  head = renamer[0]
  return rename_by_hash(head) if renamer.one? && head.is_a?(Hash) # rename({from => to})

  raise DataFrameArgumentError, "Invalid argument #{args}"
end
|
63
|
+
|
64
|
+
# assign variables to create new DataFrame
|
65
|
+
# Assign variables (columns) to create a new DataFrame.
#
# Takes a single Hash of {key => values}, given as an argument or as the
# return value of a block evaluated in the context of self (never both).
# Keys naming an existing variable replace it in place; other keys are
# appended as new variables.
#
# @return [DataFrame] new DataFrame, or self when nothing is given
# @raise [DataFrameArgumentError] if both arguments and a block are given,
#   or if the assigner is not a single Hash
def assign(*args, &block)
  assigner = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?

    assigner = instance_eval(&block)
  end
  assigner = [assigner].flatten
  return self if assigner.empty? || assigner == [nil]

  raise DataFrameArgumentError, "Invalid argument #{args}" unless assigner.one? && assigner[0].is_a?(Hash)

  updater = {}
  appender = {}
  assigner[0].each do |key, value|
    # Match by name so that 'a' and :a address the same existing variable;
    # otherwise a String key would be appended as a duplicate column.
    existing = keys.find { |k| k.to_s == key.to_s }
    if existing
      updater[existing] = value
    else
      appender[key] = value
    end
  end
  fields, arrays = update_fields_and_arrays(updater)
  append_to_fields_and_arrays(appender, fields, arrays) unless appender.empty?

  DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), arrays))
end
|
91
|
+
|
92
|
+
private
|
93
|
+
|
94
|
+
# Build a new DataFrame whose fields are renamed according to key_pairs.
#
# @param key_pairs [Hash] {from => to}; from/to may be Symbol or String
# @return [DataFrame] DataFrame sharing this table's columns under a new schema
def rename_by_hash(key_pairs)
  # Index by name so that String source keys match as well as Symbol ones
  # (DataFrame#rename accepts both); a plain key_pairs[key] lookup missed them.
  new_names = key_pairs.to_h { |from, to| [from.to_s, to] }
  fields = keys.map do |key|
    new_key = new_names[key.to_s]
    if new_key
      Arrow::Field.new(new_key.to_sym, @table[key].data_type)
    else
      # Variables that are not renamed keep their existing field.
      @table.schema[key]
    end
  end
  schema = Arrow::Schema.new(fields)
  DataFrame.new(Arrow::Table.new(schema, @table.columns))
end
|
106
|
+
|
107
|
+
# Return the table's fields and arrays with the variables named in
# updater replaced by the given data.
#
# @param updater [Hash] {existing key => new data (Vector or array-like)}
# @return [Array] a pair [fields, arrays]
# @raise [DataFrameArgumentError] if any data size differs from self.size
def update_fields_and_arrays(updater)
  columns = @table.columns
  fields = columns.map(&:field)
  arrays = columns.map(&:data) # chunked_arrays
  keys.each.with_index do |key, position|
    replacement = updater[key]
    next unless replacement

    if replacement.size != size
      raise DataFrameArgumentError, "Data size mismatch (#{replacement.size} != #{size})"
    end

    source = replacement.is_a?(Vector) ? replacement.to_a : replacement
    arrow_array = Arrow::Array.new(source)
    fields[position] = Arrow::Field.new(key, arrow_array.value_data_type)
    arrays[position] = Arrow::ChunkedArray.new([arrow_array])
  end
  [fields, arrays]
end
|
122
|
+
|
123
|
+
# Append a field and a chunked array for every entry of appender.
# Both fields and arrays are modified in place.
#
# @param appender [Hash] {new key => data (Vector or array-like)}
# @param fields [Array] field list to append to
# @param arrays [Array] array list to append to
# @raise [DataFrameArgumentError] if any data size differs from self.size
def append_to_fields_and_arrays(appender, fields, arrays)
  appender.each do |name, values|
    if values.size != size
      raise DataFrameArgumentError, "Data size mismatch (#{values.size} != #{size})"
    end

    source = values.is_a?(Vector) ? values.to_a : values
    arrow_array = Arrow::Array.new(source)
    fields << Arrow::Field.new(name.to_sym, arrow_array.value_data_type)
    arrays << Arrow::ChunkedArray.new([arrow_array])
  end
end
|
132
|
+
|
133
|
+
# Select the keys whose positional flag in booleans is truthy.
#
# @param booleans [Array] flags indexed like keys
# @return [Array] the selected keys, in their original order
def keys_by_booleans(booleans)
  keys.each_with_index
      .select { |_key, position| booleans[position] }
      .map(&:first)
end
|
136
|
+
end
|
137
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-in for the class DataFrame
|
5
|
+
module Helper
|
6
|
+
private
|
7
|
+
|
8
|
+
# Plural suffix for a count.
#
# @param num [Numeric] the count
# @return [String] 's' when num is greater than 1, otherwise ''
def pl(num)
  return 's' if num > 1

  ''
end
|
11
|
+
|
12
|
+
# Check whether any index lies outside -size...size.
#
# @param indeces [#min, #max] collection of indices
# @return [Boolean] true if the largest index is >= size or the smallest
#   is < -size; false for an empty collection
def out_of_range?(indeces)
  lowest = indeces.min
  # An empty collection has no minimum, so nothing can be out of range.
  # Without this guard the comparison below raised NoMethodError on nil.
  return false if lowest.nil?

  indeces.max >= size || lowest < -size
end
|
15
|
+
|
16
|
+
# @param enum [Enumerable] elements to inspect
# @return [Boolean] true if every element is an Integer
def integers?(enum)
  enum.all? { |element| element.is_a?(Integer) }
end
|
19
|
+
|
20
|
+
# @param enum [Enumerable] elements to inspect
# @return [Boolean] true if every element is a Symbol or a String
def sym_or_str?(enum)
  enum.all? do |element|
    case element
    when Symbol, String then true
    else false
    end
  end
end
|
23
|
+
|
24
|
+
# @param enum [Enumerable] elements to inspect
# @return [Boolean] true if every element is true, false or nil
def booleans?(enum)
  boolean_classes = [TrueClass, FalseClass, NilClass]
  enum.all? do |element|
    boolean_classes.any? { |klass| element.is_a?(klass) }
  end
end
|
27
|
+
|
28
|
+
# Wrap a single vector's data in a DataFrame under the given key.
#
# @param key the variable name for the new DataFrame
# @param vector [#data] object whose data becomes the only variable
# @return [DataFrame]
def create_dataframe_from_vector(key, vector)
  column = vector.data
  DataFrame.new(key => column)
end
|
31
|
+
|
32
|
+
# Expand every argument with normalize_element and collect the results
# into one Vector.
#
# @param args [Array] scalars, ranges or array-likes
# @return [Vector]
def parse_to_vector(args)
  elements = args.each_with_object([]) do |elem, collected|
    collected.concat(normalize_element(elem))
  end
  Vector.new(elements)
end
|
38
|
+
|
39
|
+
# Turn one selector element into an Array.
#
# * scalars (Numeric, String, Symbol, true, false, nil) are wrapped
# * Ranges with an Integer end point, or with both ends nil, are resolved
#   against the indices 0...size (negative values count from the tail)
# * other Ranges are expanded with to_a
# * anything else goes through Kernel#Array
#
# @raise [DataFrameArgumentError] if an end point of an index Range is
#   >= size or < -size
def normalize_element(elem)
  scalar_classes = [Numeric, String, Symbol, TrueClass, FalseClass, NilClass]
  return [elem] if scalar_classes.any? { |klass| elem.is_a?(klass) }
  return Array(elem) unless elem.is_a?(Range)

  last = elem.end
  # Make an exclusive Integer end inclusive before the bounds check.
  last -= 1 if elem.exclude_end? && last.is_a?(Integer)
  bounds = [elem.begin, last]
  return elem.to_a unless bounds.any?(Integer) || bounds.all?(&:nil?)

  if bounds.any? { |e| e&.>=(size) || e&.<(-size) }
    raise DataFrameArgumentError, "Index out of range: #{elem} for 0..#{size - 1}"
  end

  (0...size).to_a[elem]
end
|
60
|
+
end
|
61
|
+
end
|
data/lib/red_amber/vector.rb
CHANGED
@@ -1,27 +1,42 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Values in variable (columnar) data object
|
5
5
|
# @data : holds Arrow::ChunkedArray
|
6
6
|
class Vector
|
7
7
|
# mix-in
|
8
8
|
include VectorFunctions
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
when Array
|
18
|
-
@data = Arrow::Array.new(array)
|
9
|
+
include VectorUpdatable
|
10
|
+
include VectorSelectable
|
11
|
+
include Helper
|
12
|
+
|
13
|
+
# Create a Vector.
#
# The first argument (after flattening) decides how the data is taken:
# * Vector                            - its data is used as is
# * Arrow::Array, Arrow::ChunkedArray - used as is
# * Range                             - expanded to an Array first
# * anything else                     - all arguments form one Arrow::Array
# No argument, or nil as the first argument, creates an empty Vector.
#
# @raise [VectorArgumentError] if the arguments cannot be turned into an
#   Arrow::Array (only errors matching `Error` are translated)
def initialize(*array)
  @key = nil # default is 'headless'
  if array.empty? || array[0].nil?
    # Previously this built `Vector.new([])` and threw it away, leaving
    # @data nil. Store the same data that Vector.new([]) ends up with.
    @data = Arrow::Array.new([])
    return
  end

  array.flatten!
  case array[0]
  when Vector
    @data = array[0].data
  when Arrow::Array, Arrow::ChunkedArray
    @data = array[0]
  when Range
    @data = Arrow::Array.new(Array(array[0]))
  else
    begin
      @data = Arrow::Array.new(Array(array))
    rescue Error
      raise VectorArgumentError, "Invalid argument: #{array}"
    end
  end
end
|
23
37
|
|
24
38
|
attr_reader :data
|
39
|
+
attr_accessor :key
|
25
40
|
|
26
41
|
def to_s
|
27
42
|
@data.to_a.inspect
|
@@ -49,6 +64,16 @@ module RedAmber
|
|
49
64
|
alias_method :to_a, :values
|
50
65
|
alias_method :entries, :values
|
51
66
|
|
67
|
+
def indices
|
68
|
+
(0...size).to_a
|
69
|
+
end
|
70
|
+
alias_method :indexes, :indices
|
71
|
+
alias_method :indeces, :indices
|
72
|
+
|
73
|
+
def to_ary
|
74
|
+
to_a
|
75
|
+
end
|
76
|
+
|
52
77
|
def size
|
53
78
|
# only defined :length in Arrow?
|
54
79
|
@data.length
|
@@ -57,6 +82,10 @@ module RedAmber
|
|
57
82
|
alias_method :n_rows, :size
|
58
83
|
alias_method :nrow, :size
|
59
84
|
|
85
|
+
def empty?
|
86
|
+
size.zero?
|
87
|
+
end
|
88
|
+
|
60
89
|
def type
|
61
90
|
@data.value_type.nick.to_sym
|
62
91
|
end
|
@@ -66,15 +95,19 @@ module RedAmber
|
|
66
95
|
end
|
67
96
|
|
68
97
|
def numeric?
|
69
|
-
|
98
|
+
type_class < Arrow::NumericDataType
|
70
99
|
end
|
71
100
|
|
72
101
|
def string?
|
73
102
|
type == :string
|
74
103
|
end
|
75
104
|
|
76
|
-
def
|
77
|
-
|
105
|
+
def temporal?
|
106
|
+
type_class < Arrow::TemporalDataType
|
107
|
+
end
|
108
|
+
|
109
|
+
def type_class
|
110
|
+
@data.value_data_type.class
|
78
111
|
end
|
79
112
|
|
80
113
|
# def each() end
|
@@ -90,7 +123,23 @@ module RedAmber
|
|
90
123
|
# def each_chunk() end
|
91
124
|
|
92
125
|
# Count the occurrences of each value.
#
# For floating point data containing NaN, the counts of all NaN keys are
# merged into one Float::NAN entry placed after the other keys.
#
# @return [Hash] {value => count}
def tally
  counts = values.tally
  return counts unless (type_class < Arrow::FloatingPointDataType) && is_nan.any

  nan_total = 0
  counts.delete_if do |key, count|
    next false unless key.is_a?(Float) && key.nan?

    nan_total += count
    true
  end
  counts[Float::NAN] = nan_total
  counts
end
|
139
|
+
|
140
|
+
# Count the occurrences of each value with Arrow's :value_counts function.
#
# @return [Hash] pairs built by zipping the two fields of the function's result
def value_counts
  result = Arrow::Function.find(:value_counts).execute([data]).value
  uniques, counts = result.fields
  uniques.zip(counts).to_h
end
|
95
144
|
|
96
145
|
def n_nulls
|
@@ -101,5 +150,9 @@ module RedAmber
|
|
101
150
|
def n_nans
|
102
151
|
numeric? ? is_nan.to_a.count(true) : 0
|
103
152
|
end
|
153
|
+
|
154
|
+
def has_nil?
|
155
|
+
is_nil.any
|
156
|
+
end
|
104
157
|
end
|
105
158
|
end
|
@@ -12,32 +12,44 @@ module RedAmber
|
|
12
12
|
module VectorFunctions
|
13
13
|
# [Unary aggregations]: vector.func => scalar
|
14
14
|
unary_aggregations =
|
15
|
-
%i[all any approximate_median count count_distinct max mean min product stddev sum variance]
|
15
|
+
%i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
|
16
16
|
unary_aggregations.each do |function|
|
17
17
|
define_method(function) do |opts: nil|
|
18
|
-
|
19
|
-
|
18
|
+
datum = exec_func_unary(function, options: opts)
|
19
|
+
get_scalar(datum)
|
20
20
|
end
|
21
21
|
end
|
22
22
|
alias_method :median, :approximate_median
|
23
23
|
alias_method :count_uniq, :count_distinct
|
24
|
+
alias_method :all?, :all
|
25
|
+
alias_method :any?, :any
|
26
|
+
|
27
|
+
def unbiased_variance
|
28
|
+
variance(opts: { ddof: 1 })
|
29
|
+
end
|
30
|
+
alias_method :var, :unbiased_variance
|
31
|
+
|
32
|
+
def sd
|
33
|
+
stddev(opts: { ddof: 1 })
|
34
|
+
end
|
35
|
+
alias_method :std, :sd
|
24
36
|
|
25
37
|
# option(s) required
|
26
38
|
# - index
|
27
39
|
|
28
40
|
# Returns other than value
|
29
|
-
# - min_max
|
30
41
|
# - mode
|
31
42
|
# - quantile
|
32
43
|
# - tdigest
|
33
44
|
|
34
45
|
# [Unary element-wise]: vector.func => vector
|
35
46
|
unary_element_wise =
|
36
|
-
%i[abs atan bit_wise_not ceil cos floor is_finite
|
47
|
+
%i[abs array_sort_indices atan bit_wise_not ceil cos fill_null_backward fill_null_forward floor is_finite
|
48
|
+
is_inf is_nan is_null is_valid round round_to_multiple sign sin tan trunc unique]
|
37
49
|
unary_element_wise.each do |function|
|
38
50
|
define_method(function) do |opts: nil|
|
39
|
-
|
40
|
-
|
51
|
+
datum = exec_func_unary(function, options: opts)
|
52
|
+
Vector.new(datum.value)
|
41
53
|
end
|
42
54
|
end
|
43
55
|
alias_method :is_nil, :is_null
|
@@ -46,6 +58,14 @@ module RedAmber
|
|
46
58
|
numeric? ? (is_nil | is_nan) : is_nil
|
47
59
|
end
|
48
60
|
|
61
|
+
alias_method :fill_nil_backward, :fill_null_backward
|
62
|
+
alias_method :fill_nil_forward, :fill_null_forward
|
63
|
+
|
64
|
+
alias_method :sort_indexes, :array_sort_indices
|
65
|
+
alias_method :sort_indices, :array_sort_indices
|
66
|
+
|
67
|
+
alias_method :uniq, :unique
|
68
|
+
|
49
69
|
# [Unary element-wise with operator]: vector.func => vector, op vector
|
50
70
|
unary_element_wise_op = {
|
51
71
|
invert: '!',
|
@@ -53,20 +73,17 @@ module RedAmber
|
|
53
73
|
}
|
54
74
|
unary_element_wise_op.each do |function, operator|
|
55
75
|
define_method(function) do |opts: nil|
|
56
|
-
|
57
|
-
|
76
|
+
datum = exec_func_unary(function, options: opts)
|
77
|
+
Vector.new(datum.value)
|
58
78
|
end
|
59
79
|
|
60
80
|
define_method(operator) do |opts: nil|
|
61
|
-
|
62
|
-
|
81
|
+
datum = exec_func_unary(function, options: opts)
|
82
|
+
Vector.new(datum.value)
|
63
83
|
end
|
64
84
|
end
|
65
85
|
alias_method :not, :invert
|
66
86
|
|
67
|
-
# option(s) required
|
68
|
-
# - round, round_to_multiple
|
69
|
-
|
70
87
|
# NaN support needed
|
71
88
|
# - acos asin ln log10 log1p log2
|
72
89
|
|
@@ -79,8 +96,8 @@ module RedAmber
|
|
79
96
|
%i[atan2 and_not and_not_kleene bit_wise_and bit_wise_or bit_wise_xor]
|
80
97
|
binary_element_wise.each do |function|
|
81
98
|
define_method(function) do |other, opts: nil|
|
82
|
-
|
83
|
-
|
99
|
+
datum = exec_func_binary(function, other, options: opts)
|
100
|
+
Vector.new(datum.value)
|
84
101
|
end
|
85
102
|
end
|
86
103
|
|
@@ -95,8 +112,8 @@ module RedAmber
|
|
95
112
|
}
|
96
113
|
logical_binary_element_wise.each do |method, function|
|
97
114
|
define_method(method) do |other, opts: nil|
|
98
|
-
|
99
|
-
|
115
|
+
datum = exec_func_binary(function, other, options: opts)
|
116
|
+
Vector.new(datum.value)
|
100
117
|
end
|
101
118
|
end
|
102
119
|
|
@@ -128,13 +145,13 @@ module RedAmber
|
|
128
145
|
}
|
129
146
|
binary_element_wise_op.each do |function, operator|
|
130
147
|
define_method(function) do |other, opts: nil|
|
131
|
-
|
132
|
-
|
148
|
+
datum = exec_func_binary(function, other, options: opts)
|
149
|
+
Vector.new(datum.value)
|
133
150
|
end
|
134
151
|
|
135
152
|
define_method(operator) do |other, opts: nil|
|
136
|
-
|
137
|
-
|
153
|
+
datum = exec_func_binary(function, other, options: opts)
|
154
|
+
Vector.new(datum.value)
|
138
155
|
end
|
139
156
|
end
|
140
157
|
alias_method :eq, :equal
|
@@ -144,14 +161,20 @@ module RedAmber
|
|
144
161
|
alias_method :lt, :less
|
145
162
|
alias_method :ne, :not_equal
|
146
163
|
|
164
|
+
# Coercion hook so that `other op vector` works with a non-Vector
# on the left-hand side.
#
# A Vector, Array or Arrow::Array must have the same length as self and is
# converted element for element; any other object is repeated size times.
#
# @param other [Object] the left-hand operand
# @return [Array] a pair [Vector made from other, self]
# @raise [VectorArgumentError] if an array-like operand differs in length
def coerce(other)
  case other
  when Vector, Array, Arrow::Array
    raise VectorArgumentError, "Size unmatch: #{size} != #{other.length}" unless size == other.length

    [Vector.new(Array(other)), self]
  else
    # Without this `else` the array-like result above was discarded and
    # every operand, arrays included, was repeated size times.
    [Vector.new(Array(other) * size), self]
  end
end
|
173
|
+
|
147
174
|
# (array functions)
|
148
|
-
#
|
149
|
-
# dictionary_encode, hash_all, hash_any, hash_approximate_median,
|
150
|
-
# hash_count, hash_count_distinct, hash_distinct, hash_max, hash_mean, hash_min,
|
151
|
-
# hash_min_max, hash_product, hash_stddev, hash_sum, hash_tdigest, hash_variance,
|
175
|
+
# dictionary_encode,
|
152
176
|
# partition_nth_indices,
|
153
|
-
# quarter, quarters_between,
|
154
|
-
# value_counts
|
177
|
+
# quarter, quarters_between,
|
155
178
|
|
156
179
|
# (strings)
|
157
180
|
# ascii_capitalize, ascii_center, ascii_is_alnum, ascii_is_alpha, ascii_is_decimal,
|
@@ -180,44 +203,56 @@ module RedAmber
|
|
180
203
|
# strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
|
181
204
|
|
182
205
|
# (conditional)
|
183
|
-
# case_when, cast,
|
206
|
+
# case_when, cast,
|
184
207
|
|
185
208
|
# (indices)
|
186
209
|
# choose, index_in, index_in_meta_binary, indices_nonzero
|
187
210
|
|
188
211
|
# (others)
|
189
|
-
# coalesce,
|
190
|
-
#
|
212
|
+
# coalesce,
|
213
|
+
# is_in_meta_binary,
|
191
214
|
# list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
|
192
|
-
# max_element_wise, min_element_wise, random,
|
193
|
-
#
|
215
|
+
# max_element_wise, min_element_wise, random, select_k_unstable,
|
216
|
+
# struct_field,
|
194
217
|
|
195
218
|
private # =======
|
196
219
|
|
197
220
|
def exec_func_unary(function, options: nil)
|
198
|
-
|
199
|
-
func.execute([data], options)
|
221
|
+
find(function).execute([data], options)
|
200
222
|
end
|
201
223
|
|
202
224
|
# Execute a binary Arrow function with self's data as the first operand.
#
# @param function [Symbol] name of the Arrow function
# @param other [Vector, Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar,
#   Array, Numeric, String, true, false] second operand; a Vector is
#   unwrapped to its data
# @param options passed through to the function
# @return the result of executing the function
# @raise [VectorArgumentError] if the operand type is not supported
def exec_func_binary(function, other, options: nil)
  operand =
    case other
    when Vector
      other.data
    when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
      other
    else
      raise VectorArgumentError, "Operand is not supported: #{other.class}"
    end
  find(function).execute([data, operand], options)
end
|
213
234
|
|
214
|
-
def
|
215
|
-
output =
|
216
|
-
|
235
|
+
# Convert the scalar held by datum into a Ruby value.
#
# * Arrow::StringScalar -> String via to_s
# * Arrow::StructScalar -> Array with each member converted the same way
# * anything else       -> its value
def get_scalar(datum)
  scalar = datum.value
  return scalar.to_s if scalar.is_a?(Arrow::StringScalar)
  return scalar.value unless scalar.is_a?(Arrow::StructScalar)

  scalar.value.map do |member|
    member.is_a?(Arrow::StringScalar) ? member.to_s : member.value
  end
end
|
245
|
+
|
246
|
+
module_function # ======
|
247
|
+
|
248
|
+
def find(function_name)
|
249
|
+
Arrow::Function.find(function_name)
|
217
250
|
end
|
218
251
|
|
219
|
-
|
220
|
-
|
252
|
+
# temporary API until RedAmber document prepared.
|
253
|
+
# Return a short text description of an Arrow function: the function's
# string form, a dashed rule as long as function_name, and the
# description from its doc.
#
# @param function_name [Symbol, String] name of the Arrow function
# @return [String]
def arrow_doc(function_name)
  function = find(function_name)
  rule = '-' * function_name.size
  description = function.doc.description
  "#{function}\n#{rule}\n#{description}"
end
|
222
257
|
end
|
223
258
|
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
|
4
|
+
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
5
|
+
|
6
|
+
module RedAmber
|
7
|
+
# mix-ins for class Vector
|
8
|
+
# Functions to select some data.
|
9
|
+
module VectorSelectable
|
10
|
+
# Apply Arrow's :drop_null function to the data.
#
# @return [Vector] a new Vector built from the function's result
def drop_nil
  Vector.new(find(:drop_null).execute([data]).value)
end
|
14
|
+
|
15
|
+
# vector calculation version of selection by indices
|
16
|
+
# TODO: support for option {boundscheck: true}
|
17
|
+
# Select elements by indices (vector calculation version).
#
# @param indices [Array] numeric indices, or a single non-numeric object
#   (such as a Vector) that holds the indices
# @return [Vector] the selected sub Vector; empty when no index is given
def take(*indices)
  indices.flatten!
  return Vector.new([]) if indices.empty?

  # A lone non-numeric argument is taken as the index container itself.
  indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
  index_vector = indices.is_a?(Vector) ? indices : Vector.new(indices)

  take_by_vector(index_vector) # returns sub Vector
end
|
26
|
+
|
27
|
+
# TODO: support for option {null_selection_behavior: :drop}
|
28
|
+
# Select elements by booleans.
#
# @param booleans [Array] true/false/nil values, or a boolean Vector or
#   an Arrow::BooleanArray as the first argument
# @return [Vector] the selected sub Vector; empty when nothing is given
# @raise [VectorTypeError] if the argument is not boolean
def filter(*booleans)
  booleans.flatten!
  return Vector.new([]) if booleans.empty?

  head = booleans[0]
  if head.is_a?(Vector)
    raise VectorTypeError, 'Argument is not a boolean.' unless head.boolean?

    mask = head.data
  elsif head.is_a?(Arrow::BooleanArray)
    mask = head
  else
    raise VectorTypeError, 'Argument is not a boolean.' unless booleans?(booleans)

    mask = Arrow::BooleanArray.new(booleans)
  end

  filter_by_array(mask) # returns sub Vector
end
|
49
|
+
|
50
|
+
# @param indices
|
51
|
+
# @param booleans
|
52
|
+
# Select elements by indices or by booleans.
#
# A numeric Vector, a non-boolean Arrow::Array or numeric literals select
# by index; a boolean Vector, an Arrow::BooleanArray or boolean literals
# select by filtering.
#
# @return [Vector] the selected sub Vector; empty when nothing is given
# @raise [VectorTypeError] if a Vector argument is neither numeric nor boolean
# @raise [VectorArgumentError] for any other unsupported argument
def [](*args)
  args.flatten!
  return Vector.new([]) if args.empty?

  selector = args[0]
  if selector.is_a?(Vector)
    return take_by_vector(selector) if selector.numeric?
    return filter_by_array(selector.data) if selector.boolean?

    raise VectorTypeError, "Argument must be numeric or boolean: #{selector}"
  end
  return filter_by_array(selector) if selector.is_a?(Arrow::BooleanArray)

  if selector.is_a?(Arrow::Array)
    array = selector
  elsif selector.is_a?(Numeric) || booleans?([selector])
    # Literal arguments: let Arrow decide the array type from all of them.
    array = Arrow::Array.new(args)
  else
    raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
  end
  return filter_by_array(array) if array.is_a?(Arrow::BooleanArray)

  vector = Vector.new(array)
  return take_by_vector(vector) if vector.numeric?

  raise VectorArgumentError, "Invalid argument: #{args}"
end
|
80
|
+
|
81
|
+
# @param values [Array, Arrow::Array, Vector]
|
82
|
+
# Test each element for membership in values.
#
# @param values [Array, Arrow::Array, Vector]
# @return [Vector] built from the result of data.is_in
def is_in(*values)
  values.flatten!
  head = values[0]
  candidates =
    if head.is_a?(Vector)
      head.data
    elsif head.is_a?(Arrow::Array)
      head
    end
  # Plain values are wrapped in the same class as self's data.
  candidates ||= data.class.new(values)
  Vector.new(data.is_in(candidates))
end
|
94
|
+
|
95
|
+
# Arrow's support required
|
96
|
+
# Position of the first element equal to the argument, searched in the
# Array form of this Vector (Arrow's support required).
#
# @param element the value to look for
# @return [Integer, nil] zero-based position, or nil if not found
def index(element)
  to_a.find_index(element)
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
# Accepts indices by numeric Vector
|
103
|
+
# Select elements by a numeric Vector of indices.
# Negative indices count from the tail.
#
# @param indices [Vector] numeric Vector
# @return [Vector] the selected sub Vector
# @raise [VectorTypeError] if indices is not numeric
# @raise [VectorArgumentError] if an index is out of range
def take_by_vector(indices)
  raise VectorTypeError, "Indices must be numeric Vector: #{indices}" unless indices.numeric?
  raise VectorArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1

  # normalize index from tail
  negative = indices < 0
  normalized_indices = negative.if_else(indices + size, indices)
  raise VectorArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size

  # round to integer array
  index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data)

  Vector.new(find(:array_take).execute([data, index_array]).value)
end
|
115
|
+
|
116
|
+
# Accepts booleans by Arrow::BooleanArray
|
117
|
+
# Select elements by an Arrow::BooleanArray.
#
# @param boolean_array [Arrow::BooleanArray] must be as long as self
# @return [Vector] the selected sub Vector
# @raise [VectorArgumentError] if the lengths differ
def filter_by_array(boolean_array)
  unless boolean_array.length == size
    raise VectorArgumentError, 'Booleans must be same size as self.'
  end

  Vector.new(find(:array_filter).execute([data, boolean_array]).value)
end
|
123
|
+
end
|
124
|
+
end
|