red_amber 0.1.3 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +31 -7
- data/CHANGELOG.md +214 -10
- data/Gemfile +4 -0
- data/README.md +117 -342
- data/benchmark/csv_load_penguins.yml +15 -0
- data/benchmark/drop_nil.yml +11 -0
- data/doc/DataFrame.md +854 -0
- data/doc/Vector.md +449 -0
- data/doc/image/arrow_table_new.png +0 -0
- data/doc/image/dataframe/assign.png +0 -0
- data/doc/image/dataframe/drop.png +0 -0
- data/doc/image/dataframe/pick.png +0 -0
- data/doc/image/dataframe/remove.png +0 -0
- data/doc/image/dataframe/rename.png +0 -0
- data/doc/image/dataframe/slice.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/doc/image/example_in_red_arrow.png +0 -0
- data/doc/image/tdr.png +0 -0
- data/doc/image/tdr_and_table.png +0 -0
- data/doc/image/tidy_data_in_TDR.png +0 -0
- data/doc/image/vector/binary_element_wise.png +0 -0
- data/doc/image/vector/unary_aggregation.png +0 -0
- data/doc/image/vector/unary_aggregation_w_option.png +0 -0
- data/doc/image/vector/unary_element_wise.png +0 -0
- data/doc/tdr.md +56 -0
- data/doc/tdr_ja.md +56 -0
- data/lib/red-amber.rb +27 -0
- data/lib/red_amber/data_frame.rb +91 -37
- data/lib/red_amber/{data_frame_output.rb → data_frame_displayable.rb} +49 -41
- data/lib/red_amber/data_frame_indexable.rb +38 -0
- data/lib/red_amber/data_frame_observation_operation.rb +11 -0
- data/lib/red_amber/data_frame_selectable.rb +155 -48
- data/lib/red_amber/data_frame_variable_operation.rb +137 -0
- data/lib/red_amber/helper.rb +61 -0
- data/lib/red_amber/vector.rb +69 -16
- data/lib/red_amber/vector_functions.rb +80 -45
- data/lib/red_amber/vector_selectable.rb +124 -0
- data/lib/red_amber/vector_updatable.rb +104 -0
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +1 -16
- data/red_amber.gemspec +3 -6
- metadata +38 -9
@@ -0,0 +1,137 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-ins for the class DataFrame
|
5
|
+
module DataFrameVariableOperation
|
6
|
+
# pick up some variables to create sub DataFrame
|
7
|
+
# Select variables (columns) and return them as a new DataFrame.
#
# Accepts either positional arguments or a block (evaluated with
# instance_eval), never both. The resolved picker is flattened and then:
# - empty or [nil]      -> DataFrame.new with no argument,
# - all booleans / nil  -> keys at truthy positions are picked,
# - all Symbol / String -> used to index @table.
#
# @raise [DataFrameArgumentError] when both arguments and a block are given,
#   or when the picker matches none of the accepted forms.
def pick(*args, &block)
  picker = args
  if block
    raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?

    picker = instance_eval(&block)
  end
  picker = [picker].flatten
  return DataFrame.new if picker.empty? || picker == [nil]

  picker = keys_by_booleans(picker) if booleans?(picker)

  # DataFrame#[] creates a Vector when a single key is specified;
  # DataFrame#pick creates a DataFrame even with a single key.
  return DataFrame.new(@table[picker]) if sym_or_str?(picker)

  # Report the resolved picker: `args` is always empty in the block form,
  # which made the message useless there.
  raise DataFrameArgumentError, "Invalid argument #{picker}"
end
|
25
|
+
|
26
|
+
# drop some variables to create a DataFrame of the remaining variables
|
27
|
+
# Return a new DataFrame without the specified variables (columns).
#
# The dropper comes from positional arguments or from a block (evaluated
# with instance_eval), never both. An all-boolean dropper is first
# translated to keys by position. The remaining keys are `keys - dropper`.
#
# @raise [DataFrameArgumentError] when both arguments and a block are given,
#   or when the remaining keys are not all Symbols/Strings.
def drop(*args, &block)
  if block && !args.empty?
    raise DataFrameArgumentError, 'Must not specify both arguments and block.'
  end

  dropper = [block ? instance_eval(&block) : args].flatten
  dropper = keys_by_booleans(dropper) if booleans?(dropper)
  remaining = keys - dropper

  # DataFrame#[] creates a Vector when a single key is specified;
  # DataFrame#drop creates a DataFrame even with a single key.
  if remaining.empty?
    DataFrame.new
  elsif sym_or_str?(remaining)
    DataFrame.new(@table[remaining])
  else
    raise DataFrameArgumentError, "Invalid argument #{args}"
  end
end
|
46
|
+
|
47
|
+
# rename variables to create new DataFrame
|
48
|
+
# Return a new DataFrame with variables (columns) renamed.
#
# Accepted forms, given as arguments or returned from a block (evaluated
# with instance_eval), never both:
# - nothing             -> self is returned,
# - from, to            -> two Symbols/Strings,
# - { from => to, ... } -> a single Hash.
#
# @raise [DataFrameArgumentError] when both arguments and a block are given,
#   or when the renamer matches none of the forms above.
def rename(*args, &block)
  if block && !args.empty?
    raise DataFrameArgumentError, 'Must not specify both arguments and a block'
  end

  renamer = [block ? instance_eval(&block) : args].flatten
  if renamer.empty?
    self
  elsif renamer.size == 2 && sym_or_str?(renamer)
    rename_by_hash([renamer].to_h) # rename(from, to)
  elsif renamer.one? && renamer[0].is_a?(Hash)
    rename_by_hash(renamer[0]) # rename({from => to})
  else
    raise DataFrameArgumentError, "Invalid argument #{args}"
  end
end
|
63
|
+
|
64
|
+
# assign variables to create new DataFrame
|
65
|
+
# Return a new DataFrame with variables (columns) updated or appended.
#
# The assigner is a single Hash of { key => values }, given as an argument
# or returned from a block (evaluated with instance_eval), never both.
# Entries whose key is already in `keys` replace that column; the others
# are appended after the existing columns.
#
# @return self when the assigner is empty or [nil].
# @raise [DataFrameArgumentError] when both arguments and a block are given,
#   or when the assigner is not a single Hash.
def assign(*args, &block)
  if block && !args.empty?
    raise DataFrameArgumentError, 'Must not specify both arguments and a block'
  end

  assigner = [block ? instance_eval(&block) : args].flatten
  return self if assigner.empty? || assigner == [nil]

  raise DataFrameArgumentError, "Invalid argument #{args}" unless assigner.one? && assigner[0].is_a?(Hash)

  # Split the hash into replacements for existing keys and brand-new keys.
  updater, appender = assigner[0].partition { |key, _| keys.include?(key) }.map(&:to_h)
  fields, arrays = update_fields_and_arrays(updater)
  append_to_fields_and_arrays(appender, fields, arrays) unless appender.empty?

  DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), arrays))
end
|
91
|
+
|
92
|
+
private
|
93
|
+
|
94
|
+
# Build a new DataFrame whose schema uses the new names from key_pairs.
#
# For every key in `keys`: when key_pairs has a truthy entry for it, a new
# Arrow::Field is created with that name (converted by to_sym) and the
# column's data type; otherwise the existing schema field is reused.
# The column data (@table.columns) is passed through unchanged.
def rename_by_hash(key_pairs)
  renamed_fields = keys.map do |key|
    replacement = key_pairs[key]
    next @table.schema[key] unless replacement

    Arrow::Field.new(replacement.to_sym, @table[key].data_type)
  end
  DataFrame.new(Arrow::Table.new(Arrow::Schema.new(renamed_fields), @table.columns))
end
|
106
|
+
|
107
|
+
# Collect the fields and data of every column of @table, substituting the
# columns whose key has a truthy entry in updater.
#
# @param updater [Hash] key => replacement values (a Vector is converted
#   with to_a before being handed to Arrow::Array.new).
# @return [Array] a pair [fields, arrays].
# @raise [DataFrameArgumentError] when a replacement's size differs from size.
def update_fields_and_arrays(updater)
  fields = @table.columns.map(&:field)
  arrays = @table.columns.map(&:data) # chunked_arrays
  keys.each_with_index do |key, position|
    replacement = updater[key]
    next unless replacement

    if replacement.size != size
      raise DataFrameArgumentError, "Data size mismatch (#{replacement.size} != #{size})"
    end

    raw_values = replacement.is_a?(Vector) ? replacement.to_a : replacement
    arrow_array = Arrow::Array.new(raw_values)
    fields[position] = Arrow::Field.new(key, arrow_array.value_data_type)
    arrays[position] = Arrow::ChunkedArray.new([arrow_array])
  end
  [fields, arrays]
end
|
122
|
+
|
123
|
+
# Append one field and one chunked array per entry of appender to the given
# fields and arrays (both are mutated in place).
#
# @param appender [Hash] key => values (a Vector is converted with to_a).
# @raise [DataFrameArgumentError] when an entry's size differs from size.
def append_to_fields_and_arrays(appender, fields, arrays)
  appender.each_pair do |key, addition|
    if addition.size != size
      raise DataFrameArgumentError, "Data size mismatch (#{addition.size} != #{size})"
    end

    raw_values = addition.is_a?(Vector) ? addition.to_a : addition
    arrow_array = Arrow::Array.new(raw_values)
    fields.push(Arrow::Field.new(key.to_sym, arrow_array.value_data_type))
    arrays.push(Arrow::ChunkedArray.new([arrow_array]))
  end
end
|
132
|
+
|
133
|
+
# Return the keys whose position holds a truthy value in booleans.
# Positions beyond the end of booleans count as not selected.
def keys_by_booleans(booleans)
  keys.each_with_index.select { |_, position| booleans[position] }.map(&:first)
end
|
136
|
+
end
|
137
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-in for the class DataFrame
|
5
|
+
module Helper
|
6
|
+
private
|
7
|
+
|
8
|
+
# Plural suffix helper: 's' when num is greater than 1, otherwise ''.
def pl(num)
  return 's' if num > 1

  ''
end
|
11
|
+
|
12
|
+
# True when the largest index is >= size or the smallest index is < -size.
def out_of_range?(indeces)
  lowest, highest = indeces.minmax
  highest >= size || lowest < -size
end
|
15
|
+
|
16
|
+
# True when every element of enum is an Integer (also true for empty enum).
def integers?(enum)
  enum.all? { |element| element.is_a?(Integer) }
end
|
19
|
+
|
20
|
+
# True when every element of enum is a Symbol or a String.
def sym_or_str?(enum)
  enum.all? do |element|
    case element
    when Symbol, String then true
    else false
    end
  end
end
|
23
|
+
|
24
|
+
# True when every element of enum is true, false or nil.
def booleans?(enum)
  enum.all? do |element|
    case element
    when TrueClass, FalseClass, NilClass then true
    else false
    end
  end
end
|
27
|
+
|
28
|
+
# Wrap a single vector's data into a DataFrame under the given key.
def create_dataframe_from_vector(key, vector)
  column = vector.data
  DataFrame.new(key => column)
end
|
31
|
+
|
32
|
+
# Normalize every element of args with normalize_element, concatenate the
# results in order, and wrap them in a Vector.
def parse_to_vector(args)
  collected = args.each_with_object([]) do |elem, acc|
    acc.concat(normalize_element(elem))
  end
  Vector.new(collected)
end
|
38
|
+
|
39
|
+
# Convert one selector element into an Array.
#
# - Numeric, String, Symbol, true, false, nil -> [elem]
# - Range with an Integer end point, or with both ends nil -> the matching
#   slice of (0...size).to_a, so negative and open-ended bounds follow
#   Array#[] semantics
# - any other Range -> elem.to_a
# - anything else -> Array(elem)
#
# @raise [DataFrameArgumentError] when a bound of an index Range is
#   >= size or < -size.
def normalize_element(elem)
  scalar_types = [Numeric, String, Symbol, TrueClass, FalseClass, NilClass]
  return [elem] if scalar_types.any? { |klass| elem.is_a?(klass) }
  return Array(elem) unless elem.is_a?(Range)

  bounds = [elem.begin, elem.end]
  # Make an exclusive Integer end inclusive so the bound check sees the last index.
  bounds[1] -= 1 if elem.exclude_end? && elem.end.is_a?(Integer)
  return elem.to_a unless bounds.any?(Integer) || bounds.all?(&:nil?)

  if bounds.any? { |bound| !bound.nil? && (bound >= size || bound < -size) }
    raise DataFrameArgumentError, "Index out of range: #{elem} for 0..#{size - 1}"
  end

  (0...size).to_a[elem]
end
|
60
|
+
end
|
61
|
+
end
|
data/lib/red_amber/vector.rb
CHANGED
@@ -1,27 +1,42 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Values in variable (columnar) data object
|
5
5
|
# @data : holds Arrow::ChunkedArray
|
6
6
|
class Vector
|
7
7
|
# mix-in
|
8
8
|
include VectorFunctions
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
when Array
|
18
|
-
@data = Arrow::Array.new(array)
|
9
|
+
include VectorUpdatable
|
10
|
+
include VectorSelectable
|
11
|
+
include Helper
|
12
|
+
|
13
|
+
# Build a Vector from zero or more source values.
#
# How @data is set:
# - no argument, or a leading nil -> Arrow::Array.new([])
# - first (flattened) element is a Vector -> that vector's data
# - Arrow::Array / Arrow::ChunkedArray -> used as is
# - Range -> expanded with Array() and converted by Arrow::Array.new
# - anything else -> the flattened values converted by Arrow::Array.new
#
# @raise [VectorArgumentError] when the final conversion raises Error.
def initialize(*array)
  @key = nil # default is 'headless'
  if array.empty? || array[0].nil?
    # Previously a throwaway Vector.new([]) was created here and @data of
    # the object being initialized was left nil.
    @data = Arrow::Array.new([])
    return
  end

  array.flatten!
  case array[0]
  when Vector
    @data = array[0].data
    return
  when Arrow::Array, Arrow::ChunkedArray
    @data = array[0]
    return
  when Range
    @data = Arrow::Array.new(Array(array[0]))
    return
  end
  begin
    @data = Arrow::Array.new(Array(array))
  rescue Error
    raise VectorArgumentError, "Invalid argument: #{array}"
  end
end
|
23
37
|
|
24
38
|
attr_reader :data
|
39
|
+
attr_accessor :key
|
25
40
|
|
26
41
|
def to_s
|
27
42
|
@data.to_a.inspect
|
@@ -49,6 +64,16 @@ module RedAmber
|
|
49
64
|
alias_method :to_a, :values
|
50
65
|
alias_method :entries, :values
|
51
66
|
|
67
|
+
# Array of positions 0, 1, ..., size - 1.
def indices
  Array.new(size) { |position| position }
end
|
70
|
+
alias_method :indexes, :indices
|
71
|
+
alias_method :indeces, :indices
|
72
|
+
|
73
|
+
# Implicit Array conversion; delegates to to_a.
def to_ary
  to_a
end
|
76
|
+
|
52
77
|
def size
|
53
78
|
# only defined :length in Arrow?
|
54
79
|
@data.length
|
@@ -57,6 +82,10 @@ module RedAmber
|
|
57
82
|
alias_method :n_rows, :size
|
58
83
|
alias_method :nrow, :size
|
59
84
|
|
85
|
+
# True when the vector has no elements (size is 0).
def empty?
  size == 0 # rubocop:disable Style/NumericPredicate
end
|
88
|
+
|
60
89
|
# Short type name of the underlying data as a Symbol
# (the `nick` of @data.value_type).
def type
  @data.value_type.nick.to_sym
end
|
@@ -66,15 +95,19 @@ module RedAmber
|
|
66
95
|
end
|
67
96
|
|
68
97
|
# Truthy when type_class is a subclass of Arrow::NumericDataType.
def numeric?
  type_class < Arrow::NumericDataType
end
|
71
100
|
|
72
101
|
# True when type is :string.
def string?
  type == :string
end
|
75
104
|
|
76
|
-
def
|
77
|
-
|
105
|
+
# Truthy when type_class is a subclass of Arrow::TemporalDataType.
def temporal?
  type_class < Arrow::TemporalDataType
end
|
108
|
+
|
109
|
+
# Class of the value data type of the underlying data.
def type_class
  @data.value_data_type.class
end
|
79
112
|
|
80
113
|
# def each() end
|
@@ -90,7 +123,23 @@ module RedAmber
|
|
90
123
|
# def each_chunk() end
|
91
124
|
|
92
125
|
# Count occurrences of each value, returning { value => count }.
#
# For floating point data containing NaN, every NaN key produced by
# values.tally is removed and their counts are summed under a single
# Float::NAN key placed last.
def tally
  counts = values.tally
  return counts unless (type_class < Arrow::FloatingPointDataType) && is_nan.any

  nan_total = 0
  merged = counts.reject do |key, count|
    nan_key = key.is_a?(Float) && key.nan?
    nan_total += count if nan_key
    nan_key
  end
  merged[Float::NAN] = nan_total
  merged
end
|
139
|
+
|
140
|
+
# Count occurrences using Arrow's :value_counts function and return them
# as a Hash of { value => count }.
def value_counts
  uniques, occurrences = Arrow::Function.find(:value_counts).execute([data]).value.fields
  uniques.zip(occurrences).to_h
end
|
95
144
|
|
96
145
|
def n_nulls
|
@@ -101,5 +150,9 @@ module RedAmber
|
|
101
150
|
# Number of NaN elements; always 0 for non-numeric vectors.
def n_nans
  return 0 unless numeric?

  is_nan.to_a.count(true)
end
|
153
|
+
|
154
|
+
# Whether any element is nil: the `any` aggregation of is_nil.
def has_nil?
  is_nil.any
end
|
104
157
|
end
|
105
158
|
end
|
@@ -12,32 +12,44 @@ module RedAmber
|
|
12
12
|
module VectorFunctions
|
13
13
|
# [Unary aggregations]: vector.func => scalar
|
14
14
|
# Define one instance method per function name listed below.
# Each generated method accepts an optional `opts:` keyword, runs the
# function through exec_func_unary and unwraps the result with get_scalar.
unary_aggregations =
  %i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
unary_aggregations.each do |function|
  define_method(function) do |opts: nil|
    datum = exec_func_unary(function, options: opts)
    get_scalar(datum)
  end
end
|
22
22
|
alias_method :median, :approximate_median
|
23
23
|
alias_method :count_uniq, :count_distinct
|
24
|
+
alias_method :all?, :all
|
25
|
+
alias_method :any?, :any
|
26
|
+
|
27
|
+
# Variance computed with the option ddof: 1.
def unbiased_variance
  variance(opts: { ddof: 1 })
end
|
30
|
+
alias_method :var, :unbiased_variance
|
31
|
+
|
32
|
+
# Standard deviation computed with the option ddof: 1.
def sd
  stddev(opts: { ddof: 1 })
end
|
35
|
+
alias_method :std, :sd
|
24
36
|
|
25
37
|
# option(s) required
|
26
38
|
# - index
|
27
39
|
|
28
40
|
# Returns other than value
|
29
|
-
# - min_max
|
30
41
|
# - mode
|
31
42
|
# - quantile
|
32
43
|
# - tdigest
|
33
44
|
|
34
45
|
# [Unary element-wise]: vector.func => vector
|
35
46
|
# Define one instance method per function name listed below.
# Each generated method accepts an optional `opts:` keyword, runs the
# function through exec_func_unary and wraps the resulting value in a Vector.
unary_element_wise =
  %i[abs array_sort_indices atan bit_wise_not ceil cos fill_null_backward fill_null_forward floor is_finite
     is_inf is_nan is_null is_valid round round_to_multiple sign sin tan trunc unique]
unary_element_wise.each do |function|
  define_method(function) do |opts: nil|
    datum = exec_func_unary(function, options: opts)
    Vector.new(datum.value)
  end
end
|
43
55
|
alias_method :is_nil, :is_null
|
@@ -46,6 +58,14 @@ module RedAmber
|
|
46
58
|
numeric? ? (is_nil | is_nan) : is_nil
|
47
59
|
end
|
48
60
|
|
61
|
+
alias_method :fill_nil_backward, :fill_null_backward
|
62
|
+
alias_method :fill_nil_forward, :fill_null_forward
|
63
|
+
|
64
|
+
alias_method :sort_indexes, :array_sort_indices
|
65
|
+
alias_method :sort_indices, :array_sort_indices
|
66
|
+
|
67
|
+
alias_method :uniq, :unique
|
68
|
+
|
49
69
|
# [Unary element-wise with operator]: vector.func => vector, op vector
|
50
70
|
unary_element_wise_op = {
|
51
71
|
invert: '!',
|
@@ -53,20 +73,17 @@ module RedAmber
|
|
53
73
|
}
|
54
74
|
unary_element_wise_op.each do |function, operator|
|
55
75
|
define_method(function) do |opts: nil|
|
56
|
-
|
57
|
-
|
76
|
+
datum = exec_func_unary(function, options: opts)
|
77
|
+
Vector.new(datum.value)
|
58
78
|
end
|
59
79
|
|
60
80
|
define_method(operator) do |opts: nil|
|
61
|
-
|
62
|
-
|
81
|
+
datum = exec_func_unary(function, options: opts)
|
82
|
+
Vector.new(datum.value)
|
63
83
|
end
|
64
84
|
end
|
65
85
|
alias_method :not, :invert
|
66
86
|
|
67
|
-
# option(s) required
|
68
|
-
# - round, round_to_multiple
|
69
|
-
|
70
87
|
# NaN support needed
|
71
88
|
# - acos asin ln log10 log1p log2
|
72
89
|
|
@@ -79,8 +96,8 @@ module RedAmber
|
|
79
96
|
%i[atan2 and_not and_not_kleene bit_wise_and bit_wise_or bit_wise_xor]
|
80
97
|
binary_element_wise.each do |function|
|
81
98
|
define_method(function) do |other, opts: nil|
|
82
|
-
|
83
|
-
|
99
|
+
datum = exec_func_binary(function, other, options: opts)
|
100
|
+
Vector.new(datum.value)
|
84
101
|
end
|
85
102
|
end
|
86
103
|
|
@@ -95,8 +112,8 @@ module RedAmber
|
|
95
112
|
}
|
96
113
|
logical_binary_element_wise.each do |method, function|
|
97
114
|
define_method(method) do |other, opts: nil|
|
98
|
-
|
99
|
-
|
115
|
+
datum = exec_func_binary(function, other, options: opts)
|
116
|
+
Vector.new(datum.value)
|
100
117
|
end
|
101
118
|
end
|
102
119
|
|
@@ -128,13 +145,13 @@ module RedAmber
|
|
128
145
|
}
|
129
146
|
binary_element_wise_op.each do |function, operator|
|
130
147
|
define_method(function) do |other, opts: nil|
|
131
|
-
|
132
|
-
|
148
|
+
datum = exec_func_binary(function, other, options: opts)
|
149
|
+
Vector.new(datum.value)
|
133
150
|
end
|
134
151
|
|
135
152
|
define_method(operator) do |other, opts: nil|
|
136
|
-
|
137
|
-
|
153
|
+
datum = exec_func_binary(function, other, options: opts)
|
154
|
+
Vector.new(datum.value)
|
138
155
|
end
|
139
156
|
end
|
140
157
|
alias_method :eq, :equal
|
@@ -144,14 +161,20 @@ module RedAmber
|
|
144
161
|
alias_method :lt, :less
|
145
162
|
alias_method :ne, :not_equal
|
146
163
|
|
164
|
+
# Coercion hook used when this vector appears on the right-hand side of a
# binary operator.
#
# - Vector, Array or Arrow::Array of the same length: converted to a Vector
#   element by element.
# - anything else: treated as a scalar and repeated `size` times.
#
# @return [Array] a pair [Vector built from other, self].
# @raise [VectorArgumentError] when an array-like operand's length differs
#   from size.
def coerce(other)
  case other
  when Vector, Array, Arrow::Array
    raise VectorArgumentError, "Size unmatch: #{size} != #{other.length}" unless size == other.length

    [Vector.new(Array(other)), self]
  else
    # Without this `else` the array-like result above was discarded and the
    # operand was repeated `size` times like a scalar.
    [Vector.new(Array(other) * size), self]
  end
end
|
173
|
+
|
147
174
|
# (array functions)
|
148
|
-
#
|
149
|
-
# dictionary_encode, hash_all, hash_any, hash_approximate_median,
|
150
|
-
# hash_count, hash_count_distinct, hash_distinct, hash_max, hash_mean, hash_min,
|
151
|
-
# hash_min_max, hash_product, hash_stddev, hash_sum, hash_tdigest, hash_variance,
|
175
|
+
# dictionary_encode,
|
152
176
|
# partition_nth_indices,
|
153
|
-
# quarter, quarters_between,
|
154
|
-
# value_counts
|
177
|
+
# quarter, quarters_between,
|
155
178
|
|
156
179
|
# (strings)
|
157
180
|
# ascii_capitalize, ascii_center, ascii_is_alnum, ascii_is_alpha, ascii_is_decimal,
|
@@ -180,44 +203,56 @@ module RedAmber
|
|
180
203
|
# strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
|
181
204
|
|
182
205
|
# (conditional)
|
183
|
-
# case_when, cast,
|
206
|
+
# case_when, cast,
|
184
207
|
|
185
208
|
# (indices)
|
186
209
|
# choose, index_in, index_in_meta_binary, indices_nonzero
|
187
210
|
|
188
211
|
# (others)
|
189
|
-
# coalesce,
|
190
|
-
#
|
212
|
+
# coalesce,
|
213
|
+
# is_in_meta_binary,
|
191
214
|
# list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
|
192
|
-
# max_element_wise, min_element_wise, random,
|
193
|
-
#
|
215
|
+
# max_element_wise, min_element_wise, random, select_k_unstable,
|
216
|
+
# struct_field,
|
194
217
|
|
195
218
|
private # =======
|
196
219
|
|
197
220
|
# Look up the named function with find and execute it on this vector's
# data, passing options through. Returns the result of execute unchanged.
def exec_func_unary(function, options: nil)
  find(function).execute([data], options)
end
|
201
223
|
|
202
224
|
# Look up the named function with find and execute it on this vector's
# data and `other`, passing options through.
#
# A Vector operand contributes its data; Arrow arrays/scalars and plain
# Array, Numeric, String, true and false are passed as they are.
#
# @raise [VectorArgumentError] for any other operand class.
def exec_func_binary(function, other, options: nil)
  operand =
    case other
    when Vector
      other.data
    when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
      other
    else
      raise VectorArgumentError, "Operand is not supported: #{other.class}"
    end
  find(function).execute([data, operand], options)
end
|
213
234
|
|
214
|
-
def
|
215
|
-
output =
|
216
|
-
|
235
|
+
# Unwrap the value of an aggregation result into plain Ruby.
#
# - Arrow::StringScalar -> its to_s
# - Arrow::StructScalar -> Array of member values (string members via to_s)
# - anything else -> its value
def get_scalar(datum)
  scalar = datum.value
  return scalar.to_s if scalar.is_a?(Arrow::StringScalar)
  return scalar.value unless scalar.is_a?(Arrow::StructScalar)

  scalar.value.map do |member|
    member.is_a?(Arrow::StringScalar) ? member.to_s : member.value
  end
end
|
245
|
+
|
246
|
+
module_function # ======
|
247
|
+
|
248
|
+
# Look up an Arrow function by name via Arrow::Function.find.
def find(function_name)
  Arrow::Function.find(function_name)
end
|
218
251
|
|
219
|
-
|
220
|
-
|
252
|
+
# temporary API until RedAmber document prepared.
|
253
|
+
# Three-part text for an Arrow function: its string form, a dashed
# underline as long as function_name, and the description from its doc.
def arrow_doc(function_name)
  function = find(function_name)
  underline = '-' * function_name.size
  "#{function}\n#{underline}\n#{function.doc.description}"
end
|
222
257
|
end
|
223
258
|
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
|
4
|
+
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
5
|
+
|
6
|
+
module RedAmber
|
7
|
+
# mix-ins for class Vector
|
8
|
+
# Functions to select some data.
|
9
|
+
module VectorSelectable
|
10
|
+
# Return a new Vector built from the result of Arrow's :drop_null function
# applied to this vector's data.
def drop_nil
  Vector.new(find(:drop_null).execute([data]).value)
end
|
14
|
+
|
15
|
+
# vector calculation version of selection by indices
|
16
|
+
# TODO: support for option {boundscheck: true}
|
17
|
+
# Select elements by indices and return them as a sub Vector.
#
# Arguments are flattened. No index gives Vector.new([]). A single
# non-Numeric argument is used as it is; everything that is not already
# a Vector is wrapped in one before calling take_by_vector.
def take(*indices)
  selector = indices.flatten
  return Vector.new([]) if selector.empty?

  selector = selector[0] if selector.one? && !selector[0].is_a?(Numeric)
  selector = Vector.new(selector) unless selector.is_a?(Vector)
  take_by_vector(selector)
end
|
26
|
+
|
27
|
+
# TODO: support for option {null_selection_behavior: :drop}
|
28
|
+
# Select elements by a boolean mask and return them as a sub Vector.
#
# Arguments are flattened. No argument gives Vector.new([]). The mask is
# taken from a boolean Vector's data, from an Arrow::BooleanArray as it is,
# or built with Arrow::BooleanArray.new from true/false/nil values.
#
# @raise [VectorTypeError] when the argument is not boolean.
def filter(*booleans)
  booleans.flatten!
  return Vector.new([]) if booleans.empty?

  head = booleans[0]
  mask =
    if head.is_a?(Vector)
      raise VectorTypeError, 'Argument is not a boolean.' unless head.boolean?

      head.data
    elsif head.is_a?(Arrow::BooleanArray)
      head
    elsif booleans?(booleans)
      Arrow::BooleanArray.new(booleans)
    else
      raise VectorTypeError, 'Argument is not a boolean.'
    end

  filter_by_array(mask)
end
|
49
|
+
|
50
|
+
# @param indices
|
51
|
+
# @param booleans
|
52
|
+
# Select elements by indices or by booleans and return a sub Vector.
#
# Arguments are flattened; the first one decides the dispatch:
# - numeric Vector -> take_by_vector, boolean Vector -> filter_by_array
# - Arrow::BooleanArray -> filter_by_array
# - other Arrow::Array -> used as the selector array
# - Numeric / true / false / nil -> all arguments are converted with
#   Arrow::Array.new, then filtered (boolean result) or taken (numeric result)
#
# @param args indices or booleans
# @raise [VectorTypeError] for a Vector that is neither numeric nor boolean.
# @raise [VectorArgumentError] for any other unsupported argument.
def [](*args)
  args.flatten!
  return Vector.new([]) if args.empty?

  arg = args[0]
  case arg
  when Vector
    return take_by_vector(arg) if arg.numeric?
    return filter_by_array(arg.data) if arg.boolean?

    raise VectorTypeError, "Argument must be numeric or boolean: #{arg}"
  when Arrow::BooleanArray
    # Must be tested before the more general Arrow::Array below.
    return filter_by_array(arg)
  when Arrow::Array
    array = arg
  else
    unless arg.is_a?(Numeric) || booleans?([arg])
      raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
    end
  end
  # Plain Ruby values reach here with `array` unset; build it from all args.
  array ||= Arrow::Array.new(args)
  return filter_by_array(array) if array.is_a?(Arrow::BooleanArray)

  vector = Vector.new(array)
  return take_by_vector(vector) if vector.numeric?

  raise VectorArgumentError, "Invalid argument: #{args}"
end
|
80
|
+
|
81
|
+
# @param values [Array, Arrow::Array, Vector]
|
82
|
+
# Element-wise membership test against values; the result of data.is_in
# is wrapped in a Vector.
#
# @param values [Array, Arrow::Array, Vector] flattened first; a leading
#   Vector contributes its data, a leading Arrow::Array is used as it is,
#   otherwise an array of the same class as data is built from the values.
def is_in(*values)
  values.flatten!
  head = values[0]
  candidates =
    if head.is_a?(Vector)
      head.data
    elsif head.is_a?(Arrow::Array)
      head
    end
  candidates ||= data.class.new(values)
  Vector.new(data.is_in(candidates))
end
|
94
|
+
|
95
|
+
# Arrow's support required
|
96
|
+
# Position of the first occurrence of element, looked up on the Ruby Array
# returned by to_a.
def index(element)
  to_a.index(element)
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
# Accepts indices by numeric Vector
|
103
|
+
# Take elements at the positions given by a numeric Vector.
# Negative indices count from the tail.
#
# @raise [VectorTypeError] when indices is not numeric.
# @raise [VectorArgumentError] when an index is <= -size - 1 or, after
#   normalization, >= size.
def take_by_vector(indices)
  raise VectorTypeError, "Indices must be numeric Vector: #{indices}" unless indices.numeric?

  lowest = indices.min
  raise VectorArgumentError, "Index out of range: #{lowest}" if lowest <= -size - 1

  # Normalize indices counted from the tail.
  wrapped = (indices < 0).if_else(indices + size, indices)
  highest = wrapped.max
  raise VectorArgumentError, "Index out of range: #{highest}" if highest >= size

  # Round to an integer array.
  positions = Arrow::UInt64ArrayBuilder.build(wrapped.data)
  Vector.new(find(:array_take).execute([data, positions]).value)
end
|
115
|
+
|
116
|
+
# Accepts booleans by Arrow::BooleanArray
|
117
|
+
# Filter this vector's data with an Arrow::BooleanArray of the same length
# and wrap the result in a Vector.
#
# @raise [VectorArgumentError] when the lengths differ.
def filter_by_array(boolean_array)
  unless boolean_array.length == size
    raise VectorArgumentError, 'Booleans must be same size as self.'
  end

  Vector.new(find(:array_filter).execute([data, boolean_array]).value)
end
|
123
|
+
end
|
124
|
+
end
|