red_amber 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,35 +3,94 @@
3
3
  module RedAmber
4
4
  # mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
- # select columns: [symbol] or [string]
7
- # select rows: [array of index], [range]
6
+ # select variables: [symbol] or [string]
7
+ # select observations: [array of index], [range]
8
8
  def [](*args)
9
+ args.flatten!
9
10
  raise DataFrameArgumentError, 'Empty dataframe' if empty?
10
- raise DataFrameArgumentError, 'Empty argument' if args.empty?
11
-
12
- if args.one?
13
- case args[0]
14
- when Vector
15
- return select_obs_by_boolean(Arrow::BooleanArray.new(args[0].data))
16
- when Arrow::BooleanArray
17
- return select_obs_by_boolean(args[0])
18
- when Array
19
- return select_obs_by_boolean(Arrow::BooleanArray.new(args[0]))
20
-
21
- # when Hash
22
- # specify conditions to select by a Hash
23
- end
11
+ return remove_all_values if args.empty? || args[0].nil?
12
+
13
+ vector = parse_to_vector(args)
14
+ if vector.boolean?
15
+ return filter_by_vector(vector.data) if vector.size == size
16
+
17
+ raise DataFrameArgumentError, "Size is not match in booleans: #{args}"
18
+ end
19
+ return take_by_array(vector) if vector.numeric?
20
+ return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.type == :dictionary
21
+
22
+ raise DataFrameArgumentError, "Invalid argument: #{args}"
23
+ end
24
+
25
+ # slice and select some observations to create sub DataFrame
26
+ def slice(*args, &block)
27
+ slicer = args
28
+ if block
29
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
30
+
31
+ slicer = instance_eval(&block)
32
+ end
33
+ slicer = [slicer].flatten
34
+
35
+ raise DataFrameArgumentError, 'Empty dataframe' if empty?
36
+ return remove_all_values if slicer.empty? || slicer[0].nil?
37
+
38
+ vector = parse_to_vector(slicer)
39
+ if vector.boolean?
40
+ return filter_by_vector(vector.data) if vector.size == size
41
+
42
+ raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}"
24
43
  end
44
+ return take_by_array(vector) if vector.numeric?
45
+
46
+ raise DataFrameArgumentError, "Invalid argument #{slicer}"
47
+ end
48
+
49
+ # remove selected observations to create sub DataFrame
50
+ def remove(*args, &block)
51
+ remover = args
52
+ if block
53
+ raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
54
+
55
+ remover = instance_eval(&block)
56
+ end
57
+ remover = [remover].flatten
58
+
59
+ raise DataFrameArgumentError, 'Empty dataframe' if empty?
60
+ return self if remover.empty? || remover[0].nil?
61
+
62
+ vector = parse_to_vector(remover)
63
+ if vector.boolean?
64
+ return filter_by_vector(vector.primitive_invert.data) if vector.size == size
65
+
66
+ raise DataFrameArgumentError, "Size is not match in booleans: #{remover}"
67
+ end
68
+ if vector.numeric?
69
+ raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1
70
+
71
+ normalized_indices = (vector < 0).if_else(vector + size, vector) # normalize index from tail
72
+ if normalized_indices.max >= size
73
+ raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}"
74
+ end
25
75
 
26
- return select_obs_by_boolean(args) if booleans?(args)
76
+ normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array
77
+ return remove_all_values if normalized_indices == indices
78
+ return self if normalized_indices.empty?
27
79
 
28
- # expand Range like [1..3, 4] to [1, 2, 3, 4]
29
- expanded = expand_range(args)
30
- return map_indices(*expanded) if integers?(expanded)
31
- return select_vars_by_keys(expanded.map(&:to_sym)) if sym_or_str?(expanded)
80
+ index_array = indices - normalized_indices
32
81
 
33
- raise DataFrameArgumentError, "Invalid argument #{args}"
82
+ datum = Arrow::Function.find(:take).execute([table, index_array])
83
+ return DataFrame.new(datum.value)
84
+ end
85
+
86
+ raise DataFrameArgumentError, "Invalid argument #{remover}"
87
+ end
88
+
89
+ def remove_nil
90
+ func = Arrow::Function.find(:drop_null)
91
+ DataFrame.new(func.execute([table]).value)
34
92
  end
93
+ alias_method :drop_nil, :remove_nil
35
94
 
36
95
  # Select a variable by a key in String or Symbol
37
96
  def v(key)
@@ -43,24 +102,57 @@ module RedAmber
43
102
  variables[key.to_sym]
44
103
  end
45
104
 
46
- def head(n_rows = 5)
47
- raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
105
+ def head(n_obs = 5)
106
+ raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
48
107
 
49
- self[0...[n_rows, size].min]
108
+ self[0...[n_obs, size].min]
50
109
  end
51
110
 
52
- def tail(n_rows = 5)
53
- raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
111
+ def tail(n_obs = 5)
112
+ raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative?
54
113
 
55
- self[-[n_rows, size].min..]
114
+ self[-[n_obs, size].min..]
56
115
  end
57
116
 
58
- def first(n_rows = 1)
59
- head(n_rows)
117
+ def first(n_obs = 1)
118
+ head(n_obs)
60
119
  end
61
120
 
62
- def last(n_rows = 1)
63
- tail(n_rows)
121
+ def last(n_obs = 1)
122
+ tail(n_obs)
123
+ end
124
+
125
+ # Undocumented
126
+ # TODO: support for option {boundscheck: true}
127
+ def take(*indices)
128
+ indices.flatten!
129
+ return remove_all_values if indices.empty?
130
+
131
+ indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
132
+ indices = Vector.new(indices) unless indices.is_a?(Vector)
133
+
134
+ take_by_array(indices)
135
+ end
136
+
137
+ # Undocumented
138
+ # TODO: support for option {null_selection_behavior: :drop}
139
+ def filter(*booleans)
140
+ booleans.flatten!
141
+ return remove_all_values if booleans.empty?
142
+
143
+ b = booleans[0]
144
+ case b
145
+ when Vector
146
+ raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean?
147
+
148
+ filter_by_vector(b.data)
149
+ when Arrow::BooleanArray
150
+ filter_by_vector(b)
151
+ else
152
+ raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans)
153
+
154
+ filter_by_vector(Arrow::BooleanArray.new(booleans))
155
+ end
64
156
  end
65
157
 
66
158
  private
@@ -75,5 +167,32 @@ module RedAmber
75
167
  DataFrame.new(@table[keys])
76
168
  end
77
169
  end
170
+
171
+ # Accepts indices by numeric Vector
172
+ def take_by_array(indices)
173
+ raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric?
174
+ raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
175
+
176
+ normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
177
+ raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
178
+
179
+ index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
180
+
181
+ datum = Arrow::Function.find(:take).execute([table, index_array])
182
+ DataFrame.new(datum.value)
183
+ end
184
+
185
+ # Accepts booleans by Arrow::BooleanArray
186
+ def filter_by_vector(boolean_array)
187
+ raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
188
+
189
+ datum = Arrow::Function.find(:filter).execute([table, boolean_array])
190
+ DataFrame.new(datum.value)
191
+ end
192
+
193
+ # return a DataFrame with same keys as self without values
194
+ def remove_all_values
195
+ filter_by_vector(Arrow::BooleanArray.new([false] * size))
196
+ end
78
197
  end
79
198
  end
@@ -129,5 +129,9 @@ module RedAmber
129
129
  arrays << Arrow::ChunkedArray.new([a])
130
130
  end
131
131
  end
132
+
133
+ def keys_by_booleans(booleans)
134
+ keys.select.with_index { |_, i| booleans[i] }
135
+ end
132
136
  end
133
137
  end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-in of private helper methods shared by the classes DataFrame and Vector
5
+ module Helper
6
+ private
7
+
8
+ def pl(num)
9
+ num > 1 ? 's' : ''
10
+ end
11
+
12
+ def out_of_range?(indeces)
13
+ indeces.max >= size || indeces.min < -size
14
+ end
15
+
16
+ def integers?(enum)
17
+ enum.all?(Integer)
18
+ end
19
+
20
+ def sym_or_str?(enum)
21
+ enum.all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
22
+ end
23
+
24
+ def booleans?(enum)
25
+ enum.all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
26
+ end
27
+
28
+ def create_dataframe_from_vector(key, vector)
29
+ DataFrame.new(key => vector.data)
30
+ end
31
+
32
+ def parse_to_vector(args)
33
+ a = args.reduce([]) do |accum, elem|
34
+ accum.concat(normalize_element(elem))
35
+ end
36
+ Vector.new(a)
37
+ end
38
+
39
+ def normalize_element(elem)
40
+ case elem
41
+ when Numeric, String, Symbol, TrueClass, FalseClass, NilClass
42
+ [elem]
43
+ when Range
44
+ both_end = [elem.begin, elem.end]
45
+ both_end[1] -= 1 if elem.exclude_end? && elem.end.is_a?(Integer)
46
+
47
+ if both_end.any?(Integer) || both_end.all?(&:nil?)
48
+ if both_end.any? { |e| e&.>=(size) || e&.<(-size) }
49
+ raise DataFrameArgumentError, "Index out of range: #{elem} for 0..#{size - 1}"
50
+ end
51
+
52
+ (0...size).to_a[elem]
53
+ else
54
+ elem.to_a
55
+ end
56
+ else
57
+ Array(elem)
58
+ end
59
+ end
60
+ end
61
+ end
@@ -1,25 +1,37 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # Columnar data object
4
+ # Values in variable (columnar) data object
5
5
  # @data : holds Arrow::ChunkedArray
6
6
  class Vector
7
7
  # mix-in
8
- include VectorCompensable
9
8
  include VectorFunctions
9
+ include VectorUpdatable
10
+ include VectorSelectable
11
+ include Helper
10
12
 
11
- # chunked_array may come from column.data
12
- def initialize(array)
13
+ def initialize(*array)
13
14
  @key = nil # default is 'headless'
14
- case array
15
- when Vector
16
- @data = array.data
17
- when Arrow::Array, Arrow::ChunkedArray
18
- @data = array
19
- when Array
20
- @data = Arrow::Array.new(array)
15
+ if array.empty? || array[0].nil?
16
+ Vector.new([])
21
17
  else
22
- raise VectorArgumentError, 'Unknown array in argument'
18
+ array.flatten!
19
+ case array[0]
20
+ when Vector
21
+ @data = array[0].data
22
+ return
23
+ when Arrow::Array, Arrow::ChunkedArray
24
+ @data = array[0]
25
+ return
26
+ when Range
27
+ @data = Arrow::Array.new(Array(array[0]))
28
+ return
29
+ end
30
+ begin
31
+ @data = Arrow::Array.new(Array(array))
32
+ rescue Error
33
+ raise VectorArgumentError, "Invalid argument: #{array}"
34
+ end
23
35
  end
24
36
  end
25
37
 
@@ -52,6 +64,16 @@ module RedAmber
52
64
  alias_method :to_a, :values
53
65
  alias_method :entries, :values
54
66
 
67
+ def indices
68
+ (0...size).to_a
69
+ end
70
+ alias_method :indexes, :indices
71
+ alias_method :indeces, :indices
72
+
73
+ def to_ary
74
+ to_a
75
+ end
76
+
55
77
  def size
56
78
  # only defined :length in Arrow?
57
79
  @data.length
@@ -60,6 +82,10 @@ module RedAmber
60
82
  alias_method :n_rows, :size
61
83
  alias_method :nrow, :size
62
84
 
85
+ def empty?
86
+ size.zero?
87
+ end
88
+
63
89
  def type
64
90
  @data.value_type.nick.to_sym
65
91
  end
@@ -124,5 +150,9 @@ module RedAmber
124
150
  def n_nans
125
151
  numeric? ? is_nan.to_a.count(true) : 0
126
152
  end
153
+
154
+ def has_nil?
155
+ is_nil.any
156
+ end
127
157
  end
128
158
  end
@@ -16,11 +16,13 @@ module RedAmber
16
16
  unary_aggregations.each do |function|
17
17
  define_method(function) do |opts: nil|
18
18
  datum = exec_func_unary(function, options: opts)
19
- take_out_scalar(datum)
19
+ get_scalar(datum)
20
20
  end
21
21
  end
22
22
  alias_method :median, :approximate_median
23
23
  alias_method :count_uniq, :count_distinct
24
+ alias_method :all?, :all
25
+ alias_method :any?, :any
24
26
 
25
27
  def unbiased_variance
26
28
  variance(opts: { ddof: 1 })
@@ -47,7 +49,7 @@ module RedAmber
47
49
  unary_element_wise.each do |function|
48
50
  define_method(function) do |opts: nil|
49
51
  datum = exec_func_unary(function, options: opts)
50
- take_out_element_wise(datum)
52
+ Vector.new(datum.value)
51
53
  end
52
54
  end
53
55
  alias_method :is_nil, :is_null
@@ -72,12 +74,12 @@ module RedAmber
72
74
  unary_element_wise_op.each do |function, operator|
73
75
  define_method(function) do |opts: nil|
74
76
  datum = exec_func_unary(function, options: opts)
75
- take_out_element_wise(datum)
77
+ Vector.new(datum.value)
76
78
  end
77
79
 
78
80
  define_method(operator) do |opts: nil|
79
81
  datum = exec_func_unary(function, options: opts)
80
- take_out_element_wise(datum)
82
+ Vector.new(datum.value)
81
83
  end
82
84
  end
83
85
  alias_method :not, :invert
@@ -95,7 +97,7 @@ module RedAmber
95
97
  binary_element_wise.each do |function|
96
98
  define_method(function) do |other, opts: nil|
97
99
  datum = exec_func_binary(function, other, options: opts)
98
- take_out_element_wise(datum)
100
+ Vector.new(datum.value)
99
101
  end
100
102
  end
101
103
 
@@ -111,7 +113,7 @@ module RedAmber
111
113
  logical_binary_element_wise.each do |method, function|
112
114
  define_method(method) do |other, opts: nil|
113
115
  datum = exec_func_binary(function, other, options: opts)
114
- take_out_element_wise(datum)
116
+ Vector.new(datum.value)
115
117
  end
116
118
  end
117
119
 
@@ -144,12 +146,12 @@ module RedAmber
144
146
  binary_element_wise_op.each do |function, operator|
145
147
  define_method(function) do |other, opts: nil|
146
148
  datum = exec_func_binary(function, other, options: opts)
147
- take_out_element_wise(datum)
149
+ Vector.new(datum.value)
148
150
  end
149
151
 
150
152
  define_method(operator) do |other, opts: nil|
151
153
  datum = exec_func_binary(function, other, options: opts)
152
- take_out_element_wise(datum)
154
+ Vector.new(datum.value)
153
155
  end
154
156
  end
155
157
  alias_method :eq, :equal
@@ -159,8 +161,17 @@ module RedAmber
159
161
  alias_method :lt, :less
160
162
  alias_method :ne, :not_equal
161
163
 
164
+ def coerce(other)
165
+ case other
166
+ when Vector, Array, Arrow::Array
167
+ raise VectorArgumentError, "Size unmatch: #{size} != #{other.length}" unless size == other.length
168
+
169
+ [Vector.new(Array(other)), self]
170
+ end
171
+ [Vector.new(Array(other) * size), self]
172
+ end
173
+
162
174
  # (array functions)
163
- # array_filter, array_take
164
175
  # dictionary_encode,
165
176
  # partition_nth_indices,
166
177
  # quarter, quarters_between,
@@ -192,17 +203,17 @@ module RedAmber
192
203
  # strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
193
204
 
194
205
  # (conditional)
195
- # case_when, cast, if_else
206
+ # case_when, cast,
196
207
 
197
208
  # (indices)
198
209
  # choose, index_in, index_in_meta_binary, indices_nonzero
199
210
 
200
211
  # (others)
201
- # coalesce, drop_null,
202
- # filter, is_in, is_in_meta_binary,
212
+ # coalesce,
213
+ # is_in_meta_binary,
203
214
  # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
204
215
  # max_element_wise, min_element_wise, random, select_k_unstable,
205
- # sort_indices, struct_field, take
216
+ # struct_field,
206
217
 
207
218
  private # =======
208
219
 
@@ -221,7 +232,7 @@ module RedAmber
221
232
  end
222
233
  end
223
234
 
224
- def take_out_scalar(datum)
235
+ def get_scalar(datum)
225
236
  output = datum.value
226
237
  case output
227
238
  when Arrow::StringScalar then output.to_s
@@ -232,10 +243,6 @@ module RedAmber
232
243
  end
233
244
  end
234
245
 
235
- def take_out_element_wise(datum)
236
- Vector.new(datum.value)
237
- end
238
-
239
246
  module_function # ======
240
247
 
241
248
  def find(function_name)
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
4
+ # reference: https://arrow.apache.org/docs/cpp/compute.html
5
+
6
+ module RedAmber
7
+ # mix-ins for class Vector
8
+ # Functions to select some data.
9
+ module VectorSelectable
10
+ def drop_nil
11
+ datum = find(:drop_null).execute([data])
12
+ Vector.new(datum.value)
13
+ end
14
+
15
+ # vector calculation version of selection by indices
16
+ # TODO: support for option {boundscheck: true}
17
+ def take(*indices)
18
+ indices.flatten!
19
+ return Vector.new([]) if indices.empty?
20
+
21
+ indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
22
+ indices = Vector.new(indices) unless indices.is_a?(Vector)
23
+
24
+ take_by_vector(indices) # returns sub Vector
25
+ end
26
+
27
+ # TODO: support for option {null_selection_behavior: :drop}
28
+ def filter(*booleans)
29
+ booleans.flatten!
30
+ return Vector.new([]) if booleans.empty?
31
+
32
+ b = booleans[0]
33
+ boolean_array =
34
+ case b
35
+ when Vector
36
+ raise VectorTypeError, 'Argument is not a boolean.' unless b.boolean?
37
+
38
+ b.data
39
+ when Arrow::BooleanArray
40
+ b
41
+ else
42
+ raise VectorTypeError, 'Argument is not a boolean.' unless booleans?(booleans)
43
+
44
+ Arrow::BooleanArray.new(booleans)
45
+ end
46
+
47
+ filter_by_array(boolean_array) # returns sub Vector
48
+ end
49
+
50
+ # @param args [Array<Numeric, true, false, nil>, Vector, Arrow::Array]
51
+ #   numeric indices to take, or booleans to filter by
52
+ def [](*args)
53
+ args.flatten!
54
+ return Vector.new([]) if args.empty?
55
+
56
+ arg = args[0]
57
+ case arg
58
+ when Vector
59
+ return take_by_vector(arg) if arg.numeric?
60
+ return filter_by_array(arg.data) if arg.boolean?
61
+
62
+ raise VectorTypeError, "Argument must be numeric or boolean: #{arg}"
63
+ when Arrow::BooleanArray
64
+ return filter_by_array(arg)
65
+ when Arrow::Array
66
+ array = arg
67
+ else
68
+ unless arg.is_a?(Numeric) || booleans?([arg])
69
+ raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
70
+ end
71
+ end
72
+ array ||= Arrow::Array.new(args)
73
+ return filter_by_array(array) if array.is_a?(Arrow::BooleanArray)
74
+
75
+ vector = Vector.new(array)
76
+ return take_by_vector(vector) if vector.numeric?
77
+
78
+ raise VectorArgumentError, "Invalid argument: #{args}"
79
+ end
80
+
81
+ # @param values [Array, Arrow::Array, Vector]
82
+ def is_in(*values)
83
+ values.flatten!
84
+ array =
85
+ case values[0]
86
+ when Vector
87
+ values[0].data
88
+ when Arrow::Array
89
+ values[0]
90
+ end
91
+ array ||= data.class.new(values)
92
+ Vector.new(data.is_in(array))
93
+ end
94
+
95
+ # Arrow's support required
96
+ def index(element)
97
+ to_a.index(element)
98
+ end
99
+
100
+ private
101
+
102
+ # Accepts indices by numeric Vector
103
+ def take_by_vector(indices)
104
+ raise VectorTypeError, "Indices must be numeric Vector: #{indices}" unless indices.numeric?
105
+ raise VectorArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
106
+
107
+ normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
108
+ raise VectorArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
109
+
110
+ index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
111
+
112
+ datum = find(:array_take).execute([data, index_array])
113
+ Vector.new(datum.value)
114
+ end
115
+
116
+ # Accepts booleans by Arrow::BooleanArray
117
+ def filter_by_array(boolean_array)
118
+ raise VectorArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
119
+
120
+ datum = find(:array_filter).execute([data, boolean_array])
121
+ Vector.new(datum.value)
122
+ end
123
+ end
124
+ end