red_amber 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/doc/Vector.md CHANGED
@@ -33,7 +33,9 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
33
33
 
34
34
  ### `type`
35
35
 
36
- ### `data_type`
36
+ ### `boolean?`, `numeric?`, `string?`, `temporal?`
37
+
38
+ ### `type_class`
37
39
 
38
40
  ### [ ] `each` (not implemented yet)
39
41
 
@@ -43,8 +45,6 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
43
45
 
44
46
  ### [ ] `each_chunk` (not implemented yet)
45
47
 
46
- ### `tally`
47
-
48
48
  ### `n_nils`, `n_nans`
49
49
 
50
50
  - `n_nulls` is an alias of `n_nils`
@@ -126,20 +126,23 @@ boolean.all(opts: {skip_nulls: false}) #=> false
126
126
  |[ ]`asin` | | [ ] | | | |
127
127
  | ✓ `atan` | | ✓ | | | |
128
128
  | ✓ `bit_wise_not`| | (✓) | | |integer only|
129
- |[ ]`ceil` | | ✓ | | | |
129
+ | `ceil` | | ✓ | | | |
130
130
  | ✓ `cos` | | ✓ | | | |
131
- |[ ]`floor` | | | | | |
131
+ | ✓`fill_nil_backward`| ✓ || | | |
132
+ | ✓`fill_nil_forward` | ✓ | ✓ | ✓ | | |
133
+ | ✓ `floor` | | ✓ | | | |
132
134
  | ✓ `invert` | ✓ | | | |`!`, alias `not`|
133
135
  |[ ]`ln` | | [ ] | | | |
134
136
  |[ ]`log10` | | [ ] | | | |
135
137
  |[ ]`log1p` | | [ ] | | | |
136
138
  |[ ]`log2` | | [ ] | | | |
137
- |[ ]`round` | | [ ] | |[ ] Round| |
138
- |[ ]`round_to_multiple`| | [ ] | |[ ] RoundToMultiple| |
139
+ | `round` | || | Round (:mode, :n_digits)| |
140
+ | `round_to_multiple`| | | | RoundToMultiple :mode, :multiple| multiple must be an Arrow::Scalar|
139
141
  | ✓ `sign` | | ✓ | | | |
140
142
  | ✓ `sin` | | ✓ | | | |
143
+ | ✓`sort_indexes`| ✓ | ✓ | ✓ |:order|alias `sort_indices`|
141
144
  | ✓ `tan` | | ✓ | | | |
142
- |[ ]`trunc` | | ✓ | | | |
145
+ | `trunc` | | ✓ | | | |
143
146
 
144
147
  ### Binary element-wise: `vector.func(vector) => vector`
145
148
 
@@ -180,8 +183,30 @@ boolean.all(opts: {skip_nulls: false}) #=> false
180
183
  | ✓ `shift_right` | | (✓) | | |`>>`, integer only|
181
184
  | ✓ `xor` | ✓ | | | | `^` |
182
185
 
186
+ ### `uniq`
187
+
188
+ Returns a new array with distinct elements.
189
+
183
190
  (Not implemented functions)
184
- ### [ ] sort, sort_index
191
+
192
+ ### `tally` and `value_counts`
193
+
194
+ Compute counts of unique elements and return a Hash.
195
+
196
+ It returns almost the same result as Ruby's `tally`, except that these methods treat all NaNs as the same value.
197
+
198
+ ```ruby
199
+ array = [0.0/0, Float::NAN]
200
+ array.tally #=> {NaN=>1, NaN=>1}
201
+
202
+ vector = RedAmber::Vector.new(array)
203
+ vector.tally #=> {NaN=>2}
204
+ vector.value_counts #=> {NaN=>2}
205
+ ```
206
+
207
+ ### `sort_indexes`, `sort_indices`, `array_sort_indices`
208
+
209
+ ### [ ] `sort`, `sort_by`
185
210
  ### [ ] argmin, argmax
186
211
  ### [ ] (array functions)
187
212
  ### [ ] (strings functions)
@@ -192,4 +217,101 @@ boolean.all(opts: {skip_nulls: false}) #=> false
192
217
 
193
218
  ## Coerce (not implemented)
194
219
 
195
- ## Updating (not impremented)
220
+ ## Update vector's value
221
+ ### `replace_with(booleans, replacements)` => vector
222
+
223
+ - Accepts Vector, Array, Arrow::Array for booleans and replacements.
224
+ - Replacements can accept scalar
225
+ - Booleans specifies the positions to replace: elements at `true` positions are replaced.
226
+ - Replacements specifies the values to replace with.
227
+ - The number of true in booleans must be equal to the length of replacement
228
+
229
+ ```ruby
230
+ vector = RedAmber::Vector.new([1, 2, 3])
231
+ booleans = [true, false, true]
232
+ replacemants = [4, 5]
233
+ vector.replace_with(booleans, replacemants)
234
+ # =>
235
+ #<RedAmber::Vector(:uint8, size=3):0x000000000001ee10>
236
+ [4, 2, 5]
237
+ ```
238
+
239
+ - Scalar value in replacements can be broadcasted.
240
+
241
+ ```ruby
242
+ replacement = 0
243
+ vector.replace_with(booleans, replacement)
244
+ # =>
245
+ #<RedAmber::Vector(:uint8, size=3):0x000000000001ee10>
246
+ [0, 2, 0]
247
+ ```
248
+
249
+ - Returned data type is automatically up-casted by replacement.
250
+
251
+ ```ruby
252
+ replacement = 1.0
253
+ vector.replace_with(booleans, replacement)
254
+ # =>
255
+ #<RedAmber::Vector(:double, size=3):0x0000000000025d78>
256
+ [1.0, 2.0, 1.0]
257
+ ```
258
+
259
+ - Position of nil in booleans is replaced with nil.
260
+
261
+ ```ruby
262
+ booleans = [true, false, nil]
263
+ replacement = -1
264
+ vector.replace_with(booleans, replacement)
265
+ # =>
266
+ #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
267
+ [-1, 2, nil]
268
+ ```
269
+
270
+ - Replacements can contain nil.
271
+
272
+ ```ruby
273
+ booleans = [true, false, true]
274
+ replacemants = [nil]
275
+ vec.replace_with(booleans, replacemants)
276
+ # =>
277
+ #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
278
+ [nil, 2, nil]
279
+ ```
280
+
281
+ - If no replacements are specified, it is the same as specifying nil.
282
+
283
+ ```ruby
284
+ booleans = [true, false, true]
285
+ vec.replace_with(booleans)
286
+ # =>
287
+ #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
288
+ [nil, 2, nil]
289
+ ```
290
+
291
+ - An example of replacing 'NA' with nil.
292
+
293
+ ```ruby
294
+ vector = RedAmber::Vector.new(['A', 'B', 'NA'])
295
+ vector.replace_with(vector == 'NA', nil)
296
+ # =>
297
+ #<RedAmber::Vector(:string, size=3):0x000000000000f8ac>
298
+ ["A", "B", nil]
299
+ ```
300
+
301
+ ### `fill_nil_forward`, `fill_nil_backward` => vector
302
+
303
+ Propagate the last valid observation forward (or backward).
304
+ A nil is preserved when there is no valid value before it (forward) or after it (backward).
305
+
306
+ ```ruby
307
+ integer = RedAmber::Vector.new([0, 1, nil, 3, nil])
308
+ integer.fill_nil_forward
309
+ # =>
310
+ #<RedAmber::Vector(:uint8, size=5):0x000000000000f960>
311
+ [0, 1, 1, 3, 3]
312
+
313
+ integer.fill_nil_backward
314
+ # =>
315
+ #<RedAmber::Vector(:uint8, size=5):0x000000000000f974>
316
+ [0, 1, 3, 3, nil]
317
+ ```
Binary file
data/doc/tdr.md CHANGED
@@ -36,17 +36,20 @@ The API based on TDR is draft and RedAmber is a small experiment to test the TDR
36
36
  | |Basic Table|Transposed DataFrame|Comment for TDR|
37
37
  |-----------|---------|------------|---|
38
38
  |name in TDR|`Table`|`TDR`|**T**ransposed **D**ataFrame **R**epresentation|
39
- |variable |located in a column|a key and a `Vector` in lateral|select by key|
40
- |observation|located in a row|intersection in a vertical axis|select by index|
41
- |number of rows|n_rows etc. |`size` |`n_row` is available as an alias|
42
- |number of columns|n_columns etc. |`n_keys` |`n_col` is available as an alias|
43
- |shape |[n_rows, n_columns] |`[size, n_keys]` |same order as Table|
44
- |merge/join left| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |naturally join from bottom|
45
- |merge/join right| right_join(a,b))<br>merge(a, b, how='right')|`b.join(a)` |naturally join from bottom|
46
-
47
- ## Operation example with TDR API
48
-
49
- [Operation example with TDR API](TDR_operation.pdf) (draft)
39
+ |variable |located in a column|a key and a `Vector` in lateral|select by keys|
40
+ |observation|located in a row|sliced in vertical|select by indices|
41
+ |number of variables|n_columns etc. |`n_keys` |`n_cols` is available as an alias|
42
+ |number of observations|n_rows etc. |`size` |`n_rows` is available as an alias|
43
+ |shape |[n_rows, n_columns] |`shape`=`[size, n_keys]` |same order as Table|
44
+ |Select variables|select, filter, [ ], etc.|`pick` or `[keys]` |accepts arguments or a block|
45
+ |Reject variables|drop, etc.|`drop` |accepts arguments or a block|
46
+ |Select observations|slice, [ ], iloc, etc.|`slice` or `[indices]` |accepts arguments or a block|
47
+ |Reject observations|drop, etc.|`remove` |accepts arguments or a block|
48
+ |Add variables|mutate, assign, etc.|`assign` |accepts arguments or a block|
49
+ |update variables|transmute, [ ]=, etc.|`assign` |accepts arguments or a block|
50
+ |inner join| inner_join(a,b)<br>merge(a, b, how='inner')|`a.inner_join(b)` |with an option `on:`|
51
+ |left join| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |naturally join from bottom<br>with an option `on:`|
52
+ |right join| right_join(a,b)<br>merge(a, b, how='right')|`b.join(a)` |naturally join from bottom<br>with an option `on:`|
50
53
 
51
54
  ## Q and A for TDR
52
55
 
data/doc/tdr_ja.md CHANGED
@@ -37,16 +37,19 @@ TDR に基づいた API はまだ暫定板の段階であり、RedAmber は TDR
37
37
  |-----------|---------|------------|---|
38
38
  |TDRでの呼称|`Table`|`TDR`|**T**ransposed **D**ataFrame **R**epresentationの略|
39
39
  |変数 |列に配置|`variables`<br>key と `Vector` として横方向に配置|key で選択|
40
- |観測 |行に配置|`observations`<br>縦方向に切った一つ一つは`slice`|index や `slice` メソッドで選択|
41
- |行の数|nrow, n_rows など |`size` |`n_row` をエイリアスとして設定|
42
- |列の数|ncol, n_columns など |`n_keys` |`n_col` をエイリアスとして設定|
43
- |shape |[nrow, ncol] |`[size, n_keys]` |行, 列の順番は同じ|
44
- |merge/join left| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |自然に下にくっつける|
45
- |merge/join right| right_join(a,b))<br>merge(a, b, how='right')|`b.join(a)` |自然に下にくっつける|
46
-
47
- ## Operation example with TDR API
48
-
49
- [TDR の操作例](TDR_operation.pdf) (暫定版)
40
+ |観測 |行に配置|`observations`<br>縦方向に切った一つ一つはslice|index や `slice` メソッドで選択|
41
+ |変数(列)の数|ncol, n_columns など |`n_keys` |`n_cols` をエイリアスとして設定|
42
+ |観測(行)の数|nrow, n_rows など |`size` |`n_rows` をエイリアスとして設定|
43
+ |形状 |[nrow, ncol] |`shape`=`[size, n_keys]` |行, 列の順番は同じ|
44
+ |変数(列)の選択|select, filter, [ ], など|`pick` or `[keys]` |引数またはブロックで指定|
45
+ |変数(列)の削除|drop, など|`drop` |引数またはブロックで指定|
46
+ |観測(行)の選択|slice, [ ], iloc, など|`slice` or `[indices]` |引数またはブロックで指定|
47
+ |観測(行)の削除|drop, など|`remove` |引数またはブロックで指定|
48
+ |変数(列)の追加|mutate, assign, など|`assign` |引数またはブロックで指定|
49
+ |変数(列)の更新|transmute, [ ]=, など|`assign` |引数またはブロックで指定|
50
+ |内部結合| inner_join(a,b)<br>merge(a, b, how='inner')|`a.inner_join(b)` |オプション on:|
51
+ |左結合| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |自然に下にくっつける<br>オプション on:|
52
+ |右結合| right_join(a,b)<br>merge(a, b, how='right')|`b.join(a)` |自然に下にくっつける<br>オプション on:|
50
53
 
51
54
  ## Q and A for TDR
52
55
 
@@ -7,20 +7,21 @@ module RedAmber
7
7
  # mix-in
8
8
  include DataFrameDisplayable
9
9
  include DataFrameHelper
10
+ include DataFrameIndexable
10
11
  include DataFrameSelectable
11
12
  include DataFrameObservationOperation
12
13
  include DataFrameVariableOperation
13
14
 
14
15
  def initialize(*args)
15
- # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
16
- # returns empty DataFrame
17
- @table = Arrow::Table.new({}, [])
16
+ @variables = @keys = @vectors = @types = @data_types = nil
18
17
  # bug in gobject-introspection: ruby-gnome/ruby-gnome#1472
19
18
  # [Arrow::Table] == [nil] shows ArgumentError
20
19
  # temporary use yoda condition to workaround
21
- return if args.empty? || args == [[]] || args == [{}] || [nil] == args
22
-
23
- if args.size > 1
20
+ if args.empty? || args == [[]] || args == [{}] || [nil] == args
21
+ # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
22
+ # returns empty DataFrame
23
+ @table = Arrow::Table.new({}, [])
24
+ elsif args.size > 1
24
25
  @table = Arrow::Table.new(*args)
25
26
  else
26
27
  arg = args[0]
@@ -42,11 +43,14 @@ module RedAmber
42
43
 
43
44
  attr_reader :table
44
45
 
46
+ def to_arrow
47
+ table
48
+ end
49
+
45
50
  def save(output, options = {})
46
51
  @table.save(output, options)
47
52
  end
48
53
 
49
- # Properties ===
50
54
  def size
51
55
  @table.n_rows
52
56
  end
@@ -63,8 +67,13 @@ module RedAmber
63
67
  [size, n_keys]
64
68
  end
65
69
 
70
+ def variables
71
+ @variables || @variables = init_instance_vars(:variables)
72
+ end
73
+ alias_method :vars, :variables
74
+
66
75
  def keys
67
- @table.columns.map { |column| column.name.to_sym }
76
+ @keys || @keys = init_instance_vars(:keys)
68
77
  end
69
78
  alias_method :column_names, :keys
70
79
  alias_method :var_names, :keys
@@ -81,21 +90,15 @@ module RedAmber
81
90
  alias_method :index, :key_index
82
91
 
83
92
  def types
84
- @table.columns.map do |column|
85
- column.data.value_type.nick.to_sym
86
- end
93
+ @types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
87
94
  end
88
95
 
89
- def data_types
90
- @table.columns.map do |column|
91
- column.data_type.class
92
- end
96
+ def type_classes
97
+ @data_types || @data_types = @table.columns.map { |column| column.data_type.class }
93
98
  end
94
99
 
95
100
  def vectors
96
- @table.columns.map do |column|
97
- Vector.new(column.data)
98
- end
101
+ @vectors || @vectors = init_instance_vars(:vectors)
99
102
  end
100
103
 
101
104
  def indexes
@@ -104,9 +107,7 @@ module RedAmber
104
107
  alias_method :indices, :indexes
105
108
 
106
109
  def to_h
107
- @table.columns.each_with_object({}) do |column, result|
108
- result[column.name.to_sym] = column.entries
109
- end
110
+ variables.transform_values(&:to_a)
110
111
  end
111
112
 
112
113
  def to_a
@@ -125,13 +126,27 @@ module RedAmber
125
126
  end
126
127
 
127
128
  def empty?
128
- @table.columns.empty?
129
+ variables.empty?
129
130
  end
130
131
 
131
132
  def to_rover
132
133
  Rover::DataFrame.new(to_h)
133
134
  end
134
135
 
135
- # def to_parquet() end
136
+ private
137
+
138
+ # initialize @variables, @keys, @vectors and return one of them
139
+ def init_instance_vars(var)
140
+ ary = @table.columns.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
141
+ v = Vector.new(column.data)
142
+ k = column.name.to_sym
143
+ v.key = k
144
+ variables[k] = v
145
+ keys << k
146
+ vectors << v
147
+ end
148
+ @variables, @keys, @vectors = ary
149
+ ary[%i[variables keys vectors].index(var)]
150
+ end
136
151
  end
137
152
  end
@@ -73,7 +73,7 @@ module RedAmber
73
73
  [shorthand(vector, size, max_element)].concat na_string(vector)
74
74
  end
75
75
  else
76
- shorthand(vector, size, max_element)
76
+ [shorthand(vector, size, max_element)]
77
77
  end
78
78
  sio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
79
79
  end
@@ -111,9 +111,10 @@ module RedAmber
111
111
  end
112
112
 
113
113
  def shorthand(vector, size, max_element)
114
- a = vector.to_a.take(max_element)
114
+ max = vector.temporal? ? 2 : max_element
115
+ a = vector.to_a.take(max)
115
116
  a.map! { |e| e.nil? ? 'nil' : e.inspect }
116
- a << '... ' if size > max_element
117
+ a << '... ' if size > max
117
118
  "[#{a.join(', ')}]"
118
119
  end
119
120
 
@@ -6,9 +6,16 @@ module RedAmber
6
6
  private
7
7
 
8
8
  def expand_range(args)
9
- args.each_with_object([]) do |e, a|
9
+ ary = args.each_with_object([]) do |e, a|
10
10
  e.is_a?(Range) ? a.concat(normalized_array(e)) : a.append(e)
11
11
  end
12
+ ary.map do |e|
13
+ if e.is_a?(Integer) && e.negative?
14
+ e + size
15
+ else
16
+ e
17
+ end
18
+ end
12
19
  end
13
20
 
14
21
  def normalized_array(range)
@@ -50,13 +57,6 @@ module RedAmber
50
57
  DataFrame.new(@table.filter(array))
51
58
  end
52
59
 
53
- def select_obs_by_indeces(indeces)
54
- out_of_range?(indeces) && raise(DataFrameArgumentError, "Invalid index: #{indeces} for 0..#{size - 1}")
55
-
56
- a = indeces.map { |i| @table.slice(i).to_a }
57
- DataFrame.new(@table.schema, a)
58
- end
59
-
60
60
  def keys_by_booleans(booleans)
61
61
  keys.select.with_index { |_, i| booleans[i] }
62
62
  end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-ins for the class DataFrame
5
+ module DataFrameIndexable
6
+ # Common method
7
+ def map_indices(*indices)
8
+ return self if indices.empty?
9
+
10
+ indices = indices[0].data if indices[0].is_a?(Vector)
11
+
12
+ new_dataframe_by(indices)
13
+ end
14
+
15
+ # @param sort_keys [Arrow::SortKey]
16
+ # :key, "key" or "+key" denotes ascending,
17
+ # "-key" denotes descending order
18
+ # @return [RedAmber::Vector] Sorted indices in Vector
19
+ def sort_indices(*sort_keys)
20
+ indices = @table.sort_indices(sort_keys.flatten)
21
+ Vector.new(indices)
22
+ end
23
+
24
+ # @return [RedAmber::DataFrame] Sorted DataFrame
25
+ def sort(*sort_keys)
26
+ indices = @table.sort_indices(sort_keys.flatten)
27
+
28
+ new_dataframe_by(indices)
29
+ end
30
+
31
+ private
32
+
33
+ def new_dataframe_by(index_array)
34
+ t = Arrow::Function.find(:take).execute([@table, index_array]).value
35
+ RedAmber::DataFrame.new(t)
36
+ end
37
+ end
38
+ end
@@ -25,7 +25,7 @@ module RedAmber
25
25
 
26
26
  # filter with indexes
27
27
  slicer = expand_range(slicer)
28
- return select_obs_by_indeces(slicer) if integers?(slicer)
28
+ return map_indices(*slicer) if integers?(slicer)
29
29
 
30
30
  raise DataFrameArgumentError, "Invalid argument #{args}"
31
31
  end
@@ -57,11 +57,22 @@ module RedAmber
57
57
  # filter with indexes
58
58
  slicer = indexes.to_a - expand_range(remover)
59
59
  return remove_all_values if slicer.empty?
60
- return select_obs_by_indeces(slicer) if integers?(slicer)
60
+ return map_indices(*slicer) if integers?(slicer)
61
61
 
62
62
  raise DataFrameArgumentError, "Invalid argument #{args}"
63
63
  end
64
64
 
65
+ def remove_nil
66
+ func = Arrow::Function.find(:drop_null)
67
+ DataFrame.new(func.execute([table]).value)
68
+ end
69
+ alias_method :drop_nil, :remove_nil
70
+
71
+ def group(aggregating_keys, func, target_keys)
72
+ t = table.group(*aggregating_keys)
73
+ RedAmber::DataFrame.new(t.send(func, *target_keys))
74
+ end
75
+
65
76
  private
66
77
 
67
78
  # return a DataFrame with same keys as self without values
@@ -27,12 +27,22 @@ module RedAmber
27
27
 
28
28
  # expand Range like [1..3, 4] to [1, 2, 3, 4]
29
29
  expanded = expand_range(args)
30
- return select_obs_by_indeces(expanded) if integers?(expanded)
30
+ return map_indices(*expanded) if integers?(expanded)
31
31
  return select_vars_by_keys(expanded.map(&:to_sym)) if sym_or_str?(expanded)
32
32
 
33
33
  raise DataFrameArgumentError, "Invalid argument #{args}"
34
34
  end
35
35
 
36
+ # Select a variable by a key in String or Symbol
37
+ def v(key)
38
+ unless key.is_a?(Symbol) || key.is_a?(String)
39
+ raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
40
+ end
41
+ raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
42
+
43
+ variables[key.to_sym]
44
+ end
45
+
36
46
  def head(n_rows = 5)
37
47
  raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
38
48
 
@@ -57,10 +67,10 @@ module RedAmber
57
67
 
58
68
  def select_vars_by_keys(keys)
59
69
  if keys.one?
60
- t = @table[*keys]
61
- raise DataFrameArgumentError, "Key does not exist #{keys}" unless t
70
+ key = keys[0].to_sym
71
+ raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
62
72
 
63
- Vector.new(t.data)
73
+ variables[key]
64
74
  else
65
75
  DataFrame.new(@table[keys])
66
76
  end
@@ -5,10 +5,12 @@ module RedAmber
5
5
  # @data : holds Arrow::ChunkedArray
6
6
  class Vector
7
7
  # mix-in
8
+ include VectorCompensable
8
9
  include VectorFunctions
9
10
 
10
11
  # chunked_array may come from column.data
11
12
  def initialize(array)
13
+ @key = nil # default is 'headless'
12
14
  case array
13
15
  when Vector
14
16
  @data = array.data
@@ -17,11 +19,12 @@ module RedAmber
17
19
  when Array
18
20
  @data = Arrow::Array.new(array)
19
21
  else
20
- raise ArgumentError, 'Unknown array in argument'
22
+ raise VectorArgumentError, 'Unknown array in argument'
21
23
  end
22
24
  end
23
25
 
24
26
  attr_reader :data
27
+ attr_accessor :key
25
28
 
26
29
  def to_s
27
30
  @data.to_a.inspect
@@ -66,15 +69,19 @@ module RedAmber
66
69
  end
67
70
 
68
71
  def numeric?
69
- %i[int8 uint8 int16 uint16 int32 uint32 int64 uint64 float double].member? type
72
+ type_class < Arrow::NumericDataType
70
73
  end
71
74
 
72
75
  def string?
73
76
  type == :string
74
77
  end
75
78
 
76
- def data_type
77
- @data.value_type
79
+ def temporal?
80
+ type_class < Arrow::TemporalDataType
81
+ end
82
+
83
+ def type_class
84
+ @data.value_data_type.class
78
85
  end
79
86
 
80
87
  # def each() end
@@ -90,7 +97,23 @@ module RedAmber
90
97
  # def each_chunk() end
91
98
 
92
99
  def tally
93
- values.tally
100
+ hash = values.tally
101
+ if (type_class < Arrow::FloatingPointDataType) && is_nan.any
102
+ a = 0
103
+ hash.each do |key, value|
104
+ if key.is_a?(Float) && key.nan?
105
+ hash.delete(key)
106
+ a += value
107
+ end
108
+ end
109
+ hash[Float::NAN] = a
110
+ end
111
+ hash
112
+ end
113
+
114
+ def value_counts
115
+ values, counts = Arrow::Function.find(:value_counts).execute([data]).value.fields
116
+ values.zip(counts).to_h
94
117
  end
95
118
 
96
119
  def n_nulls
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
4
+ # reference: https://arrow.apache.org/docs/cpp/compute.html
5
+
6
+ module RedAmber
7
+ # mix-ins for class Vector
8
+ # Functions to make up some data (especially missing) for new data.
9
+ module VectorCompensable
10
+ # [Ternary]: replace_with(booleans, replacements) => vector
11
+ # Replace items selected with a boolean mask
12
+ #
13
+ # (from Arrow C++ inline doc.)
14
+ # Given an array and a boolean mask (either scalar or of equal length),
15
+ # along with replacement values (either scalar or array),
16
+ # each element of the array for which the corresponding mask element is
17
+ # true will be replaced by the next value from the replacements,
18
+ # or with null if the mask is null.
19
+ # Hence, for replacement arrays, len(replacements) == sum(mask == true).
20
+
21
+ def replace_with(booleans, replacements = nil)
22
+ specifier =
23
+ if booleans.is_a?(Arrow::BooleanArray)
24
+ booleans
25
+ elsif booleans.is_a?(Vector) && booleans.boolean?
26
+ booleans.data
27
+ elsif booleans.is_a?(Array) && booleans?(booleans)
28
+ Arrow::BooleanArray.new(booleans)
29
+ else
30
+ raise VectorTypeError, 'Not a valid type'
31
+ end
32
+ raise VectorArgumentError, 'Booleans size unmatch' if specifier.length != size
33
+ raise VectorArgumentError, 'Booleans not have any `true`' unless specifier.any?
34
+
35
+ r = Array(replacements) # scalar to [scalar]
36
+ r = [nil] if r.empty?
37
+
38
+ replacer =
39
+ if r.size == 1
40
+ case replacements
41
+ when Arrow::Array then replacements
42
+ when Vector then replacements.data
43
+ else
44
+ Arrow::Array.new(r * specifier.to_a.count(true)) # broadcast
45
+ end
46
+ else
47
+ Arrow::Array.new(r)
48
+ end
49
+ replacer = data.class.new(replacer) if replacer.uniq == [nil]
50
+
51
+ raise VectorArgumentError, 'Replacements size unmatch' if Array(specifier).count(true) != replacer.length
52
+
53
+ values = replacer.class.new(data)
54
+
55
+ datum = find('replace_with_mask').execute([values, specifier, replacer])
56
+ take_out_element_wise(datum)
57
+ end
58
+
59
+ # (related functions)
60
+ # fill_null_backward, fill_null_forward
61
+
62
+ private
63
+
64
+ def booleans?(enum)
65
+ enum.all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
66
+ end
67
+ end
68
+ end