red_amber 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
data/doc/Vector.md CHANGED
@@ -33,7 +33,9 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
33
33
 
34
34
  ### `type`
35
35
 
36
- ### `data_type`
36
+ ### `boolean?`, `numeric?`, `string?`, `temporal?`
37
+
38
+ ### `type_class`
37
39
 
38
40
  ### [ ] `each` (not implemented yet)
39
41
 
@@ -43,8 +45,6 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
43
45
 
44
46
  ### [ ] `each_chunk` (not implemented yet)
45
47
 
46
- ### `tally`
47
-
48
48
  ### `n_nils`, `n_nans`
49
49
 
50
50
  - `n_nulls` is an alias of `n_nils`
@@ -126,20 +126,23 @@ boolean.all(opts: {skip_nulls: false}) #=> false
126
126
  |[ ]`asin` | | [ ] | | | |
127
127
  | ✓ `atan` | | ✓ | | | |
128
128
  | ✓ `bit_wise_not`| | (✓) | | |integer only|
129
- |[ ]`ceil` | | ✓ | | | |
129
+ | `ceil` | | ✓ | | | |
130
130
  | ✓ `cos` | | ✓ | | | |
131
- |[ ]`floor` | | | | | |
131
+ | ✓`fill_nil_backward`| ✓ || | | |
132
+ | ✓`fill_nil_forward` | ✓ | ✓ | ✓ | | |
133
+ | ✓ `floor` | | ✓ | | | |
132
134
  | ✓ `invert` | ✓ | | | |`!`, alias `not`|
133
135
  |[ ]`ln` | | [ ] | | | |
134
136
  |[ ]`log10` | | [ ] | | | |
135
137
  |[ ]`log1p` | | [ ] | | | |
136
138
  |[ ]`log2` | | [ ] | | | |
137
- |[ ]`round` | | [ ] | |[ ] Round| |
138
- |[ ]`round_to_multiple`| | [ ] | |[ ] RoundToMultiple| |
139
+ | `round` | || | Round (:mode, :n_digits)| |
140
+ | `round_to_multiple`| | | | RoundToMultiple :mode, :multiple| multiple must be an Arrow::Scalar|
139
141
  | ✓ `sign` | | ✓ | | | |
140
142
  | ✓ `sin` | | ✓ | | | |
143
+ | ✓`sort_indexes`| ✓ | ✓ | ✓ |:order|alias `sort_indices`|
141
144
  | ✓ `tan` | | ✓ | | | |
142
- |[ ]`trunc` | | ✓ | | | |
145
+ | `trunc` | | ✓ | | | |
143
146
 
144
147
  ### Binary element-wise: `vector.func(vector) => vector`
145
148
 
@@ -180,8 +183,30 @@ boolean.all(opts: {skip_nulls: false}) #=> false
180
183
  | ✓ `shift_right` | | (✓) | | |`>>`, integer only|
181
184
  | ✓ `xor` | ✓ | | | | `^` |
182
185
 
186
+ ### `uniq`
187
+
188
+ Returns a new array with distinct elements.
189
+
183
190
  (Not implemented functions)
184
- ### [ ] sort, sort_index
191
+
192
+ ### `tally` and `value_counts`
193
+
194
+ Compute counts of unique elements and return a Hash.
195
+
196
+ It returns almost the same result as Ruby's `tally`, except that these methods treat all NaNs as the same value.
197
+
198
+ ```ruby
199
+ array = [0.0/0, Float::NAN]
200
+ array.tally #=> {NaN=>1, NaN=>1}
201
+
202
+ vector = RedAmber::Vector.new(array)
203
+ vector.tally #=> {NaN=>2}
204
+ vector.value_counts #=> {NaN=>2}
205
+ ```
206
+
207
+ ### `sort_indexes`, `sort_indices`, `array_sort_indices`
208
+
209
+ ### [ ] `sort`, `sort_by`
185
210
  ### [ ] argmin, argmax
186
211
  ### [ ] (array functions)
187
212
  ### [ ] (strings functions)
@@ -192,4 +217,101 @@ boolean.all(opts: {skip_nulls: false}) #=> false
192
217
 
193
218
  ## Coerce (not implemented)
194
219
 
195
- ## Updating (not impremented)
220
+ ## Update vector's value
221
+ ### `replace_with(booleans, replacements)` => vector
222
+
223
+ - Accepts Vector, Array, Arrow::Array for booleans and replacements.
224
+ - Replacements can also be a scalar.
225
+ - Booleans specifies the positions to be replaced; elements are replaced where it is true.
226
+ - Replacements specifies the values to replace with.
227
+ - The number of `true` values in booleans must be equal to the length of replacements.
228
+
229
+ ```ruby
230
+ vector = RedAmber::Vector.new([1, 2, 3])
231
+ booleans = [true, false, true]
232
+ replacements = [4, 5]
233
+ vector.replace_with(booleans, replacements)
234
+ # =>
235
+ #<RedAmber::Vector(:uint8, size=3):0x000000000001ee10>
236
+ [4, 2, 5]
237
+ ```
238
+
239
+ - Scalar value in replacements can be broadcasted.
240
+
241
+ ```ruby
242
+ replacement = 0
243
+ vector.replace_with(booleans, replacement)
244
+ # =>
245
+ #<RedAmber::Vector(:uint8, size=3):0x000000000001ee10>
246
+ [0, 2, 0]
247
+ ```
248
+
249
+ - Returned data type is automatically up-casted by replacement.
250
+
251
+ ```ruby
252
+ replacement = 1.0
253
+ vector.replace_with(booleans, replacement)
254
+ # =>
255
+ #<RedAmber::Vector(:double, size=3):0x0000000000025d78>
256
+ [1.0, 2.0, 1.0]
257
+ ```
258
+
259
+ - Position of nil in booleans is replaced with nil.
260
+
261
+ ```ruby
262
+ booleans = [true, false, nil]
263
+ replacement = -1
264
+ vec.replace_with(booleans, replacement)
265
+ # =>
266
+ #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
267
+ [-1, 2, nil]
268
+ ```
269
+
270
+ - Replacements can contain nil.
271
+
272
+ ```ruby
273
+ booleans = [true, false, true]
274
+ replacements = [nil]
275
+ vec.replace_with(booleans, replacements)
276
+ # =>
277
+ #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
278
+ [nil, 2, nil]
279
+ ```
280
+
281
+ - If no replacements are specified, it is the same as specifying nil.
282
+
283
+ ```ruby
284
+ booleans = [true, false, true]
285
+ vec.replace_with(booleans)
286
+ # =>
287
+ #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
288
+ [nil, 2, nil]
289
+ ```
290
+
291
+ - An example of replacing 'NA' with nil.
292
+
293
+ ```ruby
294
+ vector = RedAmber::Vector.new(['A', 'B', 'NA'])
295
+ vector.replace_with(vector == 'NA', nil)
296
+ # =>
297
+ #<RedAmber::Vector(:string, size=3):0x000000000000f8ac>
298
+ ["A", "B", nil]
299
+ ```
300
+
301
+ ### `fill_nil_forward`, `fill_nil_backward` => vector
302
+
303
+ Propagate the last valid observation forward (or backward).
304
+ A nil is preserved when there is no valid value before it (forward) or after it (backward).
305
+
306
+ ```ruby
307
+ integer = RedAmber::Vector.new([0, 1, nil, 3, nil])
308
+ integer.fill_nil_forward
309
+ # =>
310
+ #<RedAmber::Vector(:uint8, size=5):0x000000000000f960>
311
+ [0, 1, 1, 3, 3]
312
+
313
+ integer.fill_nil_backward
314
+ # =>
315
+ #<RedAmber::Vector(:uint8, size=5):0x000000000000f974>
316
+ [0, 1, 3, 3, nil]
317
+ ```
Binary file
data/doc/tdr.md CHANGED
@@ -36,17 +36,20 @@ The API based on TDR is draft and RedAmber is a small experiment to test the TDR
36
36
  | |Basic Table|Transposed DataFrame|Comment for TDR|
37
37
  |-----------|---------|------------|---|
38
38
  |name in TDR|`Table`|`TDR`|**T**ransposed **D**ataFrame **R**epresentation|
39
- |variable |located in a column|a key and a `Vector` in lateral|select by key|
40
- |observation|located in a row|intersection in a vertical axis|select by index|
41
- |number of rows|n_rows etc. |`size` |`n_row` is available as an alias|
42
- |number of columns|n_columns etc. |`n_keys` |`n_col` is available as an alias|
43
- |shape |[n_rows, n_columns] |`[size, n_keys]` |same order as Table|
44
- |merge/join left| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |naturally join from bottom|
45
- |merge/join right| right_join(a,b))<br>merge(a, b, how='right')|`b.join(a)` |naturally join from bottom|
46
-
47
- ## Operation example with TDR API
48
-
49
- [Operation example with TDR API](TDR_operation.pdf) (draft)
39
+ |variable |located in a column|a key and a `Vector` in lateral|select by keys|
40
+ |observation|located in a row|sliced in vertical|select by indices|
41
+ |number of variables|n_columns etc. |`n_keys` |`n_cols` is available as an alias|
42
+ |number of observations|n_rows etc. |`size` |`n_rows` is available as an alias|
43
+ |shape |[n_rows, n_columns] |`shape`=`[size, n_keys]` |same order as Table|
44
+ |Select variables|select, filter, [ ], etc.|`pick` or `[keys]` |accepts arguments or a block|
45
+ |Reject variables|drop, etc.|`drop` |accepts arguments or a block|
46
+ |Select observations|slice, [ ], iloc, etc.|`slice` or `[indices]` |accepts arguments or a block|
47
+ |Reject observations|drop, etc.|`remove` |accepts arguments or a block|
48
+ |Add variables|mutate, assign, etc.|`assign` |accepts arguments or a block|
49
+ |Update variables|transmute, [ ]=, etc.|`assign` |accepts arguments or a block|
50
+ |inner join| inner_join(a,b)<br>merge(a, b, how='inner')|`a.inner_join(b)` |with an option `on:`|
51
+ |left join| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |naturally join from bottom<br>with an option `on:`|
52
+ |right join| right_join(a,b)<br>merge(a, b, how='right')|`b.join(a)` |naturally join from bottom<br>with an option `on:`|
50
53
 
51
54
  ## Q and A for TDR
52
55
 
data/doc/tdr_ja.md CHANGED
@@ -37,16 +37,19 @@ TDR に基づいた API はまだ暫定板の段階であり、RedAmber は TDR
37
37
  |-----------|---------|------------|---|
38
38
  |TDRでの呼称|`Table`|`TDR`|**T**ransposed **D**ataFrame **R**epresentationの略|
39
39
  |変数 |列に配置|`variables`<br>key と `Vector` として横方向に配置|key で選択|
40
- |観測 |行に配置|`observations`<br>縦方向に切った一つ一つは`slice`|index や `slice` メソッドで選択|
41
- |行の数|nrow, n_rows など |`size` |`n_row` をエイリアスとして設定|
42
- |列の数|ncol, n_columns など |`n_keys` |`n_col` をエイリアスとして設定|
43
- |shape |[nrow, ncol] |`[size, n_keys]` |行, 列の順番は同じ|
44
- |merge/join left| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |自然に下にくっつける|
45
- |merge/join right| right_join(a,b))<br>merge(a, b, how='right')|`b.join(a)` |自然に下にくっつける|
46
-
47
- ## Operation example with TDR API
48
-
49
- [TDR の操作例](TDR_operation.pdf) (暫定版)
40
+ |観測 |行に配置|`observations`<br>縦方向に切った一つ一つはslice|index や `slice` メソッドで選択|
41
+ |変数(列)の数|ncol, n_columns など |`n_keys` |`n_cols` をエイリアスとして設定|
42
+ |観測(行)の数|nrow, n_rows など |`size` |`n_rows` をエイリアスとして設定|
43
+ |形状 |[nrow, ncol] |`shape`=`[size, n_keys]` |行, 列の順番は同じ|
44
+ |変数(列)の選択|select, filter, [ ], など|`pick` or `[keys]` |引数またはブロックで指定|
45
+ |変数(列)の削除|drop, など|`drop` |引数またはブロックで指定|
46
+ |観測(行)の選択|slice, [ ], iloc, など|`slice` or `[indices]` |引数またはブロックで指定|
47
+ |観測(行)の削除|drop, など|`remove` |引数またはブロックで指定|
48
+ |変数(列)の追加|mutate, assign, など|`assign` |引数またはブロックで指定|
49
+ |変数(列)の更新|transmute, [ ]=, など|`assign` |引数またはブロックで指定|
50
+ |内部結合| inner_join(a,b)<br>merge(a, b, how='inner')|`a.inner_join(b)` |オプション on:|
51
+ |左結合| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |自然に下にくっつける<br>オプション on:|
52
+ |右結合| right_join(a,b)<br>merge(a, b, how='right')|`b.join(a)` |自然に下にくっつける<br>オプション on:|
50
53
 
51
54
  ## Q and A for TDR
52
55
 
@@ -7,20 +7,21 @@ module RedAmber
7
7
  # mix-in
8
8
  include DataFrameDisplayable
9
9
  include DataFrameHelper
10
+ include DataFrameIndexable
10
11
  include DataFrameSelectable
11
12
  include DataFrameObservationOperation
12
13
  include DataFrameVariableOperation
13
14
 
14
15
  def initialize(*args)
15
- # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
16
- # returns empty DataFrame
17
- @table = Arrow::Table.new({}, [])
16
+ @variables = @keys = @vectors = @types = @data_types = nil
18
17
  # bug in gobject-introspection: ruby-gnome/ruby-gnome#1472
19
18
  # [Arrow::Table] == [nil] shows ArgumentError
20
19
  # temporary use yoda condition to workaround
21
- return if args.empty? || args == [[]] || args == [{}] || [nil] == args
22
-
23
- if args.size > 1
20
+ if args.empty? || args == [[]] || args == [{}] || [nil] == args
21
+ # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
22
+ # returns empty DataFrame
23
+ @table = Arrow::Table.new({}, [])
24
+ elsif args.size > 1
24
25
  @table = Arrow::Table.new(*args)
25
26
  else
26
27
  arg = args[0]
@@ -42,11 +43,14 @@ module RedAmber
42
43
 
43
44
  attr_reader :table
44
45
 
46
+ def to_arrow
47
+ table
48
+ end
49
+
45
50
  def save(output, options = {})
46
51
  @table.save(output, options)
47
52
  end
48
53
 
49
- # Properties ===
50
54
  def size
51
55
  @table.n_rows
52
56
  end
@@ -63,8 +67,13 @@ module RedAmber
63
67
  [size, n_keys]
64
68
  end
65
69
 
70
+ def variables
71
+ @variables || @variables = init_instance_vars(:variables)
72
+ end
73
+ alias_method :vars, :variables
74
+
66
75
  def keys
67
- @table.columns.map { |column| column.name.to_sym }
76
+ @keys || @keys = init_instance_vars(:keys)
68
77
  end
69
78
  alias_method :column_names, :keys
70
79
  alias_method :var_names, :keys
@@ -81,21 +90,15 @@ module RedAmber
81
90
  alias_method :index, :key_index
82
91
 
83
92
  def types
84
- @table.columns.map do |column|
85
- column.data.value_type.nick.to_sym
86
- end
93
+ @types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
87
94
  end
88
95
 
89
- def data_types
90
- @table.columns.map do |column|
91
- column.data_type.class
92
- end
96
+ def type_classes
97
+ @data_types || @data_types = @table.columns.map { |column| column.data_type.class }
93
98
  end
94
99
 
95
100
  def vectors
96
- @table.columns.map do |column|
97
- Vector.new(column.data)
98
- end
101
+ @vectors || @vectors = init_instance_vars(:vectors)
99
102
  end
100
103
 
101
104
  def indexes
@@ -104,9 +107,7 @@ module RedAmber
104
107
  alias_method :indices, :indexes
105
108
 
106
109
  def to_h
107
- @table.columns.each_with_object({}) do |column, result|
108
- result[column.name.to_sym] = column.entries
109
- end
110
+ variables.transform_values(&:to_a)
110
111
  end
111
112
 
112
113
  def to_a
@@ -125,13 +126,27 @@ module RedAmber
125
126
  end
126
127
 
127
128
  def empty?
128
- @table.columns.empty?
129
+ variables.empty?
129
130
  end
130
131
 
131
132
  def to_rover
132
133
  Rover::DataFrame.new(to_h)
133
134
  end
134
135
 
135
- # def to_parquet() end
136
+ private
137
+
138
+ # initialize @variables, @keys and @vectors, then return the one named by `var`
139
+ def init_instance_vars(var)
140
+ ary = @table.columns.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
141
+ v = Vector.new(column.data)
142
+ k = column.name.to_sym
143
+ v.key = k
144
+ variables[k] = v
145
+ keys << k
146
+ vectors << v
147
+ end
148
+ @variables, @keys, @vectors = ary
149
+ ary[%i[variables keys vectors].index(var)]
150
+ end
136
151
  end
137
152
  end
@@ -73,7 +73,7 @@ module RedAmber
73
73
  [shorthand(vector, size, max_element)].concat na_string(vector)
74
74
  end
75
75
  else
76
- shorthand(vector, size, max_element)
76
+ [shorthand(vector, size, max_element)]
77
77
  end
78
78
  sio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
79
79
  end
@@ -111,9 +111,10 @@ module RedAmber
111
111
  end
112
112
 
113
113
  def shorthand(vector, size, max_element)
114
- a = vector.to_a.take(max_element)
114
+ max = vector.temporal? ? 2 : max_element
115
+ a = vector.to_a.take(max)
115
116
  a.map! { |e| e.nil? ? 'nil' : e.inspect }
116
- a << '... ' if size > max_element
117
+ a << '... ' if size > max
117
118
  "[#{a.join(', ')}]"
118
119
  end
119
120
 
@@ -6,9 +6,16 @@ module RedAmber
6
6
  private
7
7
 
8
8
  def expand_range(args)
9
- args.each_with_object([]) do |e, a|
9
+ ary = args.each_with_object([]) do |e, a|
10
10
  e.is_a?(Range) ? a.concat(normalized_array(e)) : a.append(e)
11
11
  end
12
+ ary.map do |e|
13
+ if e.is_a?(Integer) && e.negative?
14
+ e + size
15
+ else
16
+ e
17
+ end
18
+ end
12
19
  end
13
20
 
14
21
  def normalized_array(range)
@@ -50,13 +57,6 @@ module RedAmber
50
57
  DataFrame.new(@table.filter(array))
51
58
  end
52
59
 
53
- def select_obs_by_indeces(indeces)
54
- out_of_range?(indeces) && raise(DataFrameArgumentError, "Invalid index: #{indeces} for 0..#{size - 1}")
55
-
56
- a = indeces.map { |i| @table.slice(i).to_a }
57
- DataFrame.new(@table.schema, a)
58
- end
59
-
60
60
  def keys_by_booleans(booleans)
61
61
  keys.select.with_index { |_, i| booleans[i] }
62
62
  end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-ins for the class DataFrame
5
+ module DataFrameIndexable
6
+ # Common method
7
+ def map_indices(*indices)
8
+ return self if indices.empty?
9
+
10
+ indices = indices[0].data if indices[0].is_a?(Vector)
11
+
12
+ new_dataframe_by(indices)
13
+ end
14
+
15
+ # @param sort_keys [Arrow::SortKey]
16
+ # :key, "key" or "+key" denotes ascending,
17
+ # "-key" denotes descending order
18
+ # @return [RedAmber::Vector] Sorted indices in Vector
19
+ def sort_indices(*sort_keys)
20
+ indices = @table.sort_indices(sort_keys.flatten)
21
+ Vector.new(indices)
22
+ end
23
+
24
+ # @return [RedAmber::DataFrame] Sorted DataFrame
25
+ def sort(*sort_keys)
26
+ indices = @table.sort_indices(sort_keys.flatten)
27
+
28
+ new_dataframe_by(indices)
29
+ end
30
+
31
+ private
32
+
33
+ def new_dataframe_by(index_array)
34
+ t = Arrow::Function.find(:take).execute([@table, index_array]).value
35
+ RedAmber::DataFrame.new(t)
36
+ end
37
+ end
38
+ end
@@ -25,7 +25,7 @@ module RedAmber
25
25
 
26
26
  # filter with indexes
27
27
  slicer = expand_range(slicer)
28
- return select_obs_by_indeces(slicer) if integers?(slicer)
28
+ return map_indices(*slicer) if integers?(slicer)
29
29
 
30
30
  raise DataFrameArgumentError, "Invalid argument #{args}"
31
31
  end
@@ -57,11 +57,22 @@ module RedAmber
57
57
  # filter with indexes
58
58
  slicer = indexes.to_a - expand_range(remover)
59
59
  return remove_all_values if slicer.empty?
60
- return select_obs_by_indeces(slicer) if integers?(slicer)
60
+ return map_indices(*slicer) if integers?(slicer)
61
61
 
62
62
  raise DataFrameArgumentError, "Invalid argument #{args}"
63
63
  end
64
64
 
65
+ def remove_nil
66
+ func = Arrow::Function.find(:drop_null)
67
+ DataFrame.new(func.execute([table]).value)
68
+ end
69
+ alias_method :drop_nil, :remove_nil
70
+
71
+ def group(aggregating_keys, func, target_keys)
72
+ t = table.group(*aggregating_keys)
73
+ RedAmber::DataFrame.new(t.send(func, *target_keys))
74
+ end
75
+
65
76
  private
66
77
 
67
78
  # return a DataFrame with same keys as self without values
@@ -27,12 +27,22 @@ module RedAmber
27
27
 
28
28
  # expand Range like [1..3, 4] to [1, 2, 3, 4]
29
29
  expanded = expand_range(args)
30
- return select_obs_by_indeces(expanded) if integers?(expanded)
30
+ return map_indices(*expanded) if integers?(expanded)
31
31
  return select_vars_by_keys(expanded.map(&:to_sym)) if sym_or_str?(expanded)
32
32
 
33
33
  raise DataFrameArgumentError, "Invalid argument #{args}"
34
34
  end
35
35
 
36
+ # Select a variable by a key in String or Symbol
37
+ def v(key)
38
+ unless key.is_a?(Symbol) || key.is_a?(String)
39
+ raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
40
+ end
41
+ raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
42
+
43
+ variables[key.to_sym]
44
+ end
45
+
36
46
  def head(n_rows = 5)
37
47
  raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
38
48
 
@@ -57,10 +67,10 @@ module RedAmber
57
67
 
58
68
  def select_vars_by_keys(keys)
59
69
  if keys.one?
60
- t = @table[*keys]
61
- raise DataFrameArgumentError, "Key does not exist #{keys}" unless t
70
+ key = keys[0].to_sym
71
+ raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
62
72
 
63
- Vector.new(t.data)
73
+ variables[key]
64
74
  else
65
75
  DataFrame.new(@table[keys])
66
76
  end
@@ -5,10 +5,12 @@ module RedAmber
5
5
  # @data : holds Arrow::ChunkedArray
6
6
  class Vector
7
7
  # mix-in
8
+ include VectorCompensable
8
9
  include VectorFunctions
9
10
 
10
11
  # chunked_array may come from column.data
11
12
  def initialize(array)
13
+ @key = nil # default is 'headless'
12
14
  case array
13
15
  when Vector
14
16
  @data = array.data
@@ -17,11 +19,12 @@ module RedAmber
17
19
  when Array
18
20
  @data = Arrow::Array.new(array)
19
21
  else
20
- raise ArgumentError, 'Unknown array in argument'
22
+ raise VectorArgumentError, 'Unknown array in argument'
21
23
  end
22
24
  end
23
25
 
24
26
  attr_reader :data
27
+ attr_accessor :key
25
28
 
26
29
  def to_s
27
30
  @data.to_a.inspect
@@ -66,15 +69,19 @@ module RedAmber
66
69
  end
67
70
 
68
71
  def numeric?
69
- %i[int8 uint8 int16 uint16 int32 uint32 int64 uint64 float double].member? type
72
+ type_class < Arrow::NumericDataType
70
73
  end
71
74
 
72
75
  def string?
73
76
  type == :string
74
77
  end
75
78
 
76
- def data_type
77
- @data.value_type
79
+ def temporal?
80
+ type_class < Arrow::TemporalDataType
81
+ end
82
+
83
+ def type_class
84
+ @data.value_data_type.class
78
85
  end
79
86
 
80
87
  # def each() end
@@ -90,7 +97,23 @@ module RedAmber
90
97
  # def each_chunk() end
91
98
 
92
99
  def tally
93
- values.tally
100
+ hash = values.tally
101
+ if (type_class < Arrow::FloatingPointDataType) && is_nan.any
102
+ a = 0
103
+ hash.each do |key, value|
104
+ if key.is_a?(Float) && key.nan?
105
+ hash.delete(key)
106
+ a += value
107
+ end
108
+ end
109
+ hash[Float::NAN] = a
110
+ end
111
+ hash
112
+ end
113
+
114
+ def value_counts
115
+ values, counts = Arrow::Function.find(:value_counts).execute([data]).value.fields
116
+ values.zip(counts).to_h
94
117
  end
95
118
 
96
119
  def n_nulls
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
4
+ # reference: https://arrow.apache.org/docs/cpp/compute.html
5
+
6
+ module RedAmber
7
+ # mix-ins for class Vector
8
+ # Functions to make up some data (especially missing) for new data.
9
+ module VectorCompensable
10
+ # [Ternary]: replace_with(booleans, replacements) => vector
11
+ # Replace items selected with a boolean mask
12
+ #
13
+ # (from Arrow C++ inline doc.)
14
+ # Given an array and a boolean mask (either scalar or of equal length),
15
+ # along with replacement values (either scalar or array),
16
+ # each element of the array for which the corresponding mask element is
17
+ # true will be replaced by the next value from the replacements,
18
+ # or with null if the mask is null.
19
+ # Hence, for replacement arrays, len(replacements) == sum(mask == true).
20
+
21
+ def replace_with(booleans, replacements = nil)
22
+ specifier =
23
+ if booleans.is_a?(Arrow::BooleanArray)
24
+ booleans
25
+ elsif booleans.is_a?(Vector) && booleans.boolean?
26
+ booleans.data
27
+ elsif booleans.is_a?(Array) && booleans?(booleans)
28
+ Arrow::BooleanArray.new(booleans)
29
+ else
30
+ raise VectorTypeError, 'Not a valid type'
31
+ end
32
+ raise VectorArgumentError, 'Booleans size unmatch' if specifier.length != size
33
+ raise VectorArgumentError, 'Booleans not have any `true`' unless specifier.any?
34
+
35
+ r = Array(replacements) # scalar to [scalar]
36
+ r = [nil] if r.empty?
37
+
38
+ replacer =
39
+ if r.size == 1
40
+ case replacements
41
+ when Arrow::Array then replacements
42
+ when Vector then replacements.data
43
+ else
44
+ Arrow::Array.new(r * specifier.to_a.count(true)) # broadcast
45
+ end
46
+ else
47
+ Arrow::Array.new(r)
48
+ end
49
+ replacer = data.class.new(replacer) if replacer.uniq == [nil]
50
+
51
+ raise VectorArgumentError, 'Replacements size unmatch' if Array(specifier).count(true) != replacer.length
52
+
53
+ values = replacer.class.new(data)
54
+
55
+ datum = find('replace_with_mask').execute([values, specifier, replacer])
56
+ take_out_element_wise(datum)
57
+ end
58
+
59
+ # (related functions)
60
+ # fill_null_backward, fill_null_forward
61
+
62
+ private
63
+
64
+ def booleans?(enum)
65
+ enum.all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
66
+ end
67
+ end
68
+ end