red_amber 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -8
- data/CHANGELOG.md +74 -7
- data/Gemfile +3 -0
- data/README.md +47 -13
- data/benchmark/csv_load_penguins.yml +15 -0
- data/benchmark/drop_nil.yml +11 -0
- data/doc/DataFrame.md +185 -35
- data/doc/Vector.md +132 -10
- data/doc/image/dataframe_model.png +0 -0
- data/doc/tdr.md +14 -11
- data/doc/tdr_ja.md +13 -10
- data/lib/red_amber/data_frame.rb +38 -23
- data/lib/red_amber/data_frame_displayable.rb +4 -3
- data/lib/red_amber/data_frame_helper.rb +8 -8
- data/lib/red_amber/data_frame_indexable.rb +38 -0
- data/lib/red_amber/data_frame_observation_operation.rb +13 -2
- data/lib/red_amber/data_frame_selectable.rb +14 -4
- data/lib/red_amber/vector.rb +28 -5
- data/lib/red_amber/vector_compensable.rb +68 -0
- data/lib/red_amber/vector_functions.rb +16 -13
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +5 -0
- data/red_amber.gemspec +3 -6
- metadata +12 -9
- data/doc/image/TDR_operations.pdf +0 -0
data/doc/Vector.md
CHANGED
@@ -33,7 +33,9 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
33
33
|
|
34
34
|
### `type`
|
35
35
|
|
36
|
-
### `
|
36
|
+
### `boolean?`, `numeric?`, `string?`, `temporal?`
|
37
|
+
|
38
|
+
### `type_class`
|
37
39
|
|
38
40
|
### [ ] `each` (not implemented yet)
|
39
41
|
|
@@ -43,8 +45,6 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
43
45
|
|
44
46
|
### [ ] `each_chunk` (not implemented yet)
|
45
47
|
|
46
|
-
### `tally`
|
47
|
-
|
48
48
|
### `n_nils`, `n_nans`
|
49
49
|
|
50
50
|
- `n_nulls` is an alias of `n_nils`
|
@@ -126,20 +126,23 @@ boolean.all(opts: {skip_nulls: false}) #=> false
|
|
126
126
|
|[ ]`asin` | | [ ] | | | |
|
127
127
|
| ✓ `atan` | | ✓ | | | |
|
128
128
|
| ✓ `bit_wise_not`| | (✓) | | |integer only|
|
129
|
-
|
|
129
|
+
| ✓ `ceil` | | ✓ | | | |
|
130
130
|
| ✓ `cos` | | ✓ | | | |
|
131
|
-
|
|
131
|
+
| ✓`fill_nil_backward`| ✓ | ✓ | ✓ | | |
|
132
|
+
| ✓`fill_nil_forward` | ✓ | ✓ | ✓ | | |
|
133
|
+
| ✓ `floor` | | ✓ | | | |
|
132
134
|
| ✓ `invert` | ✓ | | | |`!`, alias `not`|
|
133
135
|
|[ ]`ln` | | [ ] | | | |
|
134
136
|
|[ ]`log10` | | [ ] | | | |
|
135
137
|
|[ ]`log1p` | | [ ] | | | |
|
136
138
|
|[ ]`log2` | | [ ] | | | |
|
137
|
-
|
|
138
|
-
|
|
139
|
+
| ✓ `round` | | ✓ | | ✓ Round (:mode, :n_digits)| |
|
140
|
+
| ✓ `round_to_multiple`| | ✓ | | ✓ RoundToMultiple :mode, :multiple| multiple must be an Arrow::Scalar|
|
139
141
|
| ✓ `sign` | | ✓ | | | |
|
140
142
|
| ✓ `sin` | | ✓ | | | |
|
143
|
+
| ✓`sort_indexes`| ✓ | ✓ | ✓ |:order|alias `sort_indices`|
|
141
144
|
| ✓ `tan` | | ✓ | | | |
|
142
|
-
|
|
145
|
+
| ✓ `trunc` | | ✓ | | | |
|
143
146
|
|
144
147
|
### Binary element-wise: `vector.func(vector) => vector`
|
145
148
|
|
@@ -180,8 +183,30 @@ boolean.all(opts: {skip_nulls: false}) #=> false
|
|
180
183
|
| ✓ `shift_right` | | (✓) | | |`>>`, integer only|
|
181
184
|
| ✓ `xor` | ✓ | | | | `^` |
|
182
185
|
|
186
|
+
### `uniq`
|
187
|
+
|
188
|
+
Returns a new array with distinct elements.
|
189
|
+
|
183
190
|
(Not implemented functions)
|
184
|
-
|
191
|
+
|
192
|
+
### `tally` and `value_counts`
|
193
|
+
|
194
|
+
Compute counts of unique elements and return a Hash.
|
195
|
+
|
196
|
+
It returns almost the same result as Ruby's tally, except that these methods treat all NaNs as the same value.
|
197
|
+
|
198
|
+
```ruby
|
199
|
+
array = [0.0/0, Float::NAN]
|
200
|
+
array.tally #=> {NaN=>1, NaN=>1}
|
201
|
+
|
202
|
+
vector = RedAmber::Vector.new(array)
|
203
|
+
vector.tally #=> {NaN=>2}
|
204
|
+
vector.value_counts #=> {NaN=>2}
|
205
|
+
```
|
206
|
+
|
207
|
+
### `sort_indexes`, `sort_indices`, `array_sort_indices`
|
208
|
+
|
209
|
+
### [ ] `sort`, `sort_by`
|
185
210
|
### [ ] argmin, argmax
|
186
211
|
### [ ] (array functions)
|
187
212
|
### [ ] (strings functions)
|
@@ -192,4 +217,101 @@ boolean.all(opts: {skip_nulls: false}) #=> false
|
|
192
217
|
|
193
218
|
## Coerce (not implemented)
|
194
219
|
|
195
|
-
##
|
220
|
+
## Update vector's value
|
221
|
+
### `replace_with(booleans, replacements)` => vector
|
222
|
+
|
223
|
+
- Accepts Vector, Array, Arrow::Array for booleans and replacements.
|
224
|
+
- Replacements can accept scalar
|
225
|
+
- Booleans specifies the positions to be replaced, marked by `true`.
|
226
|
+
- Replacements specifies the new values to put at those positions.
|
227
|
+
- The number of `true` values in booleans must be equal to the length of replacements.
|
228
|
+
|
229
|
+
```ruby
|
230
|
+
vector = RedAmber::Vector.new([1, 2, 3])
|
231
|
+
booleans = [true, false, true]
|
232
|
+
replacemants = [4, 5]
|
233
|
+
vector.replace_with(booleans, replacemants)
|
234
|
+
# =>
|
235
|
+
#<RedAmber::Vector(:uint8, size=3):0x000000000001ee10>
|
236
|
+
[4, 2, 5]
|
237
|
+
```
|
238
|
+
|
239
|
+
- Scalar value in replacements can be broadcasted.
|
240
|
+
|
241
|
+
```ruby
|
242
|
+
replacement = 0
|
243
|
+
vector.replace_with(booleans, replacement)
|
244
|
+
# =>
|
245
|
+
#<RedAmber::Vector(:uint8, size=3):0x000000000001ee10>
|
246
|
+
[0, 2, 0]
|
247
|
+
```
|
248
|
+
|
249
|
+
- Returned data type is automatically up-casted by replacement.
|
250
|
+
|
251
|
+
```ruby
|
252
|
+
replacement = 1.0
|
253
|
+
vector.replace_with(booleans, replacement)
|
254
|
+
# =>
|
255
|
+
#<RedAmber::Vector(:double, size=3):0x0000000000025d78>
|
256
|
+
[1.0, 2.0, 1.0]
|
257
|
+
```
|
258
|
+
|
259
|
+
- Position of nil in booleans is replaced with nil.
|
260
|
+
|
261
|
+
```ruby
|
262
|
+
booleans = [true, false, nil]
|
263
|
+
replacement = -1
|
264
|
+
vec.replace_with(booleans, replacement)
|
265
|
+
=>
|
266
|
+
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
267
|
+
[-1, 2, nil]
|
268
|
+
```
|
269
|
+
|
270
|
+
- Replacements can contain nil.
|
271
|
+
|
272
|
+
```ruby
|
273
|
+
booleans = [true, false, true]
|
274
|
+
replacemants = [nil]
|
275
|
+
vec.replace_with(booleans, replacemants)
|
276
|
+
=>
|
277
|
+
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
278
|
+
[nil, 2, nil]
|
279
|
+
```
|
280
|
+
|
281
|
+
- If no replacements are specified, it is the same as specifying nil.
|
282
|
+
|
283
|
+
```ruby
|
284
|
+
booleans = [true, false, true]
|
285
|
+
vec.replace_with(booleans)
|
286
|
+
=>
|
287
|
+
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
288
|
+
[nil, 2, nil]
|
289
|
+
```
|
290
|
+
|
291
|
+
- An example of replacing 'NA' with nil.
|
292
|
+
|
293
|
+
```ruby
|
294
|
+
vector = RedAmber::Vector.new(['A', 'B', 'NA'])
|
295
|
+
vector.replace_with(vector == 'NA', nil)
|
296
|
+
# =>
|
297
|
+
#<RedAmber::Vector(:string, size=3):0x000000000000f8ac>
|
298
|
+
["A", "B", nil]
|
299
|
+
```
|
300
|
+
|
301
|
+
### `fill_nil_forward`, `fill_nil_backward` => vector
|
302
|
+
|
303
|
+
Propagate the last valid observation forward (or backward).
|
304
|
+
Or preserve nil if all previous values are nil or at the end.
|
305
|
+
|
306
|
+
```ruby
|
307
|
+
integer = RedAmber::Vector.new([0, 1, nil, 3, nil])
|
308
|
+
integer.fill_nil_forward
|
309
|
+
# =>
|
310
|
+
#<RedAmber::Vector(:uint8, size=5):0x000000000000f960>
|
311
|
+
[0, 1, 1, 3, 3]
|
312
|
+
|
313
|
+
integer.fill_nil_backward
|
314
|
+
# =>
|
315
|
+
#<RedAmber::Vector(:uint8, size=5):0x000000000000f974>
|
316
|
+
[0, 1, 3, 3, nil]
|
317
|
+
```
|
Binary file
|
data/doc/tdr.md
CHANGED
@@ -36,17 +36,20 @@ The API based on TDR is draft and RedAmber is a small experiment to test the TDR
|
|
36
36
|
| |Basic Table|Transposed DataFrame|Comment for TDR|
|
37
37
|
|-----------|---------|------------|---|
|
38
38
|
|name in TDR|`Table`|`TDR`|**T**ransposed **D**ataFrame **R**epresentation|
|
39
|
-
|variable |located in a column|a key and a `Vector` in lateral|select by
|
40
|
-
|observation|located in a row|
|
41
|
-
|number of
|
42
|
-
|number of
|
43
|
-
|shape |[n_rows, n_columns] |`[size, n_keys]` |same order as Table|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
[
|
39
|
+
|variable |located in a column|a key and a `Vector` in lateral|select by keys|
|
40
|
+
|observation|located in a row|sliced in vertical|select by indices|
|
41
|
+
|number of variables|n_columns etc. |`n_keys` |`n_cols` is available as an alias|
|
42
|
+
|number of observations|n_rows etc. |`size` |`n_rows` is available as an alias|
|
43
|
+
|shape |[n_rows, n_columns] |`shape`=`[size, n_keys]` |same order as Table|
|
44
|
+
|Select variables|select, filter, [ ], etc.|`pick` or `[keys]` |accepts arguments or a block|
|
45
|
+
|Reject variables|drop, etc.|`drop` |accepts arguments or a block|
|
46
|
+
|Select observations|slice, [ ], iloc, etc.|`slice` or `[indices]` |accepts arguments or a block|
|
47
|
+
|Reject observations|drop, etc.|`remove` |accepts arguments or a block|
|
48
|
+
|Add variables|mutate, assign, etc.|`assign` |accepts arguments or a block|
|
49
|
+
|update variables|transmute, [ ]=, etc.|`assign` |accepts arguments or a block|
|
50
|
+
|inner join| inner_join(a,b)<br>merge(a, b, how='inner')|`a.inner_join(b)` |with an option on:|
|
51
|
+
|left join| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |naturally join from bottom<br>with an option on:|
|
52
|
+
|right join| right_join(a,b)<br>merge(a, b, how='right')|`b.join(a)` |naturally join from bottom<br>with an option on:|
|
50
53
|
|
51
54
|
## Q and A for TDR
|
52
55
|
|
data/doc/tdr_ja.md
CHANGED
@@ -37,16 +37,19 @@ TDR に基づいた API はまだ暫定板の段階であり、RedAmber は TDR
|
|
37
37
|
|-----------|---------|------------|---|
|
38
38
|
|TDRでの呼称|`Table`|`TDR`|**T**ransposed **D**ataFrame **R**epresentationの略|
|
39
39
|
|変数 |列に配置|`variables`<br>key と `Vector` として横方向に配置|key で選択|
|
40
|
-
|観測 |行に配置|`observations`<br
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
[
|
40
|
+
|観測 |行に配置|`observations`<br>縦方向に切った一つ一つはslice|index や `slice` メソッドで選択|
|
41
|
+
|変数(列)の数|ncol, n_columns など |`n_keys` |`n_cols` をエイリアスとして設定|
|
42
|
+
|観測(行)の数|nrow, n_rows など |`size` |`n_rows` をエイリアスとして設定|
|
43
|
+
|形状 |[nrow, ncol] |`shape`=`[size, n_keys]` |行, 列の順番は同じ|
|
44
|
+
|変数(列)の選択|select, filter, [ ], など|`pick` or `[keys]` |引数またはブロックで指定|
|
45
|
+
|変数(列)の削除|drop, など|`drop` |引数またはブロックで指定|
|
46
|
+
|観測(行)の選択|slice, [ ], iloc, など|`slice` or `[indices]` |引数またはブロックで指定|
|
47
|
+
|観測(行)の削除|drop, など|`remove` |引数またはブロックで指定|
|
48
|
+
|変数(列)の追加|mutate, assign, など|`assign` |引数またはブロックで指定|
|
49
|
+
|変数(列)の更新|transmute, [ ]=, など|`assign` |引数またはブロックで指定|
|
50
|
+
|内部結合| inner_join(a,b)<br>merge(a, b, how='inner')|`a.inner_join(b)` |オプション on:|
|
51
|
+
|左結合| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |自然に下にくっつける<br>オプション on:|
|
52
|
+
|右結合| right_join(a,b)<br>merge(a, b, how='right')|`b.join(a)` |自然に下にくっつける<br>オプション on:|
|
50
53
|
|
51
54
|
## Q and A for TDR
|
52
55
|
|
data/lib/red_amber/data_frame.rb
CHANGED
@@ -7,20 +7,21 @@ module RedAmber
|
|
7
7
|
# mix-in
|
8
8
|
include DataFrameDisplayable
|
9
9
|
include DataFrameHelper
|
10
|
+
include DataFrameIndexable
|
10
11
|
include DataFrameSelectable
|
11
12
|
include DataFrameObservationOperation
|
12
13
|
include DataFrameVariableOperation
|
13
14
|
|
14
15
|
def initialize(*args)
|
15
|
-
|
16
|
-
# returns empty DataFrame
|
17
|
-
@table = Arrow::Table.new({}, [])
|
16
|
+
@variables = @keys = @vectors = @types = @data_types = nil
|
18
17
|
# bug in gobject-introspection: ruby-gnome/ruby-gnome#1472
|
19
18
|
# [Arrow::Table] == [nil] shows ArgumentError
|
20
19
|
# temporary use yoda condition to workaround
|
21
|
-
|
22
|
-
|
23
|
-
|
20
|
+
if args.empty? || args == [[]] || args == [{}] || [nil] == args
|
21
|
+
# DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
22
|
+
# returns empty DataFrame
|
23
|
+
@table = Arrow::Table.new({}, [])
|
24
|
+
elsif args.size > 1
|
24
25
|
@table = Arrow::Table.new(*args)
|
25
26
|
else
|
26
27
|
arg = args[0]
|
@@ -42,11 +43,14 @@ module RedAmber
|
|
42
43
|
|
43
44
|
attr_reader :table
|
44
45
|
|
46
|
+
def to_arrow
|
47
|
+
table
|
48
|
+
end
|
49
|
+
|
45
50
|
def save(output, options = {})
|
46
51
|
@table.save(output, options)
|
47
52
|
end
|
48
53
|
|
49
|
-
# Properties ===
|
50
54
|
def size
|
51
55
|
@table.n_rows
|
52
56
|
end
|
@@ -63,8 +67,13 @@ module RedAmber
|
|
63
67
|
[size, n_keys]
|
64
68
|
end
|
65
69
|
|
70
|
+
def variables
|
71
|
+
@variables || @variables = init_instance_vars(:variables)
|
72
|
+
end
|
73
|
+
alias_method :vars, :variables
|
74
|
+
|
66
75
|
def keys
|
67
|
-
@
|
76
|
+
@keys || @keys = init_instance_vars(:keys)
|
68
77
|
end
|
69
78
|
alias_method :column_names, :keys
|
70
79
|
alias_method :var_names, :keys
|
@@ -81,21 +90,15 @@ module RedAmber
|
|
81
90
|
alias_method :index, :key_index
|
82
91
|
|
83
92
|
def types
|
84
|
-
@table.columns.map
|
85
|
-
column.data.value_type.nick.to_sym
|
86
|
-
end
|
93
|
+
@types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
|
87
94
|
end
|
88
95
|
|
89
|
-
def
|
90
|
-
@table.columns.map
|
91
|
-
column.data_type.class
|
92
|
-
end
|
96
|
+
def type_classes
|
97
|
+
@data_types || @data_types = @table.columns.map { |column| column.data_type.class }
|
93
98
|
end
|
94
99
|
|
95
100
|
def vectors
|
96
|
-
@
|
97
|
-
Vector.new(column.data)
|
98
|
-
end
|
101
|
+
@vectors || @vectors = init_instance_vars(:vectors)
|
99
102
|
end
|
100
103
|
|
101
104
|
def indexes
|
@@ -104,9 +107,7 @@ module RedAmber
|
|
104
107
|
alias_method :indices, :indexes
|
105
108
|
|
106
109
|
def to_h
|
107
|
-
|
108
|
-
result[column.name.to_sym] = column.entries
|
109
|
-
end
|
110
|
+
variables.transform_values(&:to_a)
|
110
111
|
end
|
111
112
|
|
112
113
|
def to_a
|
@@ -125,13 +126,27 @@ module RedAmber
|
|
125
126
|
end
|
126
127
|
|
127
128
|
def empty?
|
128
|
-
|
129
|
+
variables.empty?
|
129
130
|
end
|
130
131
|
|
131
132
|
def to_rover
|
132
133
|
Rover::DataFrame.new(to_h)
|
133
134
|
end
|
134
135
|
|
135
|
-
|
136
|
+
private
|
137
|
+
|
138
|
+
# initialize @variables, @keys, @vectors and return one of them
|
139
|
+
def init_instance_vars(var)
|
140
|
+
ary = @table.columns.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
|
141
|
+
v = Vector.new(column.data)
|
142
|
+
k = column.name.to_sym
|
143
|
+
v.key = k
|
144
|
+
variables[k] = v
|
145
|
+
keys << k
|
146
|
+
vectors << v
|
147
|
+
end
|
148
|
+
@variables, @keys, @vectors = ary
|
149
|
+
ary[%i[variables keys vectors].index(var)]
|
150
|
+
end
|
136
151
|
end
|
137
152
|
end
|
@@ -73,7 +73,7 @@ module RedAmber
|
|
73
73
|
[shorthand(vector, size, max_element)].concat na_string(vector)
|
74
74
|
end
|
75
75
|
else
|
76
|
-
shorthand(vector, size, max_element)
|
76
|
+
[shorthand(vector, size, max_element)]
|
77
77
|
end
|
78
78
|
sio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
|
79
79
|
end
|
@@ -111,9 +111,10 @@ module RedAmber
|
|
111
111
|
end
|
112
112
|
|
113
113
|
def shorthand(vector, size, max_element)
|
114
|
-
|
114
|
+
max = vector.temporal? ? 2 : max_element
|
115
|
+
a = vector.to_a.take(max)
|
115
116
|
a.map! { |e| e.nil? ? 'nil' : e.inspect }
|
116
|
-
a << '... ' if size >
|
117
|
+
a << '... ' if size > max
|
117
118
|
"[#{a.join(', ')}]"
|
118
119
|
end
|
119
120
|
|
@@ -6,9 +6,16 @@ module RedAmber
|
|
6
6
|
private
|
7
7
|
|
8
8
|
def expand_range(args)
|
9
|
-
args.each_with_object([]) do |e, a|
|
9
|
+
ary = args.each_with_object([]) do |e, a|
|
10
10
|
e.is_a?(Range) ? a.concat(normalized_array(e)) : a.append(e)
|
11
11
|
end
|
12
|
+
ary.map do |e|
|
13
|
+
if e.is_a?(Integer) && e.negative?
|
14
|
+
e + size
|
15
|
+
else
|
16
|
+
e
|
17
|
+
end
|
18
|
+
end
|
12
19
|
end
|
13
20
|
|
14
21
|
def normalized_array(range)
|
@@ -50,13 +57,6 @@ module RedAmber
|
|
50
57
|
DataFrame.new(@table.filter(array))
|
51
58
|
end
|
52
59
|
|
53
|
-
def select_obs_by_indeces(indeces)
|
54
|
-
out_of_range?(indeces) && raise(DataFrameArgumentError, "Invalid index: #{indeces} for 0..#{size - 1}")
|
55
|
-
|
56
|
-
a = indeces.map { |i| @table.slice(i).to_a }
|
57
|
-
DataFrame.new(@table.schema, a)
|
58
|
-
end
|
59
|
-
|
60
60
|
def keys_by_booleans(booleans)
|
61
61
|
keys.select.with_index { |_, i| booleans[i] }
|
62
62
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-ins for the class DataFrame
|
5
|
+
module DataFrameIndexable
|
6
|
+
# Common method
|
7
|
+
def map_indices(*indices)
|
8
|
+
return self if indices.empty?
|
9
|
+
|
10
|
+
indices = indices[0].data if indices[0].is_a?(Vector)
|
11
|
+
|
12
|
+
new_dataframe_by(indices)
|
13
|
+
end
|
14
|
+
|
15
|
+
# @param sort_keys [Arrow::SortKey]
|
16
|
+
# :key, "key" or "+key" denotes ascending,
|
17
|
+
# "-key" denotes descending order
|
18
|
+
# @return [RedAmber::Vector] Sorted indices in Vector
|
19
|
+
def sort_indices(*sort_keys)
|
20
|
+
indices = @table.sort_indices(sort_keys.flatten)
|
21
|
+
Vector.new(indices)
|
22
|
+
end
|
23
|
+
|
24
|
+
# @return [RedAmber::DataFrame] Sorted DataFrame
|
25
|
+
def sort(*sort_keys)
|
26
|
+
indices = @table.sort_indices(sort_keys.flatten)
|
27
|
+
|
28
|
+
new_dataframe_by(indices)
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def new_dataframe_by(index_array)
|
34
|
+
t = Arrow::Function.find(:take).execute([@table, index_array]).value
|
35
|
+
RedAmber::DataFrame.new(t)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -25,7 +25,7 @@ module RedAmber
|
|
25
25
|
|
26
26
|
# filter with indexes
|
27
27
|
slicer = expand_range(slicer)
|
28
|
-
return
|
28
|
+
return map_indices(*slicer) if integers?(slicer)
|
29
29
|
|
30
30
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
31
31
|
end
|
@@ -57,11 +57,22 @@ module RedAmber
|
|
57
57
|
# filter with indexes
|
58
58
|
slicer = indexes.to_a - expand_range(remover)
|
59
59
|
return remove_all_values if slicer.empty?
|
60
|
-
return
|
60
|
+
return map_indices(*slicer) if integers?(slicer)
|
61
61
|
|
62
62
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
63
63
|
end
|
64
64
|
|
65
|
+
def remove_nil
|
66
|
+
func = Arrow::Function.find(:drop_null)
|
67
|
+
DataFrame.new(func.execute([table]).value)
|
68
|
+
end
|
69
|
+
alias_method :drop_nil, :remove_nil
|
70
|
+
|
71
|
+
def group(aggregating_keys, func, target_keys)
|
72
|
+
t = table.group(*aggregating_keys)
|
73
|
+
RedAmber::DataFrame.new(t.send(func, *target_keys))
|
74
|
+
end
|
75
|
+
|
65
76
|
private
|
66
77
|
|
67
78
|
# return a DataFrame with same keys as self without values
|
@@ -27,12 +27,22 @@ module RedAmber
|
|
27
27
|
|
28
28
|
# expand Range like [1..3, 4] to [1, 2, 3, 4]
|
29
29
|
expanded = expand_range(args)
|
30
|
-
return
|
30
|
+
return map_indices(*expanded) if integers?(expanded)
|
31
31
|
return select_vars_by_keys(expanded.map(&:to_sym)) if sym_or_str?(expanded)
|
32
32
|
|
33
33
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
34
34
|
end
|
35
35
|
|
36
|
+
# Select a variable by a key in String or Symbol
|
37
|
+
def v(key)
|
38
|
+
unless key.is_a?(Symbol) || key.is_a?(String)
|
39
|
+
raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
|
40
|
+
end
|
41
|
+
raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
|
42
|
+
|
43
|
+
variables[key.to_sym]
|
44
|
+
end
|
45
|
+
|
36
46
|
def head(n_rows = 5)
|
37
47
|
raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
|
38
48
|
|
@@ -57,10 +67,10 @@ module RedAmber
|
|
57
67
|
|
58
68
|
def select_vars_by_keys(keys)
|
59
69
|
if keys.one?
|
60
|
-
|
61
|
-
raise DataFrameArgumentError, "Key does not exist #{keys}" unless
|
70
|
+
key = keys[0].to_sym
|
71
|
+
raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
|
62
72
|
|
63
|
-
|
73
|
+
variables[key]
|
64
74
|
else
|
65
75
|
DataFrame.new(@table[keys])
|
66
76
|
end
|
data/lib/red_amber/vector.rb
CHANGED
@@ -5,10 +5,12 @@ module RedAmber
|
|
5
5
|
# @data : holds Arrow::ChunkedArray
|
6
6
|
class Vector
|
7
7
|
# mix-in
|
8
|
+
include VectorCompensable
|
8
9
|
include VectorFunctions
|
9
10
|
|
10
11
|
# chunked_array may come from column.data
|
11
12
|
def initialize(array)
|
13
|
+
@key = nil # default is 'headless'
|
12
14
|
case array
|
13
15
|
when Vector
|
14
16
|
@data = array.data
|
@@ -17,11 +19,12 @@ module RedAmber
|
|
17
19
|
when Array
|
18
20
|
@data = Arrow::Array.new(array)
|
19
21
|
else
|
20
|
-
raise
|
22
|
+
raise VectorArgumentError, 'Unknown array in argument'
|
21
23
|
end
|
22
24
|
end
|
23
25
|
|
24
26
|
attr_reader :data
|
27
|
+
attr_accessor :key
|
25
28
|
|
26
29
|
def to_s
|
27
30
|
@data.to_a.inspect
|
@@ -66,15 +69,19 @@ module RedAmber
|
|
66
69
|
end
|
67
70
|
|
68
71
|
def numeric?
|
69
|
-
|
72
|
+
type_class < Arrow::NumericDataType
|
70
73
|
end
|
71
74
|
|
72
75
|
def string?
|
73
76
|
type == :string
|
74
77
|
end
|
75
78
|
|
76
|
-
def
|
77
|
-
|
79
|
+
def temporal?
|
80
|
+
type_class < Arrow::TemporalDataType
|
81
|
+
end
|
82
|
+
|
83
|
+
def type_class
|
84
|
+
@data.value_data_type.class
|
78
85
|
end
|
79
86
|
|
80
87
|
# def each() end
|
@@ -90,7 +97,23 @@ module RedAmber
|
|
90
97
|
# def each_chunk() end
|
91
98
|
|
92
99
|
def tally
|
93
|
-
values.tally
|
100
|
+
hash = values.tally
|
101
|
+
if (type_class < Arrow::FloatingPointDataType) && is_nan.any
|
102
|
+
a = 0
|
103
|
+
hash.each do |key, value|
|
104
|
+
if key.is_a?(Float) && key.nan?
|
105
|
+
hash.delete(key)
|
106
|
+
a += value
|
107
|
+
end
|
108
|
+
end
|
109
|
+
hash[Float::NAN] = a
|
110
|
+
end
|
111
|
+
hash
|
112
|
+
end
|
113
|
+
|
114
|
+
def value_counts
|
115
|
+
values, counts = Arrow::Function.find(:value_counts).execute([data]).value.fields
|
116
|
+
values.zip(counts).to_h
|
94
117
|
end
|
95
118
|
|
96
119
|
def n_nulls
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
|
4
|
+
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
5
|
+
|
6
|
+
module RedAmber
|
7
|
+
# mix-ins for class Vector
|
8
|
+
# Functions to make up some data (especially missing) for new data.
|
9
|
+
module VectorCompensable
|
10
|
+
# [Ternary]: replace_with(booleans, replacements) => vector
|
11
|
+
# Replace items selected with a boolean mask
|
12
|
+
#
|
13
|
+
# (from Arrow C++ inline doc.)
|
14
|
+
# Given an array and a boolean mask (either scalar or of equal length),
|
15
|
+
# along with replacement values (either scalar or array),
|
16
|
+
# each element of the array for which the corresponding mask element is
|
17
|
+
# true will be replaced by the next value from the replacements,
|
18
|
+
# or with null if the mask is null.
|
19
|
+
# Hence, for replacement arrays, len(replacements) == sum(mask == true).
|
20
|
+
|
21
|
+
def replace_with(booleans, replacements = nil)
|
22
|
+
specifier =
|
23
|
+
if booleans.is_a?(Arrow::BooleanArray)
|
24
|
+
booleans
|
25
|
+
elsif booleans.is_a?(Vector) && booleans.boolean?
|
26
|
+
booleans.data
|
27
|
+
elsif booleans.is_a?(Array) && booleans?(booleans)
|
28
|
+
Arrow::BooleanArray.new(booleans)
|
29
|
+
else
|
30
|
+
raise VectorTypeError, 'Not a valid type'
|
31
|
+
end
|
32
|
+
raise VectorArgumentError, 'Booleans size unmatch' if specifier.length != size
|
33
|
+
raise VectorArgumentError, 'Booleans not have any `true`' unless specifier.any?
|
34
|
+
|
35
|
+
r = Array(replacements) # scalar to [scalar]
|
36
|
+
r = [nil] if r.empty?
|
37
|
+
|
38
|
+
replacer =
|
39
|
+
if r.size == 1
|
40
|
+
case replacements
|
41
|
+
when Arrow::Array then replacements
|
42
|
+
when Vector then replacements.data
|
43
|
+
else
|
44
|
+
Arrow::Array.new(r * specifier.to_a.count(true)) # broadcast
|
45
|
+
end
|
46
|
+
else
|
47
|
+
Arrow::Array.new(r)
|
48
|
+
end
|
49
|
+
replacer = data.class.new(replacer) if replacer.uniq == [nil]
|
50
|
+
|
51
|
+
raise VectorArgumentError, 'Replacements size unmatch' if Array(specifier).count(true) != replacer.length
|
52
|
+
|
53
|
+
values = replacer.class.new(data)
|
54
|
+
|
55
|
+
datum = find('replace_with_mask').execute([values, specifier, replacer])
|
56
|
+
take_out_element_wise(datum)
|
57
|
+
end
|
58
|
+
|
59
|
+
# (related functions)
|
60
|
+
# fill_null_backward, fill_null_forward
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
def booleans?(enum)
|
65
|
+
enum.all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|