red_amber 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -8
- data/CHANGELOG.md +74 -7
- data/Gemfile +3 -0
- data/README.md +47 -13
- data/benchmark/csv_load_penguins.yml +15 -0
- data/benchmark/drop_nil.yml +11 -0
- data/doc/DataFrame.md +185 -35
- data/doc/Vector.md +132 -10
- data/doc/image/dataframe_model.png +0 -0
- data/doc/tdr.md +14 -11
- data/doc/tdr_ja.md +13 -10
- data/lib/red_amber/data_frame.rb +38 -23
- data/lib/red_amber/data_frame_displayable.rb +4 -3
- data/lib/red_amber/data_frame_helper.rb +8 -8
- data/lib/red_amber/data_frame_indexable.rb +38 -0
- data/lib/red_amber/data_frame_observation_operation.rb +13 -2
- data/lib/red_amber/data_frame_selectable.rb +14 -4
- data/lib/red_amber/vector.rb +28 -5
- data/lib/red_amber/vector_compensable.rb +68 -0
- data/lib/red_amber/vector_functions.rb +16 -13
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +5 -0
- data/red_amber.gemspec +3 -6
- metadata +12 -9
- data/doc/image/TDR_operations.pdf +0 -0
data/doc/Vector.md
CHANGED
@@ -33,7 +33,9 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
33
33
|
|
34
34
|
### `type`
|
35
35
|
|
36
|
-
### `
|
36
|
+
### `boolean?`, `numeric?`, `string?`, `temporal?`
|
37
|
+
|
38
|
+
### `type_class`
|
37
39
|
|
38
40
|
### [ ] `each` (not implemented yet)
|
39
41
|
|
@@ -43,8 +45,6 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
|
43
45
|
|
44
46
|
### [ ] `each_chunk` (not implemented yet)
|
45
47
|
|
46
|
-
### `tally`
|
47
|
-
|
48
48
|
### `n_nils`, `n_nans`
|
49
49
|
|
50
50
|
- `n_nulls` is an alias of `n_nils`
|
@@ -126,20 +126,23 @@ boolean.all(opts: {skip_nulls: false}) #=> false
|
|
126
126
|
|[ ]`asin` | | [ ] | | | |
|
127
127
|
| ✓ `atan` | | ✓ | | | |
|
128
128
|
| ✓ `bit_wise_not`| | (✓) | | |integer only|
|
129
|
-
|
|
129
|
+
| ✓ `ceil` | | ✓ | | | |
|
130
130
|
| ✓ `cos` | | ✓ | | | |
|
131
|
-
|
|
131
|
+
| ✓`fill_nil_backward`| ✓ | ✓ | ✓ | | |
|
132
|
+
| ✓`fill_nil_forward` | ✓ | ✓ | ✓ | | |
|
133
|
+
| ✓ `floor` | | ✓ | | | |
|
132
134
|
| ✓ `invert` | ✓ | | | |`!`, alias `not`|
|
133
135
|
|[ ]`ln` | | [ ] | | | |
|
134
136
|
|[ ]`log10` | | [ ] | | | |
|
135
137
|
|[ ]`log1p` | | [ ] | | | |
|
136
138
|
|[ ]`log2` | | [ ] | | | |
|
137
|
-
|
|
138
|
-
|
|
139
|
+
| ✓ `round` | | ✓ | | ✓ Round (:mode, :n_digits)| |
|
140
|
+
| ✓ `round_to_multiple`| | ✓ | | ✓ RoundToMultiple :mode, :multiple| multiple must be an Arrow::Scalar|
|
139
141
|
| ✓ `sign` | | ✓ | | | |
|
140
142
|
| ✓ `sin` | | ✓ | | | |
|
143
|
+
| ✓`sort_indexes`| ✓ | ✓ | ✓ |:order|alias `sort_indices`|
|
141
144
|
| ✓ `tan` | | ✓ | | | |
|
142
|
-
|
|
145
|
+
| ✓ `trunc` | | ✓ | | | |
|
143
146
|
|
144
147
|
### Binary element-wise: `vector.func(vector) => vector`
|
145
148
|
|
@@ -180,8 +183,30 @@ boolean.all(opts: {skip_nulls: false}) #=> false
|
|
180
183
|
| ✓ `shift_right` | | (✓) | | |`>>`, integer only|
|
181
184
|
| ✓ `xor` | ✓ | | | | `^` |
|
182
185
|
|
186
|
+
### `uniq`
|
187
|
+
|
188
|
+
Returns a new array with distinct elements.
|
189
|
+
|
183
190
|
(Not implemented functions)
|
184
|
-
|
191
|
+
|
192
|
+
### `tally` and `value_counts`
|
193
|
+
|
194
|
+
Compute counts of unique elements and return a Hash.
|
195
|
+
|
196
|
+
It returns almost the same result as Ruby's tally. These methods consider NaNs to be the same.
|
197
|
+
|
198
|
+
```ruby
|
199
|
+
array = [0.0/0, Float::NAN]
|
200
|
+
array.tally #=> {NaN=>1, NaN=>1}
|
201
|
+
|
202
|
+
vector = RedAmber::Vector.new(array)
|
203
|
+
vector.tally #=> {NaN=>2}
|
204
|
+
vector.value_counts #=> {NaN=>2}
|
205
|
+
```
|
206
|
+
|
207
|
+
### `sort_indexes`, `sort_indices`, `array_sort_indices`
|
208
|
+
|
209
|
+
### [ ] `sort`, `sort_by`
|
185
210
|
### [ ] argmin, argmax
|
186
211
|
### [ ] (array functions)
|
187
212
|
### [ ] (strings functions)
|
@@ -192,4 +217,101 @@ boolean.all(opts: {skip_nulls: false}) #=> false
|
|
192
217
|
|
193
218
|
## Coerce (not implemented)
|
194
219
|
|
195
|
-
##
|
220
|
+
## Update vector's value
|
221
|
+
### `replace_with(booleans, replacements)` => vector
|
222
|
+
|
223
|
+
- Accepts Vector, Array, Arrow::Array for booleans and replacements.
|
224
|
+
- Replacements can accept scalar
|
225
|
+
- Booleans specifies the positions to replace; elements at `true` positions are replaced.
|
226
|
+
- Replacements specifies the values to replace with.
|
227
|
+
- The number of true in booleans must be equal to the length of replacement
|
228
|
+
|
229
|
+
```ruby
|
230
|
+
vector = RedAmber::Vector.new([1, 2, 3])
|
231
|
+
booleans = [true, false, true]
|
232
|
+
replacements = [4, 5]
|
233
|
+
vector.replace_with(booleans, replacements)
|
234
|
+
# =>
|
235
|
+
#<RedAmber::Vector(:uint8, size=3):0x000000000001ee10>
|
236
|
+
[4, 2, 5]
|
237
|
+
```
|
238
|
+
|
239
|
+
- Scalar value in replacements can be broadcasted.
|
240
|
+
|
241
|
+
```ruby
|
242
|
+
replacement = 0
|
243
|
+
vector.replace_with(booleans, replacement)
|
244
|
+
# =>
|
245
|
+
#<RedAmber::Vector(:uint8, size=3):0x000000000001ee10>
|
246
|
+
[0, 2, 0]
|
247
|
+
```
|
248
|
+
|
249
|
+
- Returned data type is automatically up-casted by replacement.
|
250
|
+
|
251
|
+
```ruby
|
252
|
+
replacement = 1.0
|
253
|
+
vector.replace_with(booleans, replacement)
|
254
|
+
# =>
|
255
|
+
#<RedAmber::Vector(:double, size=3):0x0000000000025d78>
|
256
|
+
[1.0, 2.0, 1.0]
|
257
|
+
```
|
258
|
+
|
259
|
+
- Position of nil in booleans is replaced with nil.
|
260
|
+
|
261
|
+
```ruby
|
262
|
+
booleans = [true, false, nil]
|
263
|
+
replacement = -1
|
264
|
+
vec.replace_with(booleans, replacement)
|
265
|
+
# =>
|
266
|
+
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
267
|
+
[-1, 2, nil]
|
268
|
+
```
|
269
|
+
|
270
|
+
- Replacements can have nil in it.
|
271
|
+
|
272
|
+
```ruby
|
273
|
+
booleans = [true, false, true]
|
274
|
+
replacements = [nil]
|
275
|
+
vec.replace_with(booleans, replacements)
|
276
|
+
# =>
|
277
|
+
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
278
|
+
[nil, 2, nil]
|
279
|
+
```
|
280
|
+
|
281
|
+
- If no replacements are specified, it is the same as specifying nil.
|
282
|
+
|
283
|
+
```ruby
|
284
|
+
booleans = [true, false, true]
|
285
|
+
vec.replace_with(booleans)
|
286
|
+
# =>
|
287
|
+
#<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
|
288
|
+
[nil, 2, nil]
|
289
|
+
```
|
290
|
+
|
291
|
+
- An example to replace 'NA' with nil.
|
292
|
+
|
293
|
+
```ruby
|
294
|
+
vector = RedAmber::Vector.new(['A', 'B', 'NA'])
|
295
|
+
vector.replace_with(vector == 'NA', nil)
|
296
|
+
# =>
|
297
|
+
#<RedAmber::Vector(:string, size=3):0x000000000000f8ac>
|
298
|
+
["A", "B", nil]
|
299
|
+
```
|
300
|
+
|
301
|
+
### `fill_nil_forward`, `fill_nil_backward` => vector
|
302
|
+
|
303
|
+
Propagate the last valid observation forward (or backward).
|
304
|
+
Or preserve nil if all previous values are nil or at the end.
|
305
|
+
|
306
|
+
```ruby
|
307
|
+
integer = RedAmber::Vector.new([0, 1, nil, 3, nil])
|
308
|
+
integer.fill_nil_forward
|
309
|
+
# =>
|
310
|
+
#<RedAmber::Vector(:uint8, size=5):0x000000000000f960>
|
311
|
+
[0, 1, 1, 3, 3]
|
312
|
+
|
313
|
+
integer.fill_nil_backward
|
314
|
+
# =>
|
315
|
+
#<RedAmber::Vector(:uint8, size=5):0x000000000000f974>
|
316
|
+
[0, 1, 3, 3, nil]
|
317
|
+
```
|
Binary file
|
data/doc/tdr.md
CHANGED
@@ -36,17 +36,20 @@ The API based on TDR is draft and RedAmber is a small experiment to test the TDR
|
|
36
36
|
| |Basic Table|Transposed DataFrame|Comment for TDR|
|
37
37
|
|-----------|---------|------------|---|
|
38
38
|
|name in TDR|`Table`|`TDR`|**T**ransposed **D**ataFrame **R**epresentation|
|
39
|
-
|variable |located in a column|a key and a `Vector` in lateral|select by
|
40
|
-
|observation|located in a row|
|
41
|
-
|number of
|
42
|
-
|number of
|
43
|
-
|shape |[n_rows, n_columns] |`[size, n_keys]` |same order as Table|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
[
|
39
|
+
|variable |located in a column|a key and a `Vector` in lateral|select by keys|
|
40
|
+
|observation|located in a row|sliced in vertical|select by indices|
|
41
|
+
|number of variables|n_columns etc. |`n_keys` |`n_cols` is available as an alias|
|
42
|
+
|number of observations|n_rows etc. |`size` |`n_rows` is available as an alias|
|
43
|
+
|shape |[n_rows, n_columns] |`shape`=`[size, n_keys]` |same order as Table|
|
44
|
+
|Select variables|select, filter, [ ], etc.|`pick` or `[keys]` |accepts arguments or a block|
|
45
|
+
|Reject variables|drop, etc.|`drop` |accepts arguments or a block|
|
46
|
+
|Select observations|slice, [ ], iloc, etc.|`slice` or `[indices]` |accepts arguments or a block|
|
47
|
+
|Reject observations|drop, etc.|`remove` |accepts arguments or a block|
|
48
|
+
|Add variables|mutate, assign, etc.|`assign` |accepts arguments or a block|
|
49
|
+
|update variables|transmute, [ ]=, etc.|`assign` |accepts arguments or a block|
|
50
|
+
|inner join| inner_join(a,b)<br>merge(a, b, how='inner')|`a.inner_join(b)` |with an option on:|
|
51
|
+
|left join| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |naturally join from bottom<br>with an option on:|
|
52
|
+
|right join| right_join(a,b)<br>merge(a, b, how='right')|`b.join(a)` |naturally join from bottom<br>with an option on:|
|
50
53
|
|
51
54
|
## Q and A for TDR
|
52
55
|
|
data/doc/tdr_ja.md
CHANGED
@@ -37,16 +37,19 @@ TDR に基づいた API はまだ暫定板の段階であり、RedAmber は TDR
|
|
37
37
|
|-----------|---------|------------|---|
|
38
38
|
|TDRでの呼称|`Table`|`TDR`|**T**ransposed **D**ataFrame **R**epresentationの略|
|
39
39
|
|変数 |列に配置|`variables`<br>key と `Vector` として横方向に配置|key で選択|
|
40
|
-
|観測 |行に配置|`observations`<br
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
[
|
40
|
+
|観測 |行に配置|`observations`<br>縦方向に切った一つ一つはslice|index や `slice` メソッドで選択|
|
41
|
+
|変数(列)の数|ncol, n_columns など |`n_keys` |`n_cols` をエイリアスとして設定|
|
42
|
+
|観測(行)の数|nrow, n_rows など |`size` |`n_rows` をエイリアスとして設定|
|
43
|
+
|形状 |[nrow, ncol] |`shape`=`[size, n_keys]` |行, 列の順番は同じ|
|
44
|
+
|変数(列)の選択|select, filter, [ ], など|`pick` or `[keys]` |引数またはブロックで指定|
|
45
|
+
|変数(列)の削除|drop, など|`drop` |引数またはブロックで指定|
|
46
|
+
|観測(行)の選択|slice, [ ], iloc, など|`slice` or `[indices]` |引数またはブロックで指定|
|
47
|
+
|観測(行)の削除|drop, など|`remove` |引数またはブロックで指定|
|
48
|
+
|変数(列)の追加|mutate, assign, など|`assign` |引数またはブロックで指定|
|
49
|
+
|変数(列)の更新|transmute, [ ]=, など|`assign` |引数またはブロックで指定|
|
50
|
+
|内部結合| inner_join(a,b)<br>merge(a, b, how='inner')|`a.inner_join(b)` |オプション on:|
|
51
|
+
|左結合| left_join(a,b)<br>merge(a, b, how='left')|`a.join(b)` |自然に下にくっつける<br>オプション on:|
|
52
|
+
|右結合| right_join(a,b)<br>merge(a, b, how='right')|`b.join(a)` |自然に下にくっつける<br>オプション on:|
|
50
53
|
|
51
54
|
## Q and A for TDR
|
52
55
|
|
data/lib/red_amber/data_frame.rb
CHANGED
@@ -7,20 +7,21 @@ module RedAmber
|
|
7
7
|
# mix-in
|
8
8
|
include DataFrameDisplayable
|
9
9
|
include DataFrameHelper
|
10
|
+
include DataFrameIndexable
|
10
11
|
include DataFrameSelectable
|
11
12
|
include DataFrameObservationOperation
|
12
13
|
include DataFrameVariableOperation
|
13
14
|
|
14
15
|
def initialize(*args)
|
15
|
-
|
16
|
-
# returns empty DataFrame
|
17
|
-
@table = Arrow::Table.new({}, [])
|
16
|
+
@variables = @keys = @vectors = @types = @data_types = nil
|
18
17
|
# bug in gobject-introspection: ruby-gnome/ruby-gnome#1472
|
19
18
|
# [Arrow::Table] == [nil] shows ArgumentError
|
20
19
|
# temporary use yoda condition to workaround
|
21
|
-
|
22
|
-
|
23
|
-
|
20
|
+
if args.empty? || args == [[]] || args == [{}] || [nil] == args
|
21
|
+
# DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
22
|
+
# returns empty DataFrame
|
23
|
+
@table = Arrow::Table.new({}, [])
|
24
|
+
elsif args.size > 1
|
24
25
|
@table = Arrow::Table.new(*args)
|
25
26
|
else
|
26
27
|
arg = args[0]
|
@@ -42,11 +43,14 @@ module RedAmber
|
|
42
43
|
|
43
44
|
attr_reader :table
|
44
45
|
|
46
|
+
def to_arrow
|
47
|
+
table
|
48
|
+
end
|
49
|
+
|
45
50
|
def save(output, options = {})
|
46
51
|
@table.save(output, options)
|
47
52
|
end
|
48
53
|
|
49
|
-
# Properties ===
|
50
54
|
def size
|
51
55
|
@table.n_rows
|
52
56
|
end
|
@@ -63,8 +67,13 @@ module RedAmber
|
|
63
67
|
[size, n_keys]
|
64
68
|
end
|
65
69
|
|
70
|
+
def variables
|
71
|
+
@variables || @variables = init_instance_vars(:variables)
|
72
|
+
end
|
73
|
+
alias_method :vars, :variables
|
74
|
+
|
66
75
|
def keys
|
67
|
-
@
|
76
|
+
@keys || @keys = init_instance_vars(:keys)
|
68
77
|
end
|
69
78
|
alias_method :column_names, :keys
|
70
79
|
alias_method :var_names, :keys
|
@@ -81,21 +90,15 @@ module RedAmber
|
|
81
90
|
alias_method :index, :key_index
|
82
91
|
|
83
92
|
def types
|
84
|
-
@table.columns.map
|
85
|
-
column.data.value_type.nick.to_sym
|
86
|
-
end
|
93
|
+
@types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
|
87
94
|
end
|
88
95
|
|
89
|
-
def
|
90
|
-
@table.columns.map
|
91
|
-
column.data_type.class
|
92
|
-
end
|
96
|
+
def type_classes
|
97
|
+
@data_types || @data_types = @table.columns.map { |column| column.data_type.class }
|
93
98
|
end
|
94
99
|
|
95
100
|
def vectors
|
96
|
-
@
|
97
|
-
Vector.new(column.data)
|
98
|
-
end
|
101
|
+
@vectors || @vectors = init_instance_vars(:vectors)
|
99
102
|
end
|
100
103
|
|
101
104
|
def indexes
|
@@ -104,9 +107,7 @@ module RedAmber
|
|
104
107
|
alias_method :indices, :indexes
|
105
108
|
|
106
109
|
def to_h
|
107
|
-
|
108
|
-
result[column.name.to_sym] = column.entries
|
109
|
-
end
|
110
|
+
variables.transform_values(&:to_a)
|
110
111
|
end
|
111
112
|
|
112
113
|
def to_a
|
@@ -125,13 +126,27 @@ module RedAmber
|
|
125
126
|
end
|
126
127
|
|
127
128
|
def empty?
|
128
|
-
|
129
|
+
variables.empty?
|
129
130
|
end
|
130
131
|
|
131
132
|
def to_rover
|
132
133
|
Rover::DataFrame.new(to_h)
|
133
134
|
end
|
134
135
|
|
135
|
-
|
136
|
+
private
|
137
|
+
|
138
|
+
# initialize @variables, @keys, @vectors and return one of them
|
139
|
+
def init_instance_vars(var)
|
140
|
+
ary = @table.columns.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
|
141
|
+
v = Vector.new(column.data)
|
142
|
+
k = column.name.to_sym
|
143
|
+
v.key = k
|
144
|
+
variables[k] = v
|
145
|
+
keys << k
|
146
|
+
vectors << v
|
147
|
+
end
|
148
|
+
@variables, @keys, @vectors = ary
|
149
|
+
ary[%i[variables keys vectors].index(var)]
|
150
|
+
end
|
136
151
|
end
|
137
152
|
end
|
@@ -73,7 +73,7 @@ module RedAmber
|
|
73
73
|
[shorthand(vector, size, max_element)].concat na_string(vector)
|
74
74
|
end
|
75
75
|
else
|
76
|
-
shorthand(vector, size, max_element)
|
76
|
+
[shorthand(vector, size, max_element)]
|
77
77
|
end
|
78
78
|
sio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
|
79
79
|
end
|
@@ -111,9 +111,10 @@ module RedAmber
|
|
111
111
|
end
|
112
112
|
|
113
113
|
def shorthand(vector, size, max_element)
|
114
|
-
|
114
|
+
max = vector.temporal? ? 2 : max_element
|
115
|
+
a = vector.to_a.take(max)
|
115
116
|
a.map! { |e| e.nil? ? 'nil' : e.inspect }
|
116
|
-
a << '... ' if size >
|
117
|
+
a << '... ' if size > max
|
117
118
|
"[#{a.join(', ')}]"
|
118
119
|
end
|
119
120
|
|
@@ -6,9 +6,16 @@ module RedAmber
|
|
6
6
|
private
|
7
7
|
|
8
8
|
def expand_range(args)
|
9
|
-
args.each_with_object([]) do |e, a|
|
9
|
+
ary = args.each_with_object([]) do |e, a|
|
10
10
|
e.is_a?(Range) ? a.concat(normalized_array(e)) : a.append(e)
|
11
11
|
end
|
12
|
+
ary.map do |e|
|
13
|
+
if e.is_a?(Integer) && e.negative?
|
14
|
+
e + size
|
15
|
+
else
|
16
|
+
e
|
17
|
+
end
|
18
|
+
end
|
12
19
|
end
|
13
20
|
|
14
21
|
def normalized_array(range)
|
@@ -50,13 +57,6 @@ module RedAmber
|
|
50
57
|
DataFrame.new(@table.filter(array))
|
51
58
|
end
|
52
59
|
|
53
|
-
def select_obs_by_indeces(indeces)
|
54
|
-
out_of_range?(indeces) && raise(DataFrameArgumentError, "Invalid index: #{indeces} for 0..#{size - 1}")
|
55
|
-
|
56
|
-
a = indeces.map { |i| @table.slice(i).to_a }
|
57
|
-
DataFrame.new(@table.schema, a)
|
58
|
-
end
|
59
|
-
|
60
60
|
def keys_by_booleans(booleans)
|
61
61
|
keys.select.with_index { |_, i| booleans[i] }
|
62
62
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module RedAmber
|
4
|
+
# mix-ins for the class DataFrame
|
5
|
+
module DataFrameIndexable
|
6
|
+
# Common method
|
7
|
+
def map_indices(*indices)
|
8
|
+
return self if indices.empty?
|
9
|
+
|
10
|
+
indices = indices[0].data if indices[0].is_a?(Vector)
|
11
|
+
|
12
|
+
new_dataframe_by(indices)
|
13
|
+
end
|
14
|
+
|
15
|
+
# @param sort_keys [Arrow::SortKey]
|
16
|
+
# :key, "key" or "+key" denotes ascending,
|
17
|
+
# "-key" denotes descending order
|
18
|
+
# @return [RedAmber::Vector] Sorted indices in Vector
|
19
|
+
def sort_indices(*sort_keys)
|
20
|
+
indices = @table.sort_indices(sort_keys.flatten)
|
21
|
+
Vector.new(indices)
|
22
|
+
end
|
23
|
+
|
24
|
+
# @return [RedAmber::DataFrame] Sorted DataFrame
|
25
|
+
def sort(*sort_keys)
|
26
|
+
indices = @table.sort_indices(sort_keys.flatten)
|
27
|
+
|
28
|
+
new_dataframe_by(indices)
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def new_dataframe_by(index_array)
|
34
|
+
t = Arrow::Function.find(:take).execute([@table, index_array]).value
|
35
|
+
RedAmber::DataFrame.new(t)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -25,7 +25,7 @@ module RedAmber
|
|
25
25
|
|
26
26
|
# filter with indexes
|
27
27
|
slicer = expand_range(slicer)
|
28
|
-
return
|
28
|
+
return map_indices(*slicer) if integers?(slicer)
|
29
29
|
|
30
30
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
31
31
|
end
|
@@ -57,11 +57,22 @@ module RedAmber
|
|
57
57
|
# filter with indexes
|
58
58
|
slicer = indexes.to_a - expand_range(remover)
|
59
59
|
return remove_all_values if slicer.empty?
|
60
|
-
return
|
60
|
+
return map_indices(*slicer) if integers?(slicer)
|
61
61
|
|
62
62
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
63
63
|
end
|
64
64
|
|
65
|
+
def remove_nil
|
66
|
+
func = Arrow::Function.find(:drop_null)
|
67
|
+
DataFrame.new(func.execute([table]).value)
|
68
|
+
end
|
69
|
+
alias_method :drop_nil, :remove_nil
|
70
|
+
|
71
|
+
def group(aggregating_keys, func, target_keys)
|
72
|
+
t = table.group(*aggregating_keys)
|
73
|
+
RedAmber::DataFrame.new(t.send(func, *target_keys))
|
74
|
+
end
|
75
|
+
|
65
76
|
private
|
66
77
|
|
67
78
|
# return a DataFrame with same keys as self without values
|
@@ -27,12 +27,22 @@ module RedAmber
|
|
27
27
|
|
28
28
|
# expand Range like [1..3, 4] to [1, 2, 3, 4]
|
29
29
|
expanded = expand_range(args)
|
30
|
-
return
|
30
|
+
return map_indices(*expanded) if integers?(expanded)
|
31
31
|
return select_vars_by_keys(expanded.map(&:to_sym)) if sym_or_str?(expanded)
|
32
32
|
|
33
33
|
raise DataFrameArgumentError, "Invalid argument #{args}"
|
34
34
|
end
|
35
35
|
|
36
|
+
# Select a variable by a key in String or Symbol
|
37
|
+
def v(key)
|
38
|
+
unless key.is_a?(Symbol) || key.is_a?(String)
|
39
|
+
raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]"
|
40
|
+
end
|
41
|
+
raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key)
|
42
|
+
|
43
|
+
variables[key.to_sym]
|
44
|
+
end
|
45
|
+
|
36
46
|
def head(n_rows = 5)
|
37
47
|
raise DataFrameArgumentError, "Index is out of range #{n_rows}" if n_rows.negative?
|
38
48
|
|
@@ -57,10 +67,10 @@ module RedAmber
|
|
57
67
|
|
58
68
|
def select_vars_by_keys(keys)
|
59
69
|
if keys.one?
|
60
|
-
|
61
|
-
raise DataFrameArgumentError, "Key does not exist #{keys}" unless
|
70
|
+
key = keys[0].to_sym
|
71
|
+
raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key
|
62
72
|
|
63
|
-
|
73
|
+
variables[key]
|
64
74
|
else
|
65
75
|
DataFrame.new(@table[keys])
|
66
76
|
end
|
data/lib/red_amber/vector.rb
CHANGED
@@ -5,10 +5,12 @@ module RedAmber
|
|
5
5
|
# @data : holds Arrow::ChunkedArray
|
6
6
|
class Vector
|
7
7
|
# mix-in
|
8
|
+
include VectorCompensable
|
8
9
|
include VectorFunctions
|
9
10
|
|
10
11
|
# chunked_array may come from column.data
|
11
12
|
def initialize(array)
|
13
|
+
@key = nil # default is 'headless'
|
12
14
|
case array
|
13
15
|
when Vector
|
14
16
|
@data = array.data
|
@@ -17,11 +19,12 @@ module RedAmber
|
|
17
19
|
when Array
|
18
20
|
@data = Arrow::Array.new(array)
|
19
21
|
else
|
20
|
-
raise
|
22
|
+
raise VectorArgumentError, 'Unknown array in argument'
|
21
23
|
end
|
22
24
|
end
|
23
25
|
|
24
26
|
attr_reader :data
|
27
|
+
attr_accessor :key
|
25
28
|
|
26
29
|
def to_s
|
27
30
|
@data.to_a.inspect
|
@@ -66,15 +69,19 @@ module RedAmber
|
|
66
69
|
end
|
67
70
|
|
68
71
|
def numeric?
|
69
|
-
|
72
|
+
type_class < Arrow::NumericDataType
|
70
73
|
end
|
71
74
|
|
72
75
|
def string?
|
73
76
|
type == :string
|
74
77
|
end
|
75
78
|
|
76
|
-
def
|
77
|
-
|
79
|
+
def temporal?
|
80
|
+
type_class < Arrow::TemporalDataType
|
81
|
+
end
|
82
|
+
|
83
|
+
def type_class
|
84
|
+
@data.value_data_type.class
|
78
85
|
end
|
79
86
|
|
80
87
|
# def each() end
|
@@ -90,7 +97,23 @@ module RedAmber
|
|
90
97
|
# def each_chunk() end
|
91
98
|
|
92
99
|
def tally
|
93
|
-
values.tally
|
100
|
+
hash = values.tally
|
101
|
+
if (type_class < Arrow::FloatingPointDataType) && is_nan.any
|
102
|
+
a = 0
|
103
|
+
hash.each do |key, value|
|
104
|
+
if key.is_a?(Float) && key.nan?
|
105
|
+
hash.delete(key)
|
106
|
+
a += value
|
107
|
+
end
|
108
|
+
end
|
109
|
+
hash[Float::NAN] = a
|
110
|
+
end
|
111
|
+
hash
|
112
|
+
end
|
113
|
+
|
114
|
+
def value_counts
|
115
|
+
values, counts = Arrow::Function.find(:value_counts).execute([data]).value.fields
|
116
|
+
values.zip(counts).to_h
|
94
117
|
end
|
95
118
|
|
96
119
|
def n_nulls
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
|
4
|
+
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
5
|
+
|
6
|
+
module RedAmber
|
7
|
+
# mix-ins for class Vector
|
8
|
+
# Functions to make up some data (especially missing) for new data.
|
9
|
+
module VectorCompensable
|
10
|
+
# [Ternary]: replace_with(booleans, replacements) => vector
|
11
|
+
# Replace items selected with a boolean mask
|
12
|
+
#
|
13
|
+
# (from Arrow C++ inline doc.)
|
14
|
+
# Given an array and a boolean mask (either scalar or of equal length),
|
15
|
+
# along with replacement values (either scalar or array),
|
16
|
+
# each element of the array for which the corresponding mask element is
|
17
|
+
# true will be replaced by the next value from the replacements,
|
18
|
+
# or with null if the mask is null.
|
19
|
+
# Hence, for replacement arrays, len(replacements) == sum(mask == true).
|
20
|
+
|
21
|
+
def replace_with(booleans, replacements = nil)
|
22
|
+
specifier =
|
23
|
+
if booleans.is_a?(Arrow::BooleanArray)
|
24
|
+
booleans
|
25
|
+
elsif booleans.is_a?(Vector) && booleans.boolean?
|
26
|
+
booleans.data
|
27
|
+
elsif booleans.is_a?(Array) && booleans?(booleans)
|
28
|
+
Arrow::BooleanArray.new(booleans)
|
29
|
+
else
|
30
|
+
raise VectorTypeError, 'Not a valid type'
|
31
|
+
end
|
32
|
+
raise VectorArgumentError, 'Booleans size unmatch' if specifier.length != size
|
33
|
+
raise VectorArgumentError, 'Booleans not have any `true`' unless specifier.any?
|
34
|
+
|
35
|
+
r = Array(replacements) # scalar to [scalar]
|
36
|
+
r = [nil] if r.empty?
|
37
|
+
|
38
|
+
replacer =
|
39
|
+
if r.size == 1
|
40
|
+
case replacements
|
41
|
+
when Arrow::Array then replacements
|
42
|
+
when Vector then replacements.data
|
43
|
+
else
|
44
|
+
Arrow::Array.new(r * specifier.to_a.count(true)) # broadcast
|
45
|
+
end
|
46
|
+
else
|
47
|
+
Arrow::Array.new(r)
|
48
|
+
end
|
49
|
+
replacer = data.class.new(replacer) if replacer.uniq == [nil]
|
50
|
+
|
51
|
+
raise VectorArgumentError, 'Replacements size unmatch' if Array(specifier).count(true) != replacer.length
|
52
|
+
|
53
|
+
values = replacer.class.new(data)
|
54
|
+
|
55
|
+
datum = find('replace_with_mask').execute([values, specifier, replacer])
|
56
|
+
take_out_element_wise(datum)
|
57
|
+
end
|
58
|
+
|
59
|
+
# (related functions)
|
60
|
+
# fill_null_backward, fill_null_forward
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
def booleans?(enum)
|
65
|
+
enum.all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|