red_amber 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +12 -6
- data/CHANGELOG.md +39 -2
- data/README.md +156 -96
- data/lib/red_amber/data_frame.rb +11 -0
- data/lib/red_amber/data_frame_output.rb +60 -52
- data/lib/red_amber/data_frame_selectable.rb +4 -1
- data/lib/red_amber/vector.rb +31 -2
- data/lib/red_amber/vector_functions.rb +109 -58
- data/lib/red_amber/version.rb +1 -1
- data/red_amber.gemspec +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '0308ff686bf7b49b767b7cd28ddc068e02170c00c093dcd42c7187e438e0adf3'
|
4
|
+
data.tar.gz: 98397e31bce1a440e951357d5d3b475814a6ecc08f21a0908c0fdf58c6189be4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ad71d8259d04535d08567bde6ca0fc419e0d9de15d1e812dbc642fb3901f1c744c69766dbf409e876212e426e309ac0032968b767df3a960a8e6eb40d4f3c19
|
7
|
+
data.tar.gz: eee78ae4316b007d95714d6e2920ad32518497942d9cd5adb373476321e6f9e6e8099f9c721ee8bac05df2617fb3f3c747ce92ec74b1cb84da0b0bd4664051cf
|
data/.rubocop.yml
CHANGED
@@ -55,14 +55,13 @@ Layout/LineLength:
|
|
55
55
|
Metrics/AbcSize:
|
56
56
|
Max: 23
|
57
57
|
Exclude:
|
58
|
-
- 'lib/red_amber/data_frame_output.rb' # Max:
|
58
|
+
- 'lib/red_amber/data_frame_output.rb' # Max: 51
|
59
59
|
|
60
60
|
# Max: 25
|
61
61
|
Metrics/BlockLength:
|
62
62
|
Max: 25
|
63
63
|
Exclude:
|
64
64
|
- 'test/**/*'
|
65
|
-
- '*.gemspec'
|
66
65
|
|
67
66
|
# Max: 100
|
68
67
|
Metrics/ClassLength:
|
@@ -73,20 +72,27 @@ Metrics/ClassLength:
|
|
73
72
|
# Max: 7
|
74
73
|
Metrics/CyclomaticComplexity:
|
75
74
|
Max: 10
|
76
|
-
Exclude:
|
77
|
-
- 'lib/red_amber/data_frame_output.rb' # Max: 11
|
78
75
|
|
79
76
|
# Max: 10
|
80
77
|
Metrics/MethodLength:
|
81
78
|
Max: 18
|
82
79
|
Exclude:
|
83
|
-
- 'lib/red_amber/data_frame_output.rb' # Max:
|
80
|
+
- 'lib/red_amber/data_frame_output.rb' # Max: 31
|
81
|
+
|
82
|
+
# Max: 100
|
83
|
+
Metrics/ModuleLength:
|
84
|
+
Max: 100
|
85
|
+
Exclude:
|
86
|
+
- 'lib/red_amber/vector_functions.rb' # Max: 114
|
84
87
|
|
85
88
|
# Max: 8
|
86
89
|
Metrics/PerceivedComplexity:
|
87
90
|
Max: 11
|
91
|
+
|
92
|
+
# Necessary to define is_na
|
93
|
+
Naming/PredicateName:
|
88
94
|
Exclude:
|
89
|
-
- 'lib/red_amber/
|
95
|
+
- 'lib/red_amber/vector_functions.rb'
|
90
96
|
|
91
97
|
# Necessary to test when range.end == -1
|
92
98
|
Style/SlicingWithRange:
|
data/CHANGELOG.md
CHANGED
@@ -1,12 +1,49 @@
|
|
1
|
-
## [0.1.
|
1
|
+
## [0.1.4] - Unreleased
|
2
|
+
|
3
|
+
- Prepare documents for the 'Transposed DataFrame Representation'
|
4
|
+
- Feedback to Red Arrow
|
5
|
+
- Separate documents
|
2
6
|
|
3
7
|
- `DataFrame`
|
4
8
|
- Introduce updating capabilities
|
5
9
|
- Introduce NA support
|
6
10
|
- Add slice method
|
11
|
+
|
7
12
|
- `Vector`
|
8
13
|
- Add NaN support for functions
|
9
|
-
-
|
14
|
+
- Support more functions
|
15
|
+
|
16
|
+
## [0.1.3] - 2022-05-15 (experimental)
|
17
|
+
|
18
|
+
- Bug fixes
|
19
|
+
- Fix boolean functions in `Vector` to align with Ruby's behavior
|
20
|
+
- `&` == `and_kleene`
|
21
|
+
- `|` == `or_kleene`
|
22
|
+
- Quote strings of data-preview in `DataFrame#inspect`
|
23
|
+
- Quote empty and blank keys in `DataFrame#inspect`
|
24
|
+
- Respond to error for a wrong key in `DataFrame#[]`
|
25
|
+
|
26
|
+
- New features and improvements
|
27
|
+
- `DataFrame`
|
28
|
+
- Display nil elements in `inspect`
|
29
|
+
- Show NaN and nil counts in `inspect`
|
30
|
+
- Refactor `inspect`
|
31
|
+
- Add method `key` and `key_index`
|
32
|
+
- Add how to load/save Parquet to README
|
33
|
+
|
34
|
+
- `Vector`
|
35
|
+
- Add categorization functions
|
36
|
+
|
37
|
+
This is an important step to support `slice` method and NA treatment features.
|
38
|
+
- `is_finite`
|
39
|
+
- `is_inf`
|
40
|
+
- `is_na` (RedAmber original)
|
41
|
+
- `is_nan`
|
42
|
+
- `is_nil`, `is_null`
|
43
|
+
- `is_valid`
|
44
|
+
- Show in a reduced representation for long array in `inspect`
|
45
|
+
- Support options in aggregation functions
|
46
|
+
- Return values in non-arrow object for scalar aggregation functions
|
10
47
|
|
11
48
|
## [0.1.2] - 2022-05-08 (experimental)
|
12
49
|
|
data/README.md
CHANGED
@@ -45,7 +45,7 @@ Or install it yourself as:
|
|
45
45
|
- [x] `new` from a Rover::DataFrame
|
46
46
|
- `RedAmber::DataFrame.new(Rover::DataFrame.new(x: [1, 2, 3]))`
|
47
47
|
|
48
|
-
- [
|
48
|
+
- [x] `load` (class method)
|
49
49
|
|
50
50
|
- [x] from a [`.arrow`, `.arrows`, `.csv`, `.csv.gz`, `.tsv`] file
|
51
51
|
- `RedAmber::DataFrame.load("test/entity/with_header.csv")`
|
@@ -55,9 +55,16 @@ Or install it yourself as:
|
|
55
55
|
- [x] from a URI
|
56
56
|
- `RedAmber::DataFrame.load(URI("https://github.com/heronshoes/red_amber/blob/master/test/entity/with_header.csv"))`
|
57
57
|
|
58
|
-
- [
|
58
|
+
- [x] from a Parquet file
|
59
59
|
|
60
|
-
-
|
60
|
+
`red-parquet` gem is required.
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
require 'parquet'
|
64
|
+
dataframe = RedAmber::DataFrame.load("file.parquet")
|
65
|
+
```
|
66
|
+
|
67
|
+
- [x] `save` (instance method)
|
61
68
|
|
62
69
|
- [x] to a [`.arrow`, `.arrows`, `.csv`, `.csv.gz`, `.tsv`] file
|
63
70
|
|
@@ -65,7 +72,14 @@ Or install it yourself as:
|
|
65
72
|
|
66
73
|
- [x] to a URI
|
67
74
|
|
68
|
-
- [
|
75
|
+
- [x] to a Parquet file
|
76
|
+
|
77
|
+
`red-parquet` gem is required.
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
require 'parquet'
|
81
|
+
dataframe.save("file.parquet")
|
82
|
+
```
|
69
83
|
|
70
84
|
### Properties
|
71
85
|
|
@@ -129,18 +143,26 @@ Or install it yourself as:
|
|
129
143
|
|
130
144
|
- [x] `inspect(tally_level: 5, max_element: 5)`
|
131
145
|
|
132
|
-
Shows some information about self.
|
146
|
+
Shows some information about self in a transposed style.
|
133
147
|
|
134
148
|
```ruby
|
135
|
-
|
136
|
-
|
149
|
+
require 'red_amber'
|
150
|
+
require 'datasets-arrow'
|
151
|
+
|
152
|
+
penguins = Datasets::Penguins.new.to_arrow
|
153
|
+
RedAmber::DataFrame.new(penguins)
|
137
154
|
# =>
|
138
|
-
RedAmber::DataFrame :
|
139
|
-
|
140
|
-
# key
|
141
|
-
1 :
|
142
|
-
2 :
|
143
|
-
3 :
|
155
|
+
RedAmber::DataFrame : 344 x 8 Vectors
|
156
|
+
Vectors : 5 numeric, 3 strings
|
157
|
+
# key type level data_preview
|
158
|
+
1 :species string 3 {"Adelie"=>152, "Chinstrap"=>68, "Gentoo"=>124}
|
159
|
+
2 :island string 3 {"Torgersen"=>52, "Biscoe"=>168, "Dream"=>124}
|
160
|
+
3 :bill_length_mm double 165 [39.1, 39.5, 40.3, nil, 36.7, ... ], 2 nils
|
161
|
+
4 :bill_depth_mm double 81 [18.7, 17.4, 18.0, nil, 19.3, ... ], 2 nils
|
162
|
+
5 :flipper_length_mm uint8 56 [181, 186, 195, nil, 193, ... ], 2 nils
|
163
|
+
6 :body_mass_g uint16 95 [3750, 3800, 3250, nil, 3450, ... ], 2 nils
|
164
|
+
7 :sex string 3 {"male"=>168, "female"=>165, nil=>11}
|
165
|
+
8 :year uint16 3 {2007=>110, 2008=>114, 2009=>120}
|
144
166
|
```
|
145
167
|
|
146
168
|
- tally_level: max level to use tally mode
|
@@ -151,19 +173,20 @@ Variables : 2 numeric, 1 string
|
|
151
173
|
- [x] Select columns by `[]` as `[key]`, `[keys]`, `[keys[index]]`
|
152
174
|
- Key in a Symbol: `df[:symbol]`
|
153
175
|
- Key in a String: `df["string"]`
|
154
|
-
- Keys in an Array: `df[:symbol1
|
176
|
+
- Keys in an Array: `df[:symbol1, "string", :symbol2]`
|
155
177
|
- Keys in indices: `df[df.keys[0]]`, `df[df.keys[1,2]]`, `df[df.keys[1..]]`
|
156
178
|
- Keys in a Range:
|
157
179
|
An endless Range can be used to represent keys.
|
180
|
+
|
158
181
|
```ruby
|
159
182
|
hash = {a: [1, 2, 3], b: %w[A B C], c: [1.0, 2, 3]}
|
160
183
|
df = RedAmber::DataFrame.new(hash)
|
161
184
|
df[:b..:c, "a"]
|
162
185
|
# =>
|
163
|
-
RedAmber::DataFrame : 3
|
164
|
-
|
186
|
+
RedAmber::DataFrame : 3 x 3 Vectors
|
187
|
+
Vectors : 2 numeric, 1 string
|
165
188
|
# key type level data_preview
|
166
|
-
1 :b string 3 [A, B, C]
|
189
|
+
1 :b string 3 ["A", "B", "C"]
|
167
190
|
2 :c double 3 [1.0, 2.0, 3.0]
|
168
191
|
3 :a uint8 3 [1, 2, 3]
|
169
192
|
```
|
@@ -258,90 +281,127 @@ Variables : 2 numeric, 1 string
|
|
258
281
|
|
259
282
|
- [x] `tally`
|
260
283
|
|
261
|
-
- [
|
284
|
+
- [x] `n_nils`, `n_nans`
|
285
|
+
|
286
|
+
- `n_nulls` is an alias of `n_nils`
|
287
|
+
|
288
|
+
- [x] `inspect(limit: 80)`
|
289
|
+
|
290
|
+
- `limit` sets size limit to display long array.
|
262
291
|
|
263
292
|
### Functions
|
264
|
-
#### Unary aggregations: vector.func =>
|
265
|
-
|
266
|
-
| Method |Boolean|Numeric|String|Remarks|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|[
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|[
|
278
|
-
|[ ]
|
279
|
-
|
|
280
|
-
|[
|
281
|
-
|[ ]
|
282
|
-
|
|
283
|
-
|[
|
284
|
-
|[ ]
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|[
|
319
|
-
|[
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|[
|
323
|
-
|
|
324
|
-
|[
|
325
|
-
|
|
326
|
-
|[
|
327
|
-
|[
|
328
|
-
|[
|
329
|
-
|[ ]
|
330
|
-
|[ ]
|
331
|
-
|[
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|[
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
|
293
|
+
#### Unary aggregations: vector.func => scalar
|
294
|
+
|
295
|
+
| Method |Boolean|Numeric|String|Options|Remarks|
|
296
|
+
| ----------- | --- | --- | --- | --- | --- |
|
297
|
+
| ✓ `all` | ✓ | | | ✓ ScalarAggregate| |
|
298
|
+
| ✓ `any` | ✓ | | | ✓ ScalarAggregate| |
|
299
|
+
| ✓ `approximate_median`| |✓| | ✓ ScalarAggregate| alias `median`|
|
300
|
+
| ✓ `count` | ✓ | ✓ | ✓ | ✓ Count | |
|
301
|
+
| ✓ `count_distinct`| ✓ | ✓ | ✓ | ✓ Count |alias `count_uniq`|
|
302
|
+
|[ ]`index` | [ ] | [ ] | [ ] |[ ] Index | |
|
303
|
+
| ✓ `max` | ✓ | ✓ | ✓ | ✓ ScalarAggregate| |
|
304
|
+
| ✓ `mean` | ✓ | ✓ | | ✓ ScalarAggregate| |
|
305
|
+
| ✓ `min` | ✓ | ✓ | ✓ | ✓ ScalarAggregate| |
|
306
|
+
|[ ]`min_max` | [ ] | [ ] | [ ] |[ ] ScalarAggregate| |
|
307
|
+
|[ ]`mode` | | [ ] | |[ ] Mode | |
|
308
|
+
| ✓ `product` | ✓ | ✓ | | ✓ ScalarAggregate| |
|
309
|
+
|[ ]`quantile`| | [ ] | |[ ] Quantile| |
|
310
|
+
|[ ]`stddev` | | ✓ | |[ ] Variance| |
|
311
|
+
| ✓ `sum` | ✓ | ✓ | | ✓ ScalarAggregate| |
|
312
|
+
|[ ]`tdigest` | | [ ] | |[ ] TDigest | |
|
313
|
+
|[ ]`variance`| | ✓ | |[ ] Variance| |
|
314
|
+
|
315
|
+
|
316
|
+
Options can be used as follows.
|
317
|
+
See the [document of C++ function](https://arrow.apache.org/docs/cpp/compute.html) for detail.
|
318
|
+
|
319
|
+
```ruby
|
320
|
+
double = RedAmber::Vector.new([1, 0/0.0, -1/0.0, 1/0.0, nil, ""])
|
321
|
+
#=>
|
322
|
+
#<RedAmber::Vector(:double, size=6):0x000000000000f910>
|
323
|
+
[1.0, NaN, -Infinity, Infinity, nil, 0.0]
|
324
|
+
|
325
|
+
double.count #=> 5
|
326
|
+
double.count(opts: {mode: :only_valid}) #=> 5, default
|
327
|
+
double.count(opts: {mode: :only_null}) #=> 1
|
328
|
+
double.count(opts: {mode: :all}) #=> 6
|
329
|
+
|
330
|
+
boolean = RedAmber::Vector.new([true, true, nil])
|
331
|
+
#=>
|
332
|
+
#<RedAmber::Vector(:boolean, size=3):0x000000000000f924>
|
333
|
+
[true, true, nil]
|
334
|
+
|
335
|
+
boolean.all #=> true
|
336
|
+
boolean.all(opts: {skip_nulls: true}) #=> true
|
337
|
+
boolean.all(opts: {skip_nulls: false}) #=> false
|
338
|
+
```
|
339
|
+
|
340
|
+
#### Unary element-wise: vector.func => vector
|
341
|
+
|
342
|
+
| Method |Boolean|Numeric|String|Options|Remarks|
|
343
|
+
| ------------ | --- | --- | --- | --- | ----- |
|
344
|
+
| ✓ `-@` | | ✓ | | |as `-vector`|
|
345
|
+
| ✓ `negate` | | ✓ | | |`-@` |
|
346
|
+
| ✓ `abs` | | ✓ | | | |
|
347
|
+
|[ ]`acos` | | [ ] | | | |
|
348
|
+
|[ ]`asin` | | [ ] | | | |
|
349
|
+
| ✓ `atan` | | ✓ | | | |
|
350
|
+
| ✓ `bit_wise_not`| | (✓) | | |integer only|
|
351
|
+
|[ ]`ceil` | | ✓ | | | |
|
352
|
+
| ✓ `cos` | | ✓ | | | |
|
353
|
+
|[ ]`floor` | | ✓ | | | |
|
354
|
+
| ✓ `invert` | ✓ | | | |`!`, alias `not`|
|
355
|
+
|[ ]`ln` | | [ ] | | | |
|
356
|
+
|[ ]`log10` | | [ ] | | | |
|
357
|
+
|[ ]`log1p` | | [ ] | | | |
|
358
|
+
|[ ]`log2` | | [ ] | | | |
|
359
|
+
|[ ]`round` | | [ ] | |[ ] Round| |
|
360
|
+
|[ ]`round_to_multiple`| | [ ] | |[ ] RoundToMultiple| |
|
361
|
+
| ✓ `sign` | | ✓ | | | |
|
362
|
+
| ✓ `sin` | | ✓ | | | |
|
363
|
+
| ✓ `tan` | | ✓ | | | |
|
364
|
+
|[ ]`trunc` | | ✓ | | | |
|
365
|
+
|
366
|
+
#### Binary element-wise: vector.func(vector) => vector
|
367
|
+
|
368
|
+
| Method |Boolean|Numeric|String|Options|Remarks|
|
369
|
+
| ----------------- | --- | --- | --- | --- | ----- |
|
370
|
+
| ✓ `add` | | ✓ | | | `+` |
|
371
|
+
| ✓ `atan2` | | ✓ | | | |
|
372
|
+
| ✓ `and_kleene` | ✓ | | | | `&` |
|
373
|
+
| ✓ `and_org ` | ✓ | | | |`and` in Red Arrow|
|
374
|
+
| ✓ `and_not` | ✓ | | | | |
|
375
|
+
| ✓ `and_not_kleene`| ✓ | | | | |
|
376
|
+
| ✓ `bit_wise_and` | | (✓) | | |integer only|
|
377
|
+
| ✓ `bit_wise_or` | | (✓) | | |integer only|
|
378
|
+
| ✓ `bit_wise_xor` | | (✓) | | |integer only|
|
379
|
+
| ✓ `divide` | | ✓ | | | `/` |
|
380
|
+
| ✓ `equal` | ✓ | ✓ | ✓ | |`==`, alias `eq`|
|
381
|
+
| ✓ `greater` | ✓ | ✓ | ✓ | |`>`, alias `gt`|
|
382
|
+
| ✓ `greater_equal` | ✓ | ✓ | ✓ | |`>=`, alias `ge`|
|
383
|
+
| ✓ `is_finite` | | ✓ | | | |
|
384
|
+
| ✓ `is_inf` | | ✓ | | | |
|
385
|
+
| ✓ `is_na` | ✓ | ✓ | ✓ | | |
|
386
|
+
| ✓ `is_nan` | | ✓ | | | |
|
387
|
+
|[ ]`is_nil` | ✓ | ✓ | ✓ |[ ] Null|alias `is_null`|
|
388
|
+
| ✓ `is_valid` | ✓ | ✓ | ✓ | | |
|
389
|
+
| ✓ `less` | ✓ | ✓ | ✓ | |`<`, alias `lt`|
|
390
|
+
| ✓ `less_equal` | ✓ | ✓ | ✓ | |`<=`, alias `le`|
|
391
|
+
|[ ]`logb` | | [ ] | | | |
|
392
|
+
|[ ]`mod` | | [ ] | | | `%` |
|
393
|
+
| ✓ `multiply` | | ✓ | | | `*` |
|
394
|
+
| ✓ `not_equal` | ✓ | ✓ | ✓ | |`!=`, alias `ne`|
|
395
|
+
| ✓ `or_kleene` | ✓ | | | | `\|` |
|
396
|
+
| ✓ `or_org` | ✓ | | | |`or` in Red Arrow|
|
397
|
+
| ✓ `power` | | ✓ | | | `**` |
|
398
|
+
| ✓ `subtract` | | ✓ | | | `-` |
|
399
|
+
| ✓ `shift_left` | | (✓) | | |`<<`, integer only|
|
400
|
+
| ✓ `shift_right` | | (✓) | | |`>>`, integer only|
|
401
|
+
| ✓ `xor` | ✓ | | | | `^` |
|
340
402
|
|
341
403
|
##### (Not implemented)
|
342
|
-
- [ ] invert, round, round_to_multiple
|
343
404
|
- [ ] sort, sort_index
|
344
|
-
- [ ] minmax, var, median, quantile
|
345
405
|
- [ ] argmin, argmax
|
346
406
|
- [ ] (array functions)
|
347
407
|
- [ ] (strings functions)
|
data/lib/red_amber/data_frame.rb
CHANGED
@@ -67,6 +67,17 @@ module RedAmber
|
|
67
67
|
alias_method :keys, :column_names
|
68
68
|
alias_method :header, :column_names
|
69
69
|
|
70
|
+
def key?(key)
|
71
|
+
column_names.include?(key.to_sym)
|
72
|
+
end
|
73
|
+
alias_method :has_key?, :key?
|
74
|
+
|
75
|
+
def key_index(key)
|
76
|
+
column_names.find_index(key.to_sym)
|
77
|
+
end
|
78
|
+
alias_method :find_index, :key_index
|
79
|
+
alias_method :index, :key_index
|
80
|
+
|
70
81
|
def types
|
71
82
|
@table.columns.map do |column|
|
72
83
|
column.data_type.to_s.to_sym
|
@@ -19,61 +19,51 @@ module RedAmber
|
|
19
19
|
|
20
20
|
# - tally_level: max level to use tally mode
|
21
21
|
# - max_element: max element to show values in each row
|
22
|
-
# TODO: Is it better to change name other than `inspect` ?
|
23
|
-
# TODO:
|
24
|
-
# TODO:
|
25
|
-
# TODO: Refactor code to smaller methods
|
22
|
+
# - TODO: Is it better to change name other than `inspect` ?
|
23
|
+
# - TODO: Fall back to inspect_raw when treating large dataset
|
24
|
+
# - TODO: Refactor code to smaller methods
|
26
25
|
def inspect(tally_level: 5, max_element: 5)
|
27
26
|
return '#<RedAmber::DataFrame (empty)>' if empty?
|
28
27
|
|
29
28
|
stringio = StringIO.new # output string buffer
|
30
29
|
|
30
|
+
tallys = vectors.map(&:tally)
|
31
|
+
levels = tallys.map(&:size)
|
32
|
+
type_groups = @table.columns.map { |column| type_group(column.data_type) }
|
33
|
+
quoted_keys = keys.map(&:inspect)
|
34
|
+
headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' }
|
35
|
+
header_format = make_header_format(levels, headers, quoted_keys)
|
36
|
+
|
31
37
|
# 1st row: show shape of the dataframe
|
32
|
-
|
33
|
-
c = pl(ncol)
|
38
|
+
vs = "Vector#{pl(ncol)}"
|
34
39
|
stringio.puts \
|
35
|
-
"#{self.class} : #{nrow}
|
40
|
+
"#{self.class} : #{nrow} x #{ncol} #{vs}"
|
36
41
|
|
37
42
|
# 2nd row: show var counts by type
|
38
|
-
|
39
|
-
|
40
|
-
stringio.puts "Variable#{pl(ncol)} : #{var_type_count(type_groups).join(', ')}"
|
43
|
+
stringio.puts "#{vs} : #{var_type_count(type_groups).join(', ')}"
|
41
44
|
|
42
45
|
# 3rd row: print header of rows
|
43
|
-
|
44
|
-
|
45
|
-
#
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
str = format("%#{w_row}d ", data_tally.size)
|
60
|
-
str <<
|
61
|
-
case type_group
|
62
|
-
when :numeric, :string, :boolean
|
63
|
-
if data_tally.size <= tally_level && data_tally.size != nrow
|
64
|
-
data_tally.to_s
|
46
|
+
stringio.printf header_format, *headers.values
|
47
|
+
|
48
|
+
# 4th row ~: show details for each column (vector)
|
49
|
+
vectors.each.with_index do |vector, i|
|
50
|
+
key = quoted_keys[i]
|
51
|
+
type = types[i]
|
52
|
+
type_group = type_groups[i]
|
53
|
+
data_tally = tallys[i]
|
54
|
+
|
55
|
+
a = case type_group
|
56
|
+
when :numeric, :string, :boolean
|
57
|
+
if data_tally.size <= tally_level && data_tally.size != nrow
|
58
|
+
[data_tally.to_s]
|
59
|
+
else
|
60
|
+
[shorthand(vector, nrow, max_element)].concat na_string(vector)
|
61
|
+
end
|
65
62
|
else
|
66
|
-
|
63
|
+
shorthand(vector, nrow, max_element)
|
67
64
|
end
|
68
|
-
|
69
|
-
# str << " #{c} NaN#{pl(c)}" if c&.>(0) # safely call c>0
|
70
|
-
else
|
71
|
-
reduced_vector_presentation(vector, nrow, max_element)
|
72
|
-
end
|
73
|
-
|
74
|
-
stringio.printf("%#{w_idx}d %-#{w_key}s %-#{w_type}s %s\n", i, ":#{key}", type, str)
|
65
|
+
stringio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
|
75
66
|
end
|
76
|
-
|
77
67
|
stringio.string
|
78
68
|
end
|
79
69
|
|
@@ -83,15 +73,21 @@ module RedAmber
|
|
83
73
|
num > 1 ? 's' : ''
|
84
74
|
end
|
85
75
|
|
86
|
-
def
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
76
|
+
def make_header_format(levels, headers, quoted_keys)
|
77
|
+
# find longest word to adjust column width
|
78
|
+
w_idx = ncol.to_s.size
|
79
|
+
w_key = [quoted_keys.map(&:size).max, headers[:key].size].max
|
80
|
+
w_type = [types.map(&:size).max, headers[:type].size].max
|
81
|
+
w_row = [levels.map { |l| l.to_s.size }.max, headers[:levels].size].max
|
82
|
+
"%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_row}s %s\n"
|
83
|
+
end
|
84
|
+
|
85
|
+
def type_group(data_type)
|
86
|
+
case data_type
|
87
|
+
when Arrow::NumericDataType then :numeric
|
88
|
+
when Arrow::StringDataType then :string
|
89
|
+
when Arrow::BooleanDataType then :boolean
|
90
|
+
when Arrow::TemporalDataType then :temporal
|
95
91
|
else
|
96
92
|
:other
|
97
93
|
end
|
@@ -107,10 +103,22 @@ module RedAmber
|
|
107
103
|
a
|
108
104
|
end
|
109
105
|
|
110
|
-
def
|
106
|
+
def shorthand(vector, nrow, max_element)
|
111
107
|
a = vector.to_a.take(max_element)
|
112
|
-
a
|
108
|
+
a.map! { |e| e.nil? ? 'nil' : e.inspect }
|
109
|
+
a << '... ' if nrow > max_element
|
113
110
|
"[#{a.join(', ')}]"
|
114
111
|
end
|
112
|
+
|
113
|
+
def na_string(vector)
|
114
|
+
n_nan = vector.n_nans
|
115
|
+
n_nil = vector.n_nils
|
116
|
+
a = []
|
117
|
+
return a if (n_nan + n_nil).zero?
|
118
|
+
|
119
|
+
a << "#{n_nan} NaN#{pl(n_nan)}" unless n_nan.zero?
|
120
|
+
a << "#{n_nil} nil#{pl(n_nil)}" unless n_nil.zero?
|
121
|
+
a
|
122
|
+
end
|
115
123
|
end
|
116
124
|
end
|
@@ -45,7 +45,10 @@ module RedAmber
|
|
45
45
|
|
46
46
|
def select_columns(keys)
|
47
47
|
if keys.one?
|
48
|
-
|
48
|
+
t = @table[*keys]
|
49
|
+
raise DataFrameArgumentError, "Key is not exists #{keys}" unless t
|
50
|
+
|
51
|
+
Vector.new(t.data)
|
49
52
|
else
|
50
53
|
DataFrame.new(@table[keys])
|
51
54
|
end
|
data/lib/red_amber/vector.rb
CHANGED
@@ -27,8 +27,20 @@ module RedAmber
|
|
27
27
|
@data.to_a.inspect
|
28
28
|
end
|
29
29
|
|
30
|
-
def inspect
|
31
|
-
|
30
|
+
def inspect(limit: 80)
|
31
|
+
sio = StringIO.new << '['
|
32
|
+
to_a.each_with_object(sio).with_index do |(e, s), i|
|
33
|
+
next_str = "#{s.size > 1 ? ', ' : ''}#{e.inspect}"
|
34
|
+
if (s.size + next_str.size) < limit
|
35
|
+
s << next_str
|
36
|
+
else
|
37
|
+
s << ', ... ' if i < size
|
38
|
+
break
|
39
|
+
end
|
40
|
+
end
|
41
|
+
sio << ']'
|
42
|
+
|
43
|
+
format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n", object_id, sio.string
|
32
44
|
end
|
33
45
|
|
34
46
|
def values
|
@@ -49,6 +61,18 @@ module RedAmber
|
|
49
61
|
@data.value_type.nick.to_sym
|
50
62
|
end
|
51
63
|
|
64
|
+
def boolean?
|
65
|
+
type == :boolean
|
66
|
+
end
|
67
|
+
|
68
|
+
def numeric?
|
69
|
+
%i[int8 uint8 int16 uint16 int32 uint32 int64 uint64 float double].member? type
|
70
|
+
end
|
71
|
+
|
72
|
+
def string?
|
73
|
+
type == :string
|
74
|
+
end
|
75
|
+
|
52
76
|
def data_type
|
53
77
|
@data.value_type
|
54
78
|
end
|
@@ -72,5 +96,10 @@ module RedAmber
|
|
72
96
|
def n_nulls
|
73
97
|
@data.n_nulls
|
74
98
|
end
|
99
|
+
alias_method :n_nils, :n_nulls
|
100
|
+
|
101
|
+
def n_nans
|
102
|
+
numeric? ? is_nan.to_a.count(true) : 0
|
103
|
+
end
|
75
104
|
end
|
76
105
|
end
|
@@ -1,69 +1,113 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
# Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
|
4
|
+
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
5
|
+
|
6
|
+
# Not implemented in Red Arrow 8.0.0
|
7
|
+
# divmod, # '%',
|
8
|
+
# true_unless_null
|
9
|
+
|
3
10
|
module RedAmber
|
4
11
|
# mix-ins for class Vector
|
5
12
|
module VectorFunctions
|
6
|
-
#
|
7
|
-
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
8
|
-
|
9
|
-
# [Unary aggregations]: vector.func => Scalar
|
13
|
+
# [Unary aggregations]: vector.func => scalar
|
10
14
|
unary_aggregations =
|
11
|
-
%i[all any approximate_median count count_distinct max mean min
|
12
|
-
product stddev sum variance]
|
15
|
+
%i[all any approximate_median count count_distinct max mean min product stddev sum variance]
|
13
16
|
unary_aggregations.each do |function|
|
14
|
-
define_method(function)
|
17
|
+
define_method(function) do |opts: nil|
|
18
|
+
output = exec_func_unary(function, options: opts)
|
19
|
+
take_out_scalar(output)
|
20
|
+
end
|
15
21
|
end
|
22
|
+
alias_method :median, :approximate_median
|
16
23
|
alias_method :count_uniq, :count_distinct
|
17
24
|
|
18
25
|
# option(s) required
|
19
|
-
# index
|
26
|
+
# - index
|
20
27
|
|
21
28
|
# Returns other than value
|
22
|
-
# min_max
|
23
|
-
# mode
|
24
|
-
# quantile
|
25
|
-
# tdigest
|
26
|
-
|
27
|
-
# [Unary element-wise]: vector.func =>
|
28
|
-
unary_element_wise =
|
29
|
+
# - min_max
|
30
|
+
# - mode
|
31
|
+
# - quantile
|
32
|
+
# - tdigest
|
33
|
+
|
34
|
+
# [Unary element-wise]: vector.func => vector
|
35
|
+
unary_element_wise =
|
36
|
+
%i[abs atan bit_wise_not ceil cos floor is_finite is_inf is_nan is_null is_valid sign sin tan trunc]
|
29
37
|
unary_element_wise.each do |function|
|
30
|
-
define_method(function)
|
38
|
+
define_method(function) do |opts: nil|
|
39
|
+
output = exec_func_unary(function, options: opts)
|
40
|
+
take_out_element_wise(output)
|
41
|
+
end
|
31
42
|
end
|
43
|
+
alias_method :is_nil, :is_null
|
32
44
|
|
33
|
-
|
45
|
+
def is_na
|
46
|
+
numeric? ? (is_nil | is_nan) : is_nil
|
47
|
+
end
|
48
|
+
|
49
|
+
# [Unary element-wise with operator]: vector.func => vector, op vector
|
34
50
|
unary_element_wise_op = {
|
51
|
+
invert: '!',
|
35
52
|
negate: '-@',
|
36
53
|
}
|
37
54
|
unary_element_wise_op.each do |function, operator|
|
38
|
-
define_method(function)
|
39
|
-
|
55
|
+
define_method(function) do |opts: nil|
|
56
|
+
output = exec_func_unary(function, options: opts)
|
57
|
+
take_out_element_wise(output)
|
58
|
+
end
|
59
|
+
|
60
|
+
define_method(operator) do |opts: nil|
|
61
|
+
output = exec_func_unary(function, options: opts)
|
62
|
+
take_out_element_wise(output)
|
63
|
+
end
|
40
64
|
end
|
65
|
+
alias_method :not, :invert
|
41
66
|
|
42
|
-
#
|
67
|
+
# option(s) required
|
68
|
+
# - round, round_to_multiple
|
43
69
|
|
44
70
|
# NaN support needed
|
45
|
-
#
|
71
|
+
# - acos asin ln log10 log1p log2
|
46
72
|
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
73
|
+
# Functions with numerical range check
|
74
|
+
# - abs_checked acos_checked asin_checked cos_checked ln_checked
|
75
|
+
# log10_checked log1p_checked log2_checked sin_checked tan_checked
|
50
76
|
|
51
|
-
# [Binary element-wise]: vector.func(other) =>
|
52
|
-
binary_element_wise =
|
77
|
+
# [Binary element-wise]: vector.func(other) => vector
|
78
|
+
binary_element_wise =
|
79
|
+
%i[atan2 and_not and_not_kleene bit_wise_and bit_wise_or bit_wise_xor]
|
53
80
|
binary_element_wise.each do |function|
|
54
|
-
define_method(function) do |other|
|
55
|
-
|
81
|
+
define_method(function) do |other, opts: nil|
|
82
|
+
output = exec_func_binary(function, other, options: opts)
|
83
|
+
take_out_element_wise(output)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
# [Logical binary element-wise]: vector.func(other) => vector
|
88
|
+
logical_binary_element_wise = {
|
89
|
+
'&': :and_kleene,
|
90
|
+
and_kleene: :and_kleene,
|
91
|
+
and_org: :and,
|
92
|
+
'|': :or_kleene,
|
93
|
+
or_kleene: :or_kleene,
|
94
|
+
or_org: :or,
|
95
|
+
}
|
96
|
+
logical_binary_element_wise.each do |method, function|
|
97
|
+
define_method(method) do |other, opts: nil|
|
98
|
+
output = exec_func_binary(function, other, options: opts)
|
99
|
+
take_out_element_wise(output)
|
56
100
|
end
|
57
101
|
end
|
58
102
|
|
59
103
|
# NaN support needed
|
60
|
-
# logb
|
104
|
+
# - logb
|
61
105
|
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
106
|
+
# Functions with numerical range check
|
107
|
+
# - add_checked divide_checked logb_checked multiply_checked power_checked subtract_checked
|
108
|
+
# shift_left_checked shift_right_checked
|
65
109
|
|
66
|
-
# [Binary element-wise with operator]: vector.func(other) =>
|
110
|
+
# [Binary element-wise with operator]: vector.func(other) => vector
|
67
111
|
binary_element_wise_op = {
|
68
112
|
add: '+',
|
69
113
|
divide: '/',
|
@@ -71,9 +115,7 @@ module RedAmber
|
|
71
115
|
power: '**',
|
72
116
|
subtract: '-',
|
73
117
|
|
74
|
-
|
75
|
-
bit_wise_or: '|',
|
76
|
-
bit_wise_xor: '^',
|
118
|
+
xor: '^',
|
77
119
|
shift_left: '<<',
|
78
120
|
shift_right: '>>',
|
79
121
|
|
@@ -85,11 +127,14 @@ module RedAmber
|
|
85
127
|
not_equal: '!=',
|
86
128
|
}
|
87
129
|
binary_element_wise_op.each do |function, operator|
|
88
|
-
define_method(function) do |other|
|
89
|
-
|
130
|
+
define_method(function) do |other, opts: nil|
|
131
|
+
output = exec_func_binary(function, other, options: opts)
|
132
|
+
take_out_element_wise(output)
|
90
133
|
end
|
91
|
-
|
92
|
-
|
134
|
+
|
135
|
+
define_method(operator) do |other, opts: nil|
|
136
|
+
output = exec_func_binary(function, other, options: opts)
|
137
|
+
take_out_element_wise(output)
|
93
138
|
end
|
94
139
|
end
|
95
140
|
alias_method :eq, :equal
|
@@ -99,8 +144,6 @@ module RedAmber
|
|
99
144
|
alias_method :lt, :less
|
100
145
|
alias_method :ne, :not_equal
|
101
146
|
|
102
|
-
# mod: '%',
|
103
|
-
|
104
147
|
# (array functions)
|
105
148
|
# array_filter, array_sort_indices, array_take
|
106
149
|
# dictionary_encode, hash_all, hash_any, hash_approximate_median,
|
@@ -144,29 +187,37 @@ module RedAmber
|
|
144
187
|
|
145
188
|
# (others)
|
146
189
|
# coalesce, drop_null, fill_null_backward, fill_null_forward,
|
147
|
-
# filter,
|
190
|
+
# filter, is_in, is_in_meta_binary,
|
148
191
|
# list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
|
149
192
|
# max_element_wise, min_element_wise, random, replace_with_mask, select_k_unstable,
|
150
193
|
# sort_indices, struct_field, take
|
151
194
|
|
152
195
|
private # =======
|
153
196
|
|
154
|
-
def
|
197
|
+
def exec_func_unary(function, options: nil)
|
155
198
|
func = Arrow::Function.find(function)
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
199
|
+
func.execute([data], options)
|
200
|
+
end
|
201
|
+
|
202
|
+
def exec_func_binary(function, other, options: nil)
|
203
|
+
func = Arrow::Function.find(function)
|
204
|
+
case other
|
205
|
+
when Vector
|
206
|
+
func.execute([data, other.data], options)
|
207
|
+
when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric
|
208
|
+
func.execute([data, other], options)
|
209
|
+
else
|
210
|
+
raise ArgumentError, "Operand is not supported: #{other.class}"
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
def take_out_scalar(output)
|
215
|
+
output = output.value
|
216
|
+
output.is_a?(Arrow::StringScalar) ? output.to_s : output.value
|
217
|
+
end
|
218
|
+
|
219
|
+
def take_out_element_wise(output)
|
220
|
+
Vector.new(output.value)
|
170
221
|
end
|
171
222
|
end
|
172
223
|
end
|
data/lib/red_amber/version.rb
CHANGED
data/red_amber.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.email = ['heronshoes877@gmail.com']
|
10
10
|
|
11
11
|
spec.summary = 'Simple dataframe library for Ruby'
|
12
|
-
spec.description = 'RedAmber is a simple dataframe library powered by Red Arrow with
|
12
|
+
spec.description = 'RedAmber is a simple dataframe library powered by Red Arrow with API similar to Rover-df.'
|
13
13
|
spec.homepage = 'https://github.com/heronshoes/red_amber'
|
14
14
|
spec.license = 'MIT'
|
15
15
|
spec.required_ruby_version = '>= 2.7'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red_amber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hirokazu SUZUKI (heronshoes)
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-05-
|
11
|
+
date: 2022-05-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: red-arrow
|
@@ -52,8 +52,8 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: 0.3.0
|
55
|
-
description: RedAmber is a simple dataframe library powered by Red Arrow with
|
56
|
-
|
55
|
+
description: RedAmber is a simple dataframe library powered by Red Arrow with API
|
56
|
+
similar to Rover-df.
|
57
57
|
email:
|
58
58
|
- heronshoes877@gmail.com
|
59
59
|
executables: []
|