red_amber 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +12 -6
- data/CHANGELOG.md +39 -2
- data/README.md +156 -96
- data/lib/red_amber/data_frame.rb +11 -0
- data/lib/red_amber/data_frame_output.rb +60 -52
- data/lib/red_amber/data_frame_selectable.rb +4 -1
- data/lib/red_amber/vector.rb +31 -2
- data/lib/red_amber/vector_functions.rb +109 -58
- data/lib/red_amber/version.rb +1 -1
- data/red_amber.gemspec +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '0308ff686bf7b49b767b7cd28ddc068e02170c00c093dcd42c7187e438e0adf3'
|
4
|
+
data.tar.gz: 98397e31bce1a440e951357d5d3b475814a6ecc08f21a0908c0fdf58c6189be4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ad71d8259d04535d08567bde6ca0fc419e0d9de15d1e812dbc642fb3901f1c744c69766dbf409e876212e426e309ac0032968b767df3a960a8e6eb40d4f3c19
|
7
|
+
data.tar.gz: eee78ae4316b007d95714d6e2920ad32518497942d9cd5adb373476321e6f9e6e8099f9c721ee8bac05df2617fb3f3c747ce92ec74b1cb84da0b0bd4664051cf
|
data/.rubocop.yml
CHANGED
@@ -55,14 +55,13 @@ Layout/LineLength:
|
|
55
55
|
Metrics/AbcSize:
|
56
56
|
Max: 23
|
57
57
|
Exclude:
|
58
|
-
- 'lib/red_amber/data_frame_output.rb' # Max:
|
58
|
+
- 'lib/red_amber/data_frame_output.rb' # Max: 51
|
59
59
|
|
60
60
|
# Max: 25
|
61
61
|
Metrics/BlockLength:
|
62
62
|
Max: 25
|
63
63
|
Exclude:
|
64
64
|
- 'test/**/*'
|
65
|
-
- '*.gemspec'
|
66
65
|
|
67
66
|
# Max: 100
|
68
67
|
Metrics/ClassLength:
|
@@ -73,20 +72,27 @@ Metrics/ClassLength:
|
|
73
72
|
# Max: 7
|
74
73
|
Metrics/CyclomaticComplexity:
|
75
74
|
Max: 10
|
76
|
-
Exclude:
|
77
|
-
- 'lib/red_amber/data_frame_output.rb' # Max: 11
|
78
75
|
|
79
76
|
# Max: 10
|
80
77
|
Metrics/MethodLength:
|
81
78
|
Max: 18
|
82
79
|
Exclude:
|
83
|
-
- 'lib/red_amber/data_frame_output.rb' # Max:
|
80
|
+
- 'lib/red_amber/data_frame_output.rb' # Max: 31
|
81
|
+
|
82
|
+
# Max: 100
|
83
|
+
Metrics/ModuleLength:
|
84
|
+
Max: 100
|
85
|
+
Exclude:
|
86
|
+
- 'lib/red_amber/vector_functions.rb' # Max: 114
|
84
87
|
|
85
88
|
# Max: 8
|
86
89
|
Metrics/PerceivedComplexity:
|
87
90
|
Max: 11
|
91
|
+
|
92
|
+
# Necessary to define is_na
|
93
|
+
Naming/PredicateName:
|
88
94
|
Exclude:
|
89
|
-
- 'lib/red_amber/
|
95
|
+
- 'lib/red_amber/vector_functions.rb'
|
90
96
|
|
91
97
|
# Necessary to test when range.end == -1
|
92
98
|
Style/SlicingWithRange:
|
data/CHANGELOG.md
CHANGED
@@ -1,12 +1,49 @@
|
|
1
|
-
## [0.1.
|
1
|
+
## [0.1.4] - Unreleased
|
2
|
+
|
3
|
+
- Prepare documents for the 'Transposed DataFrame Representation'
|
4
|
+
- Feedback to Red Arrow
|
5
|
+
- Separate documents
|
2
6
|
|
3
7
|
- `DataFrame`
|
4
8
|
- Introduce updating capabilities
|
5
9
|
- Introduce NA support
|
6
10
|
- Add slice method
|
11
|
+
|
7
12
|
- `Vector`
|
8
13
|
- Add NaN support for functions
|
9
|
-
-
|
14
|
+
- Support more functions
|
15
|
+
|
16
|
+
## [0.1.3] - 2022-05-15 (experimental)
|
17
|
+
|
18
|
+
- Bug fixes
|
19
|
+
- Fix boolean functions in `Vector` to align with Ruby's behavior
|
20
|
+
- `&` == `and_kleene`
|
21
|
+
- `|` == `or_kleene`
|
22
|
+
- Quote strings of data-preview in `DataFrame#inspect`
|
23
|
+
- Quote empty and blank keys in `DataFrame#inspect`
|
24
|
+
- Respond to error for a wrong key in `DataFrame#[]`
|
25
|
+
|
26
|
+
- New features and improvements
|
27
|
+
- `DataFrame`
|
28
|
+
- Display nil elements in `inspect`
|
29
|
+
- Show NaN and nil counts in `inspect`
|
30
|
+
- Refactor `inspect`
|
31
|
+
- Add methods `key?` and `key_index`
|
32
|
+
- Add how to load/save Parquet to README
|
33
|
+
|
34
|
+
- `Vector`
|
35
|
+
- Add categorization functions
|
36
|
+
|
37
|
+
This is an important step to support `slice` method and NA treatment features.
|
38
|
+
- `is_finite`
|
39
|
+
- `is_inf`
|
40
|
+
- `is_na` (RedAmber original)
|
41
|
+
- `is_nan`
|
42
|
+
- `is_nil`, `is_null`
|
43
|
+
- `is_valid`
|
44
|
+
- Show in a reduced representation for long array in `inspect`
|
45
|
+
- Support options in aggregation functions
|
46
|
+
- Return values in non-arrow object for scalar aggregation functions
|
10
47
|
|
11
48
|
## [0.1.2] - 2022-05-08 (experimental)
|
12
49
|
|
data/README.md
CHANGED
@@ -45,7 +45,7 @@ Or install it yourself as:
|
|
45
45
|
- [x] `new` from a Rover::DataFrame
|
46
46
|
- `RedAmber::DataFrame.new(Rover::DataFrame.new(x: [1, 2, 3]))`
|
47
47
|
|
48
|
-
- [
|
48
|
+
- [x] `load` (class method)
|
49
49
|
|
50
50
|
- [x] from a [`.arrow`, `.arrows`, `.csv`, `.csv.gz`, `.tsv`] file
|
51
51
|
- `RedAmber::DataFrame.load("test/entity/with_header.csv")`
|
@@ -55,9 +55,16 @@ Or install it yourself as:
|
|
55
55
|
- [x] from a URI
|
56
56
|
- `RedAmber::DataFrame.load(URI("https://github.com/heronshoes/red_amber/blob/master/test/entity/with_header.csv"))`
|
57
57
|
|
58
|
-
- [
|
58
|
+
- [x] from a Parquet file
|
59
59
|
|
60
|
-
-
|
60
|
+
`red-parquet` gem is required.
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
require 'parquet'
|
64
|
+
dataframe = RedAmber::DataFrame.load("file.parquet")
|
65
|
+
```
|
66
|
+
|
67
|
+
- [x] `save` (instance method)
|
61
68
|
|
62
69
|
- [x] to a [`.arrow`, `.arrows`, `.csv`, `.csv.gz`, `.tsv`] file
|
63
70
|
|
@@ -65,7 +72,14 @@ Or install it yourself as:
|
|
65
72
|
|
66
73
|
- [x] to a URI
|
67
74
|
|
68
|
-
- [
|
75
|
+
- [x] to a Parquet file
|
76
|
+
|
77
|
+
`red-parquet` gem is required.
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
require 'parquet'
|
81
|
+
dataframe.save("file.parquet")
|
82
|
+
```
|
69
83
|
|
70
84
|
### Properties
|
71
85
|
|
@@ -129,18 +143,26 @@ Or install it yourself as:
|
|
129
143
|
|
130
144
|
- [x] `inspect(tally_level: 5, max_element: 5)`
|
131
145
|
|
132
|
-
Shows some information about self.
|
146
|
+
Shows some information about self in a transposed style.
|
133
147
|
|
134
148
|
```ruby
|
135
|
-
|
136
|
-
|
149
|
+
require 'red_amber'
|
150
|
+
require 'datasets-arrow'
|
151
|
+
|
152
|
+
penguins = Datasets::Penguins.new.to_arrow
|
153
|
+
RedAmber::DataFrame.new(penguins)
|
137
154
|
# =>
|
138
|
-
RedAmber::DataFrame :
|
139
|
-
|
140
|
-
# key
|
141
|
-
1 :
|
142
|
-
2 :
|
143
|
-
3 :
|
155
|
+
RedAmber::DataFrame : 344 x 8 Vectors
|
156
|
+
Vectors : 5 numeric, 3 strings
|
157
|
+
# key type level data_preview
|
158
|
+
1 :species string 3 {"Adelie"=>152, "Chinstrap"=>68, "Gentoo"=>124}
|
159
|
+
2 :island string 3 {"Torgersen"=>52, "Biscoe"=>168, "Dream"=>124}
|
160
|
+
3 :bill_length_mm double 165 [39.1, 39.5, 40.3, nil, 36.7, ... ], 2 nils
|
161
|
+
4 :bill_depth_mm double 81 [18.7, 17.4, 18.0, nil, 19.3, ... ], 2 nils
|
162
|
+
5 :flipper_length_mm uint8 56 [181, 186, 195, nil, 193, ... ], 2 nils
|
163
|
+
6 :body_mass_g uint16 95 [3750, 3800, 3250, nil, 3450, ... ], 2 nils
|
164
|
+
7 :sex string 3 {"male"=>168, "female"=>165, nil=>11}
|
165
|
+
8 :year uint16 3 {2007=>110, 2008=>114, 2009=>120}
|
144
166
|
```
|
145
167
|
|
146
168
|
- tally_level: max level to use tally mode
|
@@ -151,19 +173,20 @@ Variables : 2 numeric, 1 string
|
|
151
173
|
- [x] Select columns by `[]` as `[key]`, `[keys]`, `[keys[index]]`
|
152
174
|
- Key in a Symbol: `df[:symbol]`
|
153
175
|
- Key in a String: `df["string"]`
|
154
|
-
- Keys in an Array: `df[:symbol1
|
176
|
+
- Keys in an Array: `df[:symbol1, "string", :symbol2]`
|
155
177
|
- Keys in indices: `df[df.keys[0]]`, `df[df.keys[1,2]]`, `df[df.keys[1..]]`
|
156
178
|
- Keys in a Range:
|
157
179
|
An endless Range can be used to represent keys.
|
180
|
+
|
158
181
|
```ruby
|
159
182
|
hash = {a: [1, 2, 3], b: %w[A B C], c: [1.0, 2, 3]}
|
160
183
|
df = RedAmber::DataFrame.new(hash)
|
161
184
|
df[:b..:c, "a"]
|
162
185
|
# =>
|
163
|
-
RedAmber::DataFrame : 3
|
164
|
-
|
186
|
+
RedAmber::DataFrame : 3 x 3 Vectors
|
187
|
+
Vectors : 2 numeric, 1 string
|
165
188
|
# key type level data_preview
|
166
|
-
1 :b string 3 [A, B, C]
|
189
|
+
1 :b string 3 ["A", "B", "C"]
|
167
190
|
2 :c double 3 [1.0, 2.0, 3.0]
|
168
191
|
3 :a uint8 3 [1, 2, 3]
|
169
192
|
```
|
@@ -258,90 +281,127 @@ Variables : 2 numeric, 1 string
|
|
258
281
|
|
259
282
|
- [x] `tally`
|
260
283
|
|
261
|
-
- [
|
284
|
+
- [x] `n_nils`, `n_nans`
|
285
|
+
|
286
|
+
- `n_nulls` is an alias of `n_nils`
|
287
|
+
|
288
|
+
- [x] `inspect(limit: 80)`
|
289
|
+
|
290
|
+
- `limit` sets size limit to display long array.
|
262
291
|
|
263
292
|
### Functions
|
264
|
-
#### Unary aggregations: vector.func =>
|
265
|
-
|
266
|
-
| Method |Boolean|Numeric|String|Remarks|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|[
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|[
|
278
|
-
|[ ]
|
279
|
-
|
|
280
|
-
|[
|
281
|
-
|[ ]
|
282
|
-
|
|
283
|
-
|[
|
284
|
-
|[ ]
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|[
|
319
|
-
|[
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|[
|
323
|
-
|
|
324
|
-
|[
|
325
|
-
|
|
326
|
-
|[
|
327
|
-
|[
|
328
|
-
|[
|
329
|
-
|[ ]
|
330
|
-
|[ ]
|
331
|
-
|[
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|[
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
|
293
|
+
#### Unary aggregations: vector.func => scalar
|
294
|
+
|
295
|
+
| Method |Boolean|Numeric|String|Options|Remarks|
|
296
|
+
| ----------- | --- | --- | --- | --- | --- |
|
297
|
+
| ✓ `all` | ✓ | | | ✓ ScalarAggregate| |
|
298
|
+
| ✓ `any` | ✓ | | | ✓ ScalarAggregate| |
|
299
|
+
| ✓ `approximate_median`| |✓| | ✓ ScalarAggregate| alias `median`|
|
300
|
+
| ✓ `count` | ✓ | ✓ | ✓ | ✓ Count | |
|
301
|
+
| ✓ `count_distinct`| ✓ | ✓ | ✓ | ✓ Count |alias `count_uniq`|
|
302
|
+
|[ ]`index` | [ ] | [ ] | [ ] |[ ] Index | |
|
303
|
+
| ✓ `max` | ✓ | ✓ | ✓ | ✓ ScalarAggregate| |
|
304
|
+
| ✓ `mean` | ✓ | ✓ | | ✓ ScalarAggregate| |
|
305
|
+
| ✓ `min` | ✓ | ✓ | ✓ | ✓ ScalarAggregate| |
|
306
|
+
|[ ]`min_max` | [ ] | [ ] | [ ] |[ ] ScalarAggregate| |
|
307
|
+
|[ ]`mode` | | [ ] | |[ ] Mode | |
|
308
|
+
| ✓ `product` | ✓ | ✓ | | ✓ ScalarAggregate| |
|
309
|
+
|[ ]`quantile`| | [ ] | |[ ] Quantile| |
|
310
|
+
|[ ]`stddev` | | ✓ | |[ ] Variance| |
|
311
|
+
| ✓ `sum` | ✓ | ✓ | | ✓ ScalarAggregate| |
|
312
|
+
|[ ]`tdigest` | | [ ] | |[ ] TDigest | |
|
313
|
+
|[ ]`variance`| | ✓ | |[ ] Variance| |
|
314
|
+
|
315
|
+
|
316
|
+
Options can be used as follows.
|
317
|
+
See the [document of C++ function](https://arrow.apache.org/docs/cpp/compute.html) for detail.
|
318
|
+
|
319
|
+
```ruby
|
320
|
+
double = RedAmber::Vector.new([1, 0/0.0, -1/0.0, 1/0.0, nil, ""])
|
321
|
+
#=>
|
322
|
+
#<RedAmber::Vector(:double, size=6):0x000000000000f910>
|
323
|
+
[1.0, NaN, -Infinity, Infinity, nil, 0.0]
|
324
|
+
|
325
|
+
double.count #=> 5
|
326
|
+
double.count(opts: {mode: :only_valid}) #=> 5, default
|
327
|
+
double.count(opts: {mode: :only_null}) #=> 1
|
328
|
+
double.count(opts: {mode: :all}) #=> 6
|
329
|
+
|
330
|
+
boolean = RedAmber::Vector.new([true, true, nil])
|
331
|
+
#=>
|
332
|
+
#<RedAmber::Vector(:boolean, size=3):0x000000000000f924>
|
333
|
+
[true, true, nil]
|
334
|
+
|
335
|
+
boolean.all #=> true
|
336
|
+
boolean.all(opts: {skip_nulls: true}) #=> true
|
337
|
+
boolean.all(opts: {skip_nulls: false}) #=> false
|
338
|
+
```
|
339
|
+
|
340
|
+
#### Unary element-wise: vector.func => vector
|
341
|
+
|
342
|
+
| Method |Boolean|Numeric|String|Options|Remarks|
|
343
|
+
| ------------ | --- | --- | --- | --- | ----- |
|
344
|
+
| ✓ `-@` | | ✓ | | |as `-vector`|
|
345
|
+
| ✓ `negate` | | ✓ | | |`-@` |
|
346
|
+
| ✓ `abs` | | ✓ | | | |
|
347
|
+
|[ ]`acos` | | [ ] | | | |
|
348
|
+
|[ ]`asin` | | [ ] | | | |
|
349
|
+
| ✓ `atan` | | ✓ | | | |
|
350
|
+
| ✓ `bit_wise_not`| | (✓) | | |integer only|
|
351
|
+
|[ ]`ceil` | | ✓ | | | |
|
352
|
+
| ✓ `cos` | | ✓ | | | |
|
353
|
+
|[ ]`floor` | | ✓ | | | |
|
354
|
+
| ✓ `invert` | ✓ | | | |`!`, alias `not`|
|
355
|
+
|[ ]`ln` | | [ ] | | | |
|
356
|
+
|[ ]`log10` | | [ ] | | | |
|
357
|
+
|[ ]`log1p` | | [ ] | | | |
|
358
|
+
|[ ]`log2` | | [ ] | | | |
|
359
|
+
|[ ]`round` | | [ ] | |[ ] Round| |
|
360
|
+
|[ ]`round_to_multiple`| | [ ] | |[ ] RoundToMultiple| |
|
361
|
+
| ✓ `sign` | | ✓ | | | |
|
362
|
+
| ✓ `sin` | | ✓ | | | |
|
363
|
+
| ✓ `tan` | | ✓ | | | |
|
364
|
+
|[ ]`trunc` | | ✓ | | | |
|
365
|
+
|
366
|
+
#### Binary element-wise: vector.func(vector) => vector
|
367
|
+
|
368
|
+
| Method |Boolean|Numeric|String|Options|Remarks|
|
369
|
+
| ----------------- | --- | --- | --- | --- | ----- |
|
370
|
+
| ✓ `add` | | ✓ | | | `+` |
|
371
|
+
| ✓ `atan2` | | ✓ | | | |
|
372
|
+
| ✓ `and_kleene` | ✓ | | | | `&` |
|
373
|
+
| ✓ `and_org` | ✓ | | | |`and` in Red Arrow|
|
374
|
+
| ✓ `and_not` | ✓ | | | | |
|
375
|
+
| ✓ `and_not_kleene`| ✓ | | | | |
|
376
|
+
| ✓ `bit_wise_and` | | (✓) | | |integer only|
|
377
|
+
| ✓ `bit_wise_or` | | (✓) | | |integer only|
|
378
|
+
| ✓ `bit_wise_xor` | | (✓) | | |integer only|
|
379
|
+
| ✓ `divide` | | ✓ | | | `/` |
|
380
|
+
| ✓ `equal` | ✓ | ✓ | ✓ | |`==`, alias `eq`|
|
381
|
+
| ✓ `greater` | ✓ | ✓ | ✓ | |`>`, alias `gt`|
|
382
|
+
| ✓ `greater_equal` | ✓ | ✓ | ✓ | |`>=`, alias `ge`|
|
383
|
+
| ✓ `is_finite` | | ✓ | | | |
|
384
|
+
| ✓ `is_inf` | | ✓ | | | |
|
385
|
+
| ✓ `is_na` | ✓ | ✓ | ✓ | | |
|
386
|
+
| ✓ `is_nan` | | ✓ | | | |
|
387
|
+
|[ ]`is_nil` | ✓ | ✓ | ✓ |[ ] Null|alias `is_null`|
|
388
|
+
| ✓ `is_valid` | ✓ | ✓ | ✓ | | |
|
389
|
+
| ✓ `less` | ✓ | ✓ | ✓ | |`<`, alias `lt`|
|
390
|
+
| ✓ `less_equal` | ✓ | ✓ | ✓ | |`<=`, alias `le`|
|
391
|
+
|[ ]`logb` | | [ ] | | | |
|
392
|
+
|[ ]`mod` | | [ ] | | | `%` |
|
393
|
+
| ✓ `multiply` | | ✓ | | | `*` |
|
394
|
+
| ✓ `not_equal` | ✓ | ✓ | ✓ | |`!=`, alias `ne`|
|
395
|
+
| ✓ `or_kleene` | ✓ | | | | `\|` |
|
396
|
+
| ✓ `or_org` | ✓ | | | |`or` in Red Arrow|
|
397
|
+
| ✓ `power` | | ✓ | | | `**` |
|
398
|
+
| ✓ `subtract` | | ✓ | | | `-` |
|
399
|
+
| ✓ `shift_left` | | (✓) | | |`<<`, integer only|
|
400
|
+
| ✓ `shift_right` | | (✓) | | |`>>`, integer only|
|
401
|
+
| ✓ `xor` | ✓ | | | | `^` |
|
340
402
|
|
341
403
|
##### (Not implemented)
|
342
|
-
- [ ] invert, round, round_to_multiple
|
343
404
|
- [ ] sort, sort_index
|
344
|
-
- [ ] minmax, var, median, quantile
|
345
405
|
- [ ] argmin, argmax
|
346
406
|
- [ ] (array functions)
|
347
407
|
- [ ] (strings functions)
|
data/lib/red_amber/data_frame.rb
CHANGED
@@ -67,6 +67,17 @@ module RedAmber
|
|
67
67
|
alias_method :keys, :column_names
|
68
68
|
alias_method :header, :column_names
|
69
69
|
|
70
|
+
def key?(key)
|
71
|
+
column_names.include?(key.to_sym)
|
72
|
+
end
|
73
|
+
alias_method :has_key?, :key?
|
74
|
+
|
75
|
+
def key_index(key)
|
76
|
+
column_names.find_index(key.to_sym)
|
77
|
+
end
|
78
|
+
alias_method :find_index, :key_index
|
79
|
+
alias_method :index, :key_index
|
80
|
+
|
70
81
|
def types
|
71
82
|
@table.columns.map do |column|
|
72
83
|
column.data_type.to_s.to_sym
|
@@ -19,61 +19,51 @@ module RedAmber
|
|
19
19
|
|
20
20
|
# - tally_level: max level to use tally mode
|
21
21
|
# - max_element: max element to show values in each row
|
22
|
-
# TODO: Is it better to change name other than `inspect` ?
|
23
|
-
# TODO:
|
24
|
-
# TODO:
|
25
|
-
# TODO: Refactor code to smaller methods
|
22
|
+
# - TODO: Is it better to change name other than `inspect` ?
|
23
|
+
# - TODO: Fall back to inspect_raw when treating large dataset
|
24
|
+
# - TODO: Refactor code to smaller methods
|
26
25
|
def inspect(tally_level: 5, max_element: 5)
|
27
26
|
return '#<RedAmber::DataFrame (empty)>' if empty?
|
28
27
|
|
29
28
|
stringio = StringIO.new # output string buffer
|
30
29
|
|
30
|
+
tallys = vectors.map(&:tally)
|
31
|
+
levels = tallys.map(&:size)
|
32
|
+
type_groups = @table.columns.map { |column| type_group(column.data_type) }
|
33
|
+
quoted_keys = keys.map(&:inspect)
|
34
|
+
headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' }
|
35
|
+
header_format = make_header_format(levels, headers, quoted_keys)
|
36
|
+
|
31
37
|
# 1st row: show shape of the dataframe
|
32
|
-
|
33
|
-
c = pl(ncol)
|
38
|
+
vs = "Vector#{pl(ncol)}"
|
34
39
|
stringio.puts \
|
35
|
-
"#{self.class} : #{nrow}
|
40
|
+
"#{self.class} : #{nrow} x #{ncol} #{vs}"
|
36
41
|
|
37
42
|
# 2nd row: show var counts by type
|
38
|
-
|
39
|
-
|
40
|
-
stringio.puts "Variable#{pl(ncol)} : #{var_type_count(type_groups).join(', ')}"
|
43
|
+
stringio.puts "#{vs} : #{var_type_count(type_groups).join(', ')}"
|
41
44
|
|
42
45
|
# 3rd row: print header of rows
|
43
|
-
|
44
|
-
|
45
|
-
#
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
str = format("%#{w_row}d ", data_tally.size)
|
60
|
-
str <<
|
61
|
-
case type_group
|
62
|
-
when :numeric, :string, :boolean
|
63
|
-
if data_tally.size <= tally_level && data_tally.size != nrow
|
64
|
-
data_tally.to_s
|
46
|
+
stringio.printf header_format, *headers.values
|
47
|
+
|
48
|
+
# 4th row ~: show details for each column (vector)
|
49
|
+
vectors.each.with_index do |vector, i|
|
50
|
+
key = quoted_keys[i]
|
51
|
+
type = types[i]
|
52
|
+
type_group = type_groups[i]
|
53
|
+
data_tally = tallys[i]
|
54
|
+
|
55
|
+
a = case type_group
|
56
|
+
when :numeric, :string, :boolean
|
57
|
+
if data_tally.size <= tally_level && data_tally.size != nrow
|
58
|
+
[data_tally.to_s]
|
59
|
+
else
|
60
|
+
[shorthand(vector, nrow, max_element)].concat na_string(vector)
|
61
|
+
end
|
65
62
|
else
|
66
|
-
|
63
|
+
shorthand(vector, nrow, max_element)
|
67
64
|
end
|
68
|
-
|
69
|
-
# str << " #{c} NaN#{pl(c)}" if c&.>(0) # safely call c>0
|
70
|
-
else
|
71
|
-
reduced_vector_presentation(vector, nrow, max_element)
|
72
|
-
end
|
73
|
-
|
74
|
-
stringio.printf("%#{w_idx}d %-#{w_key}s %-#{w_type}s %s\n", i, ":#{key}", type, str)
|
65
|
+
stringio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
|
75
66
|
end
|
76
|
-
|
77
67
|
stringio.string
|
78
68
|
end
|
79
69
|
|
@@ -83,15 +73,21 @@ module RedAmber
|
|
83
73
|
num > 1 ? 's' : ''
|
84
74
|
end
|
85
75
|
|
86
|
-
def
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
76
|
+
def make_header_format(levels, headers, quoted_keys)
|
77
|
+
# find longest word to adjust column width
|
78
|
+
w_idx = ncol.to_s.size
|
79
|
+
w_key = [quoted_keys.map(&:size).max, headers[:key].size].max
|
80
|
+
w_type = [types.map(&:size).max, headers[:type].size].max
|
81
|
+
w_row = [levels.map { |l| l.to_s.size }.max, headers[:levels].size].max
|
82
|
+
"%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_row}s %s\n"
|
83
|
+
end
|
84
|
+
|
85
|
+
def type_group(data_type)
|
86
|
+
case data_type
|
87
|
+
when Arrow::NumericDataType then :numeric
|
88
|
+
when Arrow::StringDataType then :string
|
89
|
+
when Arrow::BooleanDataType then :boolean
|
90
|
+
when Arrow::TemporalDataType then :temporal
|
95
91
|
else
|
96
92
|
:other
|
97
93
|
end
|
@@ -107,10 +103,22 @@ module RedAmber
|
|
107
103
|
a
|
108
104
|
end
|
109
105
|
|
110
|
-
def
|
106
|
+
def shorthand(vector, nrow, max_element)
|
111
107
|
a = vector.to_a.take(max_element)
|
112
|
-
a
|
108
|
+
a.map! { |e| e.nil? ? 'nil' : e.inspect }
|
109
|
+
a << '... ' if nrow > max_element
|
113
110
|
"[#{a.join(', ')}]"
|
114
111
|
end
|
112
|
+
|
113
|
+
def na_string(vector)
|
114
|
+
n_nan = vector.n_nans
|
115
|
+
n_nil = vector.n_nils
|
116
|
+
a = []
|
117
|
+
return a if (n_nan + n_nil).zero?
|
118
|
+
|
119
|
+
a << "#{n_nan} NaN#{pl(n_nan)}" unless n_nan.zero?
|
120
|
+
a << "#{n_nil} nil#{pl(n_nil)}" unless n_nil.zero?
|
121
|
+
a
|
122
|
+
end
|
115
123
|
end
|
116
124
|
end
|
@@ -45,7 +45,10 @@ module RedAmber
|
|
45
45
|
|
46
46
|
def select_columns(keys)
|
47
47
|
if keys.one?
|
48
|
-
|
48
|
+
t = @table[*keys]
|
49
|
+
raise DataFrameArgumentError, "Key is not exists #{keys}" unless t
|
50
|
+
|
51
|
+
Vector.new(t.data)
|
49
52
|
else
|
50
53
|
DataFrame.new(@table[keys])
|
51
54
|
end
|
data/lib/red_amber/vector.rb
CHANGED
@@ -27,8 +27,20 @@ module RedAmber
|
|
27
27
|
@data.to_a.inspect
|
28
28
|
end
|
29
29
|
|
30
|
-
def inspect
|
31
|
-
|
30
|
+
def inspect(limit: 80)
|
31
|
+
sio = StringIO.new << '['
|
32
|
+
to_a.each_with_object(sio).with_index do |(e, s), i|
|
33
|
+
next_str = "#{s.size > 1 ? ', ' : ''}#{e.inspect}"
|
34
|
+
if (s.size + next_str.size) < limit
|
35
|
+
s << next_str
|
36
|
+
else
|
37
|
+
s << ', ... ' if i < size
|
38
|
+
break
|
39
|
+
end
|
40
|
+
end
|
41
|
+
sio << ']'
|
42
|
+
|
43
|
+
format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n", object_id, sio.string
|
32
44
|
end
|
33
45
|
|
34
46
|
def values
|
@@ -49,6 +61,18 @@ module RedAmber
|
|
49
61
|
@data.value_type.nick.to_sym
|
50
62
|
end
|
51
63
|
|
64
|
+
def boolean?
|
65
|
+
type == :boolean
|
66
|
+
end
|
67
|
+
|
68
|
+
def numeric?
|
69
|
+
%i[int8 uint8 int16 uint16 int32 uint32 int64 uint64 float double].member? type
|
70
|
+
end
|
71
|
+
|
72
|
+
def string?
|
73
|
+
type == :string
|
74
|
+
end
|
75
|
+
|
52
76
|
def data_type
|
53
77
|
@data.value_type
|
54
78
|
end
|
@@ -72,5 +96,10 @@ module RedAmber
|
|
72
96
|
def n_nulls
|
73
97
|
@data.n_nulls
|
74
98
|
end
|
99
|
+
alias_method :n_nils, :n_nulls
|
100
|
+
|
101
|
+
def n_nans
|
102
|
+
numeric? ? is_nan.to_a.count(true) : 0
|
103
|
+
end
|
75
104
|
end
|
76
105
|
end
|
@@ -1,69 +1,113 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
# Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
|
4
|
+
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
5
|
+
|
6
|
+
# Not implemented in Red Arrow 8.0.0
|
7
|
+
# divmod, # '%',
|
8
|
+
# true_unless_null
|
9
|
+
|
3
10
|
module RedAmber
|
4
11
|
# mix-ins for class Vector
|
5
12
|
module VectorFunctions
|
6
|
-
#
|
7
|
-
# reference: https://arrow.apache.org/docs/cpp/compute.html
|
8
|
-
|
9
|
-
# [Unary aggregations]: vector.func => Scalar
|
13
|
+
# [Unary aggregations]: vector.func => scalar
|
10
14
|
unary_aggregations =
|
11
|
-
%i[all any approximate_median count count_distinct max mean min
|
12
|
-
product stddev sum variance]
|
15
|
+
%i[all any approximate_median count count_distinct max mean min product stddev sum variance]
|
13
16
|
unary_aggregations.each do |function|
|
14
|
-
define_method(function)
|
17
|
+
define_method(function) do |opts: nil|
|
18
|
+
output = exec_func_unary(function, options: opts)
|
19
|
+
take_out_scalar(output)
|
20
|
+
end
|
15
21
|
end
|
22
|
+
alias_method :median, :approximate_median
|
16
23
|
alias_method :count_uniq, :count_distinct
|
17
24
|
|
18
25
|
# option(s) required
|
19
|
-
# index
|
26
|
+
# - index
|
20
27
|
|
21
28
|
# Returns other than value
|
22
|
-
# min_max
|
23
|
-
# mode
|
24
|
-
# quantile
|
25
|
-
# tdigest
|
26
|
-
|
27
|
-
# [Unary element-wise]: vector.func =>
|
28
|
-
unary_element_wise =
|
29
|
+
# - min_max
|
30
|
+
# - mode
|
31
|
+
# - quantile
|
32
|
+
# - tdigest
|
33
|
+
|
34
|
+
# [Unary element-wise]: vector.func => vector
|
35
|
+
unary_element_wise =
|
36
|
+
%i[abs atan bit_wise_not ceil cos floor is_finite is_inf is_nan is_null is_valid sign sin tan trunc]
|
29
37
|
unary_element_wise.each do |function|
|
30
|
-
define_method(function)
|
38
|
+
define_method(function) do |opts: nil|
|
39
|
+
output = exec_func_unary(function, options: opts)
|
40
|
+
take_out_element_wise(output)
|
41
|
+
end
|
31
42
|
end
|
43
|
+
alias_method :is_nil, :is_null
|
32
44
|
|
33
|
-
|
45
|
+
def is_na
|
46
|
+
numeric? ? (is_nil | is_nan) : is_nil
|
47
|
+
end
|
48
|
+
|
49
|
+
# [Unary element-wise with operator]: vector.func => vector, op vector
|
34
50
|
unary_element_wise_op = {
|
51
|
+
invert: '!',
|
35
52
|
negate: '-@',
|
36
53
|
}
|
37
54
|
unary_element_wise_op.each do |function, operator|
|
38
|
-
define_method(function)
|
39
|
-
|
55
|
+
define_method(function) do |opts: nil|
|
56
|
+
output = exec_func_unary(function, options: opts)
|
57
|
+
take_out_element_wise(output)
|
58
|
+
end
|
59
|
+
|
60
|
+
define_method(operator) do |opts: nil|
|
61
|
+
output = exec_func_unary(function, options: opts)
|
62
|
+
take_out_element_wise(output)
|
63
|
+
end
|
40
64
|
end
|
65
|
+
alias_method :not, :invert
|
41
66
|
|
42
|
-
#
|
67
|
+
# option(s) required
|
68
|
+
# - round, round_to_multiple
|
43
69
|
|
44
70
|
# NaN support needed
|
45
|
-
#
|
71
|
+
# - acos asin ln log10 log1p log2
|
46
72
|
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
73
|
+
# Functions with numerical range check
|
74
|
+
# - abs_checked acos_checked asin_checked cos_checked ln_checked
|
75
|
+
# log10_checked log1p_checked log2_checked sin_checked tan_checked
|
50
76
|
|
51
|
-
# [Binary element-wise]: vector.func(other) =>
|
52
|
-
binary_element_wise =
|
77
|
+
# [Binary element-wise]: vector.func(other) => vector
|
78
|
+
binary_element_wise =
|
79
|
+
%i[atan2 and_not and_not_kleene bit_wise_and bit_wise_or bit_wise_xor]
|
53
80
|
binary_element_wise.each do |function|
|
54
|
-
define_method(function) do |other|
|
55
|
-
|
81
|
+
define_method(function) do |other, opts: nil|
|
82
|
+
output = exec_func_binary(function, other, options: opts)
|
83
|
+
take_out_element_wise(output)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
# [Logical binary element-wise]: vector.func(other) => vector
|
88
|
+
logical_binary_element_wise = {
|
89
|
+
'&': :and_kleene,
|
90
|
+
and_kleene: :and_kleene,
|
91
|
+
and_org: :and,
|
92
|
+
'|': :or_kleene,
|
93
|
+
or_kleene: :or_kleene,
|
94
|
+
or_org: :or,
|
95
|
+
}
|
96
|
+
logical_binary_element_wise.each do |method, function|
|
97
|
+
define_method(method) do |other, opts: nil|
|
98
|
+
output = exec_func_binary(function, other, options: opts)
|
99
|
+
take_out_element_wise(output)
|
56
100
|
end
|
57
101
|
end
|
58
102
|
|
59
103
|
# NaN support needed
|
60
|
-
# logb
|
104
|
+
# - logb
|
61
105
|
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
106
|
+
# Functions with numerical range check
|
107
|
+
# - add_checked divide_checked logb_checked multiply_checked power_checked subtract_checked
|
108
|
+
# shift_left_checked shift_right_checked
|
65
109
|
|
66
|
-
# [Binary element-wise with operator]: vector.func(other) =>
|
110
|
+
# [Binary element-wise with operator]: vector.func(other) => vector
|
67
111
|
binary_element_wise_op = {
|
68
112
|
add: '+',
|
69
113
|
divide: '/',
|
@@ -71,9 +115,7 @@ module RedAmber
|
|
71
115
|
power: '**',
|
72
116
|
subtract: '-',
|
73
117
|
|
74
|
-
|
75
|
-
bit_wise_or: '|',
|
76
|
-
bit_wise_xor: '^',
|
118
|
+
xor: '^',
|
77
119
|
shift_left: '<<',
|
78
120
|
shift_right: '>>',
|
79
121
|
|
@@ -85,11 +127,14 @@ module RedAmber
|
|
85
127
|
not_equal: '!=',
|
86
128
|
}
|
87
129
|
binary_element_wise_op.each do |function, operator|
|
88
|
-
define_method(function) do |other|
|
89
|
-
|
130
|
+
define_method(function) do |other, opts: nil|
|
131
|
+
output = exec_func_binary(function, other, options: opts)
|
132
|
+
take_out_element_wise(output)
|
90
133
|
end
|
91
|
-
|
92
|
-
|
134
|
+
|
135
|
+
define_method(operator) do |other, opts: nil|
|
136
|
+
output = exec_func_binary(function, other, options: opts)
|
137
|
+
take_out_element_wise(output)
|
93
138
|
end
|
94
139
|
end
|
95
140
|
alias_method :eq, :equal
|
@@ -99,8 +144,6 @@ module RedAmber
|
|
99
144
|
alias_method :lt, :less
|
100
145
|
alias_method :ne, :not_equal
|
101
146
|
|
102
|
-
# mod: '%',
|
103
|
-
|
104
147
|
# (array functions)
|
105
148
|
# array_filter, array_sort_indices, array_take
|
106
149
|
# dictionary_encode, hash_all, hash_any, hash_approximate_median,
|
@@ -144,29 +187,37 @@ module RedAmber
|
|
144
187
|
|
145
188
|
# (others)
|
146
189
|
# coalesce, drop_null, fill_null_backward, fill_null_forward,
|
147
|
-
# filter,
|
190
|
+
# filter, is_in, is_in_meta_binary,
|
148
191
|
# list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
|
149
192
|
# max_element_wise, min_element_wise, random, replace_with_mask, select_k_unstable,
|
150
193
|
# sort_indices, struct_field, take
|
151
194
|
|
152
195
|
private # =======
|
153
196
|
|
154
|
-
def
|
197
|
+
def exec_func_unary(function, options: nil)
|
155
198
|
func = Arrow::Function.find(function)
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
199
|
+
func.execute([data], options)
|
200
|
+
end
|
201
|
+
|
202
|
+
def exec_func_binary(function, other, options: nil)
|
203
|
+
func = Arrow::Function.find(function)
|
204
|
+
case other
|
205
|
+
when Vector
|
206
|
+
func.execute([data, other.data], options)
|
207
|
+
when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric
|
208
|
+
func.execute([data, other], options)
|
209
|
+
else
|
210
|
+
raise ArgumentError, "Operand is not supported: #{other.class}"
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
def take_out_scalar(output)
|
215
|
+
output = output.value
|
216
|
+
output.is_a?(Arrow::StringScalar) ? output.to_s : output.value
|
217
|
+
end
|
218
|
+
|
219
|
+
def take_out_element_wise(output)
|
220
|
+
Vector.new(output.value)
|
170
221
|
end
|
171
222
|
end
|
172
223
|
end
|
data/lib/red_amber/version.rb
CHANGED
data/red_amber.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.email = ['heronshoes877@gmail.com']
|
10
10
|
|
11
11
|
spec.summary = 'Simple dataframe library for Ruby'
|
12
|
-
spec.description = 'RedAmber is a simple dataframe library powered by Red Arrow with
|
12
|
+
spec.description = 'RedAmber is a simple dataframe library powered by Red Arrow with API similar to Rover-df.'
|
13
13
|
spec.homepage = 'https://github.com/heronshoes/red_amber'
|
14
14
|
spec.license = 'MIT'
|
15
15
|
spec.required_ruby_version = '>= 2.7'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red_amber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hirokazu SUZUKI (heronshoes)
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-05-
|
11
|
+
date: 2022-05-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: red-arrow
|
@@ -52,8 +52,8 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: 0.3.0
|
55
|
-
description: RedAmber is a simple dataframe library powered by Red Arrow with
|
56
|
-
|
55
|
+
description: RedAmber is a simple dataframe library powered by Red Arrow with API
|
56
|
+
similar to Rover-df.
|
57
57
|
email:
|
58
58
|
- heronshoes877@gmail.com
|
59
59
|
executables: []
|