red_amber 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 54de345111ab7c3918e119abe820d2ff207007f1ce9731e2f8954513d47c76a9
4
- data.tar.gz: 75e4251c6d6be8eab05739f75e064a2e65cbe3abdafaa574c559d9356fe93a20
3
+ metadata.gz: '0308ff686bf7b49b767b7cd28ddc068e02170c00c093dcd42c7187e438e0adf3'
4
+ data.tar.gz: 98397e31bce1a440e951357d5d3b475814a6ecc08f21a0908c0fdf58c6189be4
5
5
  SHA512:
6
- metadata.gz: 60c2d11d30b91947b67e608864e5e4fe13e544662f671789256e6e2e624a892577f616572e4ba55be4de99affd528d020060b4be56f8820250697db2a80132a2
7
- data.tar.gz: 19170b7cd3d6b1174b7de44c0b8841d47acc4d1832fe72fdc8adc7171245e031c922614aca979755ae035566deae0a711644a3e483cecbceabdcfc411efb2263
6
+ metadata.gz: 7ad71d8259d04535d08567bde6ca0fc419e0d9de15d1e812dbc642fb3901f1c744c69766dbf409e876212e426e309ac0032968b767df3a960a8e6eb40d4f3c19
7
+ data.tar.gz: eee78ae4316b007d95714d6e2920ad32518497942d9cd5adb373476321e6f9e6e8099f9c721ee8bac05df2617fb3f3c747ce92ec74b1cb84da0b0bd4664051cf
data/.rubocop.yml CHANGED
@@ -55,14 +55,13 @@ Layout/LineLength:
55
55
  Metrics/AbcSize:
56
56
  Max: 23
57
57
  Exclude:
58
- - 'lib/red_amber/data_frame_output.rb' # Max: 78
58
+ - 'lib/red_amber/data_frame_output.rb' # Max: 51
59
59
 
60
60
  # Max: 25
61
61
  Metrics/BlockLength:
62
62
  Max: 25
63
63
  Exclude:
64
64
  - 'test/**/*'
65
- - '*.gemspec'
66
65
 
67
66
  # Max: 100
68
67
  Metrics/ClassLength:
@@ -73,20 +72,27 @@ Metrics/ClassLength:
73
72
  # Max: 7
74
73
  Metrics/CyclomaticComplexity:
75
74
  Max: 10
76
- Exclude:
77
- - 'lib/red_amber/data_frame_output.rb' # Max: 11
78
75
 
79
76
  # Max: 10
80
77
  Metrics/MethodLength:
81
78
  Max: 18
82
79
  Exclude:
83
- - 'lib/red_amber/data_frame_output.rb' # Max: 35
80
+ - 'lib/red_amber/data_frame_output.rb' # Max: 31
81
+
82
+ # Max: 100
83
+ Metrics/ModuleLength:
84
+ Max: 100
85
+ Exclude:
86
+ - 'lib/red_amber/vector_functions.rb' # Max: 114
84
87
 
85
88
  # Max: 8
86
89
  Metrics/PerceivedComplexity:
87
90
  Max: 11
91
+
92
+ # Necessary to define is_na
93
+ Naming/PredicateName:
88
94
  Exclude:
89
- - 'lib/red_amber/data_frame_output.rb' # Max: 12
95
+ - 'lib/red_amber/vector_functions.rb'
90
96
 
91
97
  # Necessary to test when range.end == -1
92
98
  Style/SlicingWithRange:
data/CHANGELOG.md CHANGED
@@ -1,12 +1,49 @@
1
- ## [0.1.3] - Unreleased
1
+ ## [0.1.4] - Unreleased
2
+
3
+ - Prepare documents for the 'Transposed DataFrame Representation'
4
+ - Feedback to Red Arrow
5
+ - Separate documents
2
6
 
3
7
  - `DataFrame`
4
8
  - Introduce updating capabilities
5
9
  - Introduce NA support
6
10
  - Add slice method
11
+
7
12
  - `Vector`
8
13
  - Add NaN support for functions
9
- - More functions
14
+ - Support more functions
15
+
16
+ ## [0.1.3] - 2022-05-15 (experimental)
17
+
18
+ - Bug fixes
19
+ - Fix boolean functions in `Vector` to align with Ruby's behavior
20
+ - `&` == `and_kleene`
21
+ - `|` == `or_kleene`
22
+ - Quote strings of data-preview in `DataFrame#inspect`
23
+ - Quote empty and blank keys in `DataFrame#inspect`
24
+ - Raise an error for a wrong key in `DataFrame#[]`
25
+
26
+ - New features and improvements
27
+ - `DataFrame`
28
+ - Display nil elements in `inspect`
29
+ - Show NaN and nil counts in `inspect`
30
+ - Refactor `inspect`
31
+ - Add methods `key?` and `key_index`
32
+ - Add how to load/save Parquet to README
33
+
34
+ - `Vector`
35
+ - Add categorization functions
36
+
37
+ This is an important step to support `slice` method and NA treatment features.
38
+ - `is_finite`
39
+ - `is_inf`
40
+ - `is_na` (RedAmber original)
41
+ - `is_nan`
42
+ - `is_nil`, `is_null`
43
+ - `is_valid`
44
+ - Show a reduced representation for long arrays in `inspect`
45
+ - Support options in aggregation functions
46
+ - Return values in non-arrow object for scalar aggregation functions
10
47
 
11
48
  ## [0.1.2] - 2022-05-08 (experimental)
12
49
 
data/README.md CHANGED
@@ -45,7 +45,7 @@ Or install it yourself as:
45
45
  - [x] `new` from a Rover::DataFrame
46
46
  - `RedAmber::DataFrame.new(Rover::DataFrame.new(x: [1, 2, 3]))`
47
47
 
48
- - [ ] `load` (class method)
48
+ - [x] `load` (class method)
49
49
 
50
50
  - [x] from a [`.arrow`, `.arrows`, `.csv`, `.csv.gz`, `.tsv`] file
51
51
  - `RedAmber::DataFrame.load("test/entity/with_header.csv")`
@@ -55,9 +55,16 @@ Or install it yourself as:
55
55
  - [x] from a URI
56
56
  - `RedAmber::DataFrame.load(URI("https://github.com/heronshoes/red_amber/blob/master/test/entity/with_header.csv"))`
57
57
 
58
- - [ ] from a parquet file
58
+ - [x] from a Parquet file
59
59
 
60
- - [ ] `save` (instance method)
60
+ `red-parquet` gem is required.
61
+
62
+ ```ruby
63
+ require 'parquet'
64
+ dataframe = RedAmber::DataFrame.load("file.parquet")
65
+ ```
66
+
67
+ - [x] `save` (instance method)
61
68
 
62
69
  - [x] to a [`.arrow`, `.arrows`, `.csv`, `.csv.gz`, `.tsv`] file
63
70
 
@@ -65,7 +72,14 @@ Or install it yourself as:
65
72
 
66
73
  - [x] to a URI
67
74
 
68
- - [ ] to a parquet file
75
+ - [x] to a Parquet file
76
+
77
+ `red-parquet` gem is required.
78
+
79
+ ```ruby
80
+ require 'parquet'
81
+ dataframe.save("file.parquet")
82
+ ```
69
83
 
70
84
  ### Properties
71
85
 
@@ -129,18 +143,26 @@ Or install it yourself as:
129
143
 
130
144
  - [x] `inspect(tally_level: 5, max_element: 5)`
131
145
 
132
- Shows some information about self.
146
+ Shows some information about self in a transposed style.
133
147
 
134
148
  ```ruby
135
- hash = {a: [1, 2, 3], b: %w[A B C], c: [1.0, 2, 3]}
136
- RedAmber::DataFrame.new(hash)
149
+ require 'red_amber'
150
+ require 'datasets-arrow'
151
+
152
+ penguins = Datasets::Penguins.new.to_arrow
153
+ RedAmber::DataFrame.new(penguins)
137
154
  # =>
138
- RedAmber::DataFrame : 3 observations(rows) of 3 variables(columns)
139
- Variables : 2 numeric, 1 string
140
- # key type level data_preview
141
- 1 :a uint8 3 [1, 2, 3]
142
- 2 :b string 3 [A, B, C]
143
- 3 :c double 3 [1.0, 2.0, 3.0]
155
+ RedAmber::DataFrame : 344 x 8 Vectors
156
+ Vectors : 5 numeric, 3 strings
157
+ # key type level data_preview
158
+ 1 :species string 3 {"Adelie"=>152, "Chinstrap"=>68, "Gentoo"=>124}
159
+ 2 :island string 3 {"Torgersen"=>52, "Biscoe"=>168, "Dream"=>124}
160
+ 3 :bill_length_mm double 165 [39.1, 39.5, 40.3, nil, 36.7, ... ], 2 nils
161
+ 4 :bill_depth_mm double 81 [18.7, 17.4, 18.0, nil, 19.3, ... ], 2 nils
162
+ 5 :flipper_length_mm uint8 56 [181, 186, 195, nil, 193, ... ], 2 nils
163
+ 6 :body_mass_g uint16 95 [3750, 3800, 3250, nil, 3450, ... ], 2 nils
164
+ 7 :sex string 3 {"male"=>168, "female"=>165, nil=>11}
165
+ 8 :year uint16 3 {2007=>110, 2008=>114, 2009=>120}
144
166
  ```
145
167
 
146
168
  - tally_level: max level to use tally mode
@@ -151,19 +173,20 @@ Variables : 2 numeric, 1 string
151
173
  - [x] Select columns by `[]` as `[key]`, `[keys]`, `[keys[index]]`
152
174
  - Key in a Symbol: `df[:symbol]`
153
175
  - Key in a String: `df["string"]`
154
- - Keys in an Array: `df[:symbol1`, `"string"`, `:symbol2`
176
+ - Keys in an Array: `df[:symbol1, "string", :symbol2]`
155
177
  - Keys in indices: `df[df.keys[0]]`, `df[df.keys[1,2]]`, `df[df.keys[1..]]`
156
178
  - Keys in a Range:
157
179
  An endless Range can be used to represent keys.
180
+
158
181
  ```ruby
159
182
  hash = {a: [1, 2, 3], b: %w[A B C], c: [1.0, 2, 3]}
160
183
  df = RedAmber::DataFrame.new(hash)
161
184
  df[:b..:c, "a"]
162
185
  # =>
163
- RedAmber::DataFrame : 3 observations(rows) of 3 variables(columns)
164
- Variables : 2 numeric, 1 string
186
+ RedAmber::DataFrame : 3 x 3 Vectors
187
+ Vectors : 2 numeric, 1 string
165
188
  # key type level data_preview
166
- 1 :b string 3 [A, B, C]
189
+ 1 :b string 3 ["A", "B", "C"]
167
190
  2 :c double 3 [1.0, 2.0, 3.0]
168
191
  3 :a uint8 3 [1, 2, 3]
169
192
  ```
@@ -258,90 +281,127 @@ Variables : 2 numeric, 1 string
258
281
 
259
282
  - [x] `tally`
260
283
 
261
- - [ ] `n_nulls`
284
+ - [x] `n_nils`, `n_nans`
285
+
286
+ - `n_nulls` is an alias of `n_nils`
287
+
288
+ - [x] `inspect(limit: 80)`
289
+
290
+ - `limit` sets the size limit for displaying a long array.
262
291
 
263
292
  ### Functions
264
- #### Unary aggregations: vector.func => Scalar
265
-
266
- | Method |Boolean|Numeric|String|Remarks|
267
- | ------------ | --- | --- | --- | ----- |
268
- |[x] `all` | [x] | | | |
269
- |[x] `any` | [x] | | | |
270
- |[x] `approximate_median`| | [x] | | |
271
- |[x] `count` | [x] | [x] | [x] | |
272
- |[x] `count_distinct`| [x] | [x] | [x] | |
273
- |[x] `count_uniq` | [x] | [x] | [x] |an alias of `count_distinct`|
274
- |[ ] `index` | | | | |
275
- |[x] `max` | [x] | [x] | [x] | |
276
- |[x] `mean` | [x] | [x] | | |
277
- |[x] `min` | [x] | [x] | [x] | |
278
- |[ ] `min_max` | | | | |
279
- |[ ] `mode` | | | | |
280
- |[x] `product` | [x] | [x] | | |
281
- |[ ] `quantile`| | | | |
282
- |[x] `stddev` | | [x] | | |
283
- |[x] `sum` | [x] | [x] | | |
284
- |[ ] `tdigest` | | | | |
285
- |[x] `variance`| | [x] | | |
286
-
287
- #### Unary element-wise: vector.func => Vector
288
-
289
- | Method |Boolean|Numeric|String|Remarks|
290
- | ------------ | --- | --- | --- | ----- |
291
- |[x] `-@` | | [x] | |as `-vector`|
292
- |[x] `negate` | | [x] | |`-@` |
293
- |[x] `abs` | | [x] | | |
294
- |[ ] `acos` | | [ ] | | |
295
- |[ ] `asin` | | [ ] | | |
296
- |[x] `atan` | | [x] | | |
297
- |[ ] `ceil` | | [x] | | |
298
- |[x] `cos` | | [x] | | |
299
- |[ ] `floor` | | [x] | | |
300
- |[ ] `ln` | | [ ] | | |
301
- |[ ] `log10` | | [ ] | | |
302
- |[ ] `log1p` | | [ ] | | |
303
- |[ ] `log2` | | [ ] | | |
304
- |[x] `sign` | | [x] | | |
305
- |[x] `sin` | | [x] | | |
306
- |[x] `tan` | | [x] | | |
307
- |[ ] `trunc` | | [x] | | |
308
-
309
- #### Binary element-wise: vector.func(vector) => Vector
310
-
311
- | Method |Boolean|Numeric|String|Remarks|
312
- | ------------------ | --- | --- | --- | ----- |
313
- |[x] `add` | | [x] | | `+` |
314
- |[x] `atan2` | | [x] | | |
315
- |[x] `and` | [x] | | | |
316
- |[x] `and_kleene` | [x] | | | |
317
- |[x] `and_not` | [x] | | | |
318
- |[x] `and_not_kleene`| [x] | | | |
319
- |[x] `bit_wise_and` | |([x])| |`&`, integer only|
320
- |[ ] `bit_wise_not` | |([x])| |`!`, integer only|
321
- |[x] `bit_wise_or` | |([x])| |`|`, integer only|
322
- |[x] `bit_wise_xor` | |([x])| |`^`, integer only|
323
- |[x] `divide` | | [x] | | `/` |
324
- |[x] `equal` | [x] | [x] | [x] |`==`, alias `eq`|
325
- |[x] `greater` | [x] | [x] | [x] |`>`, alias `gt`|
326
- |[x] `greater_equal` | [x] | [x] | [x] |`>=`, alias `ge`|
327
- |[x] `less` | [x] | [x] | [x] |`<`, alias `lt`|
328
- |[x] `less_equal` | [x] | [x] | [x] |`<=`, alias `le`|
329
- |[ ] `logb` | | [ ] | | |
330
- |[ ] `mod` | | [ ] | | |
331
- |[x] `multiply` | | [x] | | `*` |
332
- |[x] `not_equal` | [x] | [x] | [x] |`!=`, alias `ne`|
333
- |[x] `or` | [x] | | | |
334
- |[x] `or_kleene` | [x] | | | |
335
- |[x] `power` | | [x] | | `**` |
336
- |[x] `subtract` | | [x] | | `-` |
337
- |[x] `shift_left` | |([x])| |`<<`, integer only|
338
- |[x] `shift_right` | |([x])| |`>>`, integer only|
339
- |[x] `xor` | [x] | | | |
293
+ #### Unary aggregations: vector.func => scalar
294
+
295
+ | Method |Boolean|Numeric|String|Options|Remarks|
296
+ | ----------- | --- | --- | --- | --- | --- |
297
+ | `all` || | | ✓ ScalarAggregate| |
298
+ | `any` || | | ✓ ScalarAggregate| |
299
+ | `approximate_median`| |✓| | ✓ ScalarAggregate| alias `median`|
300
+ | `count` ||| ✓ | ✓ Count | |
301
+ | `count_distinct`| | | | ✓ Count |alias `count_uniq`|
302
+ |[ ]`index` | [ ] | [ ] | [ ] |[ ] Index | |
303
+ | `max` || ✓ | ✓ | ✓ ScalarAggregate| |
304
+ | `mean` ||| | ScalarAggregate| |
305
+ | `min` || ✓ | ✓ | ScalarAggregate| |
306
+ |[ ]`min_max` | [ ] | [ ] | [ ] |[ ] ScalarAggregate| |
307
+ |[ ]`mode` | | [ ] | |[ ] Mode | |
308
+ | `product` || ✓ | | ✓ ScalarAggregate| |
309
+ |[ ]`quantile`| | [ ] | |[ ] Quantile| |
310
+ |[ ]`stddev` | | ✓ | |[ ] Variance| |
311
+ | `sum` | | ✓ | | ScalarAggregate| |
312
+ |[ ]`tdigest` | | [ ] | |[ ] TDigest | |
313
+ |[ ]`variance`| | ✓ | |[ ] Variance| |
314
+
315
+
316
+ Options can be used as follows.
317
+ See the [documentation of the C++ functions](https://arrow.apache.org/docs/cpp/compute.html) for details.
318
+
319
+ ```ruby
320
+ double = RedAmber::Vector.new([1, 0/0.0, -1/0.0, 1/0.0, nil, ""])
321
+ #=>
322
+ #<RedAmber::Vector(:double, size=6):0x000000000000f910>
323
+ [1.0, NaN, -Infinity, Infinity, nil, 0.0]
324
+
325
+ double.count #=> 5
326
+ double.count(opts: {mode: :only_valid}) #=> 5, default
327
+ double.count(opts: {mode: :only_null}) #=> 1
328
+ double.count(opts: {mode: :all}) #=> 6
329
+
330
+ boolean = RedAmber::Vector.new([true, true, nil])
331
+ #=>
332
+ #<RedAmber::Vector(:boolean, size=3):0x000000000000f924>
333
+ [true, true, nil]
334
+
335
+ boolean.all #=> true
336
+ boolean.all(opts: {skip_nulls: true}) #=> true
337
+ boolean.all(opts: {skip_nulls: false}) #=> false
338
+ ```
339
+
340
+ #### Unary element-wise: vector.func => vector
341
+
342
+ | Method |Boolean|Numeric|String|Options|Remarks|
343
+ | ------------ | --- | --- | --- | --- | ----- |
344
+ | `-@` | | ✓ | | |as `-vector`|
345
+ | `negate` | | ✓ | | |`-@` |
346
+ | `abs` | | ✓ | | | |
347
+ |[ ]`acos` | | [ ] | | | |
348
+ |[ ]`asin` | | [ ] | | | |
349
+ | `atan` | || | | |
350
+ | `bit_wise_not`| | () | | |integer only|
351
+ |[ ]`ceil` | || | | |
352
+ | `cos` | || | | |
353
+ |[ ]`floor` | || | | |
354
+ | `invert` || | | |`!`, alias `not`|
355
+ |[ ]`ln` | | [ ] | | | |
356
+ |[ ]`log10` | | [ ] | | | |
357
+ |[ ]`log1p` | | [ ] | | | |
358
+ |[ ]`log2` | | [ ] | | | |
359
+ |[ ]`round` | | [ ] | |[ ] Round| |
360
+ |[ ]`round_to_multiple`| | [ ] | |[ ] RoundToMultiple| |
361
+ | `sign` | || | | |
362
+ | `sin` | | ✓ | | | |
363
+ | `tan` | || | | |
364
+ |[ ]`trunc` | || | | |
365
+
366
+ #### Binary element-wise: vector.func(vector) => vector
367
+
368
+ | Method |Boolean|Numeric|String|Options|Remarks|
369
+ | ----------------- | --- | --- | --- | --- | ----- |
370
+ | ✓ `add` | | ✓ | | | `+` |
371
+ | ✓ `atan2` | | ✓ | | | |
372
+ | ✓ `and_kleene` | ✓ | | | | `&` |
373
+ | ✓ `and_org ` | ✓ | | | |`and` in Red Arrow|
374
+ | ✓ `and_not` | ✓ | | | | |
375
+ | ✓ `and_not_kleene`| ✓ | | | | |
376
+ | ✓ `bit_wise_and` | | (✓) | | |integer only|
377
+ | ✓ `bit_wise_or` | | (✓) | | |integer only|
378
+ | ✓ `bit_wise_xor` | | (✓) | | |integer only|
379
+ | ✓ `divide` | | ✓ | | | `/` |
380
+ | ✓ `equal` | ✓ | ✓ | ✓ | |`==`, alias `eq`|
381
+ | ✓ `greater` | ✓ | ✓ | ✓ | |`>`, alias `gt`|
382
+ | ✓ `greater_equal` | ✓ | ✓ | ✓ | |`>=`, alias `ge`|
383
+ | ✓ `is_finite` | | ✓ | | | |
384
+ | ✓ `is_inf` | | ✓ | | | |
385
+ | ✓ `is_na` | ✓ | ✓ | ✓ | | |
386
+ | ✓ `is_nan` | | ✓ | | | |
387
+ |[ ]`is_nil` | ✓ | ✓ | ✓ |[ ] Null|alias `is_null`|
388
+ | ✓ `is_valid` | ✓ | ✓ | ✓ | | |
389
+ | ✓ `less` | ✓ | ✓ | ✓ | |`<`, alias `lt`|
390
+ | ✓ `less_equal` | ✓ | ✓ | ✓ | |`<=`, alias `le`|
391
+ |[ ]`logb` | | [ ] | | | |
392
+ |[ ]`mod` | | [ ] | | | `%` |
393
+ | ✓ `multiply` | | ✓ | | | `*` |
394
+ | ✓ `not_equal` | ✓ | ✓ | ✓ | |`!=`, alias `ne`|
395
+ | ✓ `or_kleene` | ✓ | | | | `\|` |
396
+ | ✓ `or_org` | ✓ | | | |`or` in Red Arrow|
397
+ | ✓ `power` | | ✓ | | | `**` |
398
+ | ✓ `subtract` | | ✓ | | | `-` |
399
+ | ✓ `shift_left` | | (✓) | | |`<<`, integer only|
400
+ | ✓ `shift_right` | | (✓) | | |`>>`, integer only|
401
+ | ✓ `xor` | ✓ | | | | `^` |
340
402
 
341
403
  ##### (Not implemented)
342
- - [ ] invert, round, round_to_multiple
343
404
  - [ ] sort, sort_index
344
- - [ ] minmax, var, median, quantile
345
405
  - [ ] argmin, argmax
346
406
  - [ ] (array functions)
347
407
  - [ ] (strings functions)
@@ -67,6 +67,17 @@ module RedAmber
67
67
  alias_method :keys, :column_names
68
68
  alias_method :header, :column_names
69
69
 
70
+ def key?(key)
71
+ column_names.include?(key.to_sym)
72
+ end
73
+ alias_method :has_key?, :key?
74
+
75
+ def key_index(key)
76
+ column_names.find_index(key.to_sym)
77
+ end
78
+ alias_method :find_index, :key_index
79
+ alias_method :index, :key_index
80
+
70
81
  def types
71
82
  @table.columns.map do |column|
72
83
  column.data_type.to_s.to_sym
@@ -19,61 +19,51 @@ module RedAmber
19
19
 
20
20
  # - tally_level: max level to use tally mode
21
21
  # - max_element: max element to show values in each row
22
- # TODO: Is it better to change name other than `inspect` ?
23
- # TODO: Add na count capability
24
- # TODO: Fall back to inspect_raw when treating large dataset
25
- # TODO: Refactor code to smaller methods
22
+ # - TODO: Is it better to change name other than `inspect` ?
23
+ # - TODO: Fall back to inspect_raw when treating large dataset
24
+ # - TODO: Refactor code to smaller methods
26
25
  def inspect(tally_level: 5, max_element: 5)
27
26
  return '#<RedAmber::DataFrame (empty)>' if empty?
28
27
 
29
28
  stringio = StringIO.new # output string buffer
30
29
 
30
+ tallys = vectors.map(&:tally)
31
+ levels = tallys.map(&:size)
32
+ type_groups = @table.columns.map { |column| type_group(column.data_type) }
33
+ quoted_keys = keys.map(&:inspect)
34
+ headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' }
35
+ header_format = make_header_format(levels, headers, quoted_keys)
36
+
31
37
  # 1st row: show shape of the dataframe
32
- r = pl(nrow)
33
- c = pl(ncol)
38
+ vs = "Vector#{pl(ncol)}"
34
39
  stringio.puts \
35
- "#{self.class} : #{nrow} observation#{r}(row#{r}) of #{ncol} variable#{c}(column#{c})"
40
+ "#{self.class} : #{nrow} x #{ncol} #{vs}"
36
41
 
37
42
  # 2nd row: show var counts by type
38
- type_groups = data_types.map { |t| type_group(t) }
39
-
40
- stringio.puts "Variable#{pl(ncol)} : #{var_type_count(type_groups).join(', ')}"
43
+ stringio.puts "#{vs} : #{var_type_count(type_groups).join(', ')}"
41
44
 
42
45
  # 3rd row: print header of rows
43
- levels = vectors.map { |v| v.to_a.uniq.size }
44
- row_headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' }
45
- # find longest word to adjust column width
46
- w_idx = ncol.to_s.size
47
- w_key = (keys.map { |key| key.size + 1 } << row_headers[:key].size).max
48
- w_type = (types.map(&:size) << row_headers[:type].size).max
49
- w_row = (levels.map { |l| l.to_s.size } << row_headers[:levels].size).max
50
- stringio.printf("%-#{w_idx}s %-#{w_key}s %-#{w_type}s %-#{w_row}s %s\n", *row_headers.values)
51
-
52
- # (4) show details for each column (vector)
53
- vectors.each.with_index(1) do |vector, i|
54
- key = keys[i - 1]
55
- type = types[i - 1]
56
- type_group = type_groups[i - 1]
57
- data_tally = vector.tally
58
-
59
- str = format("%#{w_row}d ", data_tally.size)
60
- str <<
61
- case type_group
62
- when :numeric, :string, :boolean
63
- if data_tally.size <= tally_level && data_tally.size != nrow
64
- data_tally.to_s
46
+ stringio.printf header_format, *headers.values
47
+
48
+ # 4th row ~: show details for each column (vector)
49
+ vectors.each.with_index do |vector, i|
50
+ key = quoted_keys[i]
51
+ type = types[i]
52
+ type_group = type_groups[i]
53
+ data_tally = tallys[i]
54
+
55
+ a = case type_group
56
+ when :numeric, :string, :boolean
57
+ if data_tally.size <= tally_level && data_tally.size != nrow
58
+ [data_tally.to_s]
59
+ else
60
+ [shorthand(vector, nrow, max_element)].concat na_string(vector)
61
+ end
65
62
  else
66
- reduced_vector_presentation(vector, nrow, max_element)
63
+ shorthand(vector, nrow, max_element)
67
64
  end
68
- # c = vector.is_na.tally[1] # release when `#is_na` impremented
69
- # str << " #{c} NaN#{pl(c)}" if c&.>(0) # safely call c>0
70
- else
71
- reduced_vector_presentation(vector, nrow, max_element)
72
- end
73
-
74
- stringio.printf("%#{w_idx}d %-#{w_key}s %-#{w_type}s %s\n", i, ":#{key}", type, str)
65
+ stringio.printf header_format, i + 1, key, type, data_tally.size, a.join(', ')
75
66
  end
76
-
77
67
  stringio.string
78
68
  end
79
69
 
@@ -83,15 +73,21 @@ module RedAmber
83
73
  num > 1 ? 's' : ''
84
74
  end
85
75
 
86
- def type_group(type)
87
- if Arrow::NumericDataType >= type
88
- :numeric
89
- elsif Arrow::StringDataType >= type
90
- :string
91
- elsif Arrow::BooleanDataType >= type
92
- :boolean
93
- elsif Arrow::TemporalDataType >= type
94
- :temporal
76
+ def make_header_format(levels, headers, quoted_keys)
77
+ # find longest word to adjust column width
78
+ w_idx = ncol.to_s.size
79
+ w_key = [quoted_keys.map(&:size).max, headers[:key].size].max
80
+ w_type = [types.map(&:size).max, headers[:type].size].max
81
+ w_row = [levels.map { |l| l.to_s.size }.max, headers[:levels].size].max
82
+ "%-#{w_idx}s %-#{w_key}s %-#{w_type}s %#{w_row}s %s\n"
83
+ end
84
+
85
+ def type_group(data_type)
86
+ case data_type
87
+ when Arrow::NumericDataType then :numeric
88
+ when Arrow::StringDataType then :string
89
+ when Arrow::BooleanDataType then :boolean
90
+ when Arrow::TemporalDataType then :temporal
95
91
  else
96
92
  :other
97
93
  end
@@ -107,10 +103,22 @@ module RedAmber
107
103
  a
108
104
  end
109
105
 
110
- def reduced_vector_presentation(vector, nrow, max_element)
106
+ def shorthand(vector, nrow, max_element)
111
107
  a = vector.to_a.take(max_element)
112
- a << '...' if nrow > max_element
108
+ a.map! { |e| e.nil? ? 'nil' : e.inspect }
109
+ a << '... ' if nrow > max_element
113
110
  "[#{a.join(', ')}]"
114
111
  end
112
+
113
+ def na_string(vector)
114
+ n_nan = vector.n_nans
115
+ n_nil = vector.n_nils
116
+ a = []
117
+ return a if (n_nan + n_nil).zero?
118
+
119
+ a << "#{n_nan} NaN#{pl(n_nan)}" unless n_nan.zero?
120
+ a << "#{n_nil} nil#{pl(n_nil)}" unless n_nil.zero?
121
+ a
122
+ end
115
123
  end
116
124
  end
@@ -45,7 +45,10 @@ module RedAmber
45
45
 
46
46
  def select_columns(keys)
47
47
  if keys.one?
48
- Vector.new(@table[*keys].data)
48
+ t = @table[*keys]
49
+ raise DataFrameArgumentError, "Key is not exists #{keys}" unless t
50
+
51
+ Vector.new(t.data)
49
52
  else
50
53
  DataFrame.new(@table[keys])
51
54
  end
@@ -27,8 +27,20 @@ module RedAmber
27
27
  @data.to_a.inspect
28
28
  end
29
29
 
30
- def inspect
31
- format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n#{self}", object_id
30
+ def inspect(limit: 80)
31
+ sio = StringIO.new << '['
32
+ to_a.each_with_object(sio).with_index do |(e, s), i|
33
+ next_str = "#{s.size > 1 ? ', ' : ''}#{e.inspect}"
34
+ if (s.size + next_str.size) < limit
35
+ s << next_str
36
+ else
37
+ s << ', ... ' if i < size
38
+ break
39
+ end
40
+ end
41
+ sio << ']'
42
+
43
+ format "#<#{self.class}(:#{type}, size=#{size}):0x%016x>\n%s\n", object_id, sio.string
32
44
  end
33
45
 
34
46
  def values
@@ -49,6 +61,18 @@ module RedAmber
49
61
  @data.value_type.nick.to_sym
50
62
  end
51
63
 
64
+ def boolean?
65
+ type == :boolean
66
+ end
67
+
68
+ def numeric?
69
+ %i[int8 uint8 int16 uint16 int32 uint32 int64 uint64 float double].member? type
70
+ end
71
+
72
+ def string?
73
+ type == :string
74
+ end
75
+
52
76
  def data_type
53
77
  @data.value_type
54
78
  end
@@ -72,5 +96,10 @@ module RedAmber
72
96
  def n_nulls
73
97
  @data.n_nulls
74
98
  end
99
+ alias_method :n_nils, :n_nulls
100
+
101
+ def n_nans
102
+ numeric? ? is_nan.to_a.count(true) : 0
103
+ end
75
104
  end
76
105
  end
@@ -1,69 +1,113 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
4
+ # reference: https://arrow.apache.org/docs/cpp/compute.html
5
+
6
+ # Not implemented in Red Arrow 8.0.0
7
+ # divmod, # '%',
8
+ # true_unless_null
9
+
3
10
  module RedAmber
4
11
  # mix-ins for class Vector
5
12
  module VectorFunctions
6
- # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
7
- # reference: https://arrow.apache.org/docs/cpp/compute.html
8
-
9
- # [Unary aggregations]: vector.func => Scalar
13
+ # [Unary aggregations]: vector.func => scalar
10
14
  unary_aggregations =
11
- %i[all any approximate_median count count_distinct max mean min \
12
- product stddev sum variance]
15
+ %i[all any approximate_median count count_distinct max mean min product stddev sum variance]
13
16
  unary_aggregations.each do |function|
14
- define_method(function) { exec_func(function, other: nil, options: { aggregate: true }) }
17
+ define_method(function) do |opts: nil|
18
+ output = exec_func_unary(function, options: opts)
19
+ take_out_scalar(output)
20
+ end
15
21
  end
22
+ alias_method :median, :approximate_median
16
23
  alias_method :count_uniq, :count_distinct
17
24
 
18
25
  # option(s) required
19
- # index
26
+ # - index
20
27
 
21
28
  # Returns other than value
22
- # min_max
23
- # mode
24
- # quantile
25
- # tdigest
26
-
27
- # [Unary element-wise]: vector.func => Vector
28
- unary_element_wise = %i[abs atan ceil cos floor sign sin tan trunc]
29
+ # - min_max
30
+ # - mode
31
+ # - quantile
32
+ # - tdigest
33
+
34
+ # [Unary element-wise]: vector.func => vector
35
+ unary_element_wise =
36
+ %i[abs atan bit_wise_not ceil cos floor is_finite is_inf is_nan is_null is_valid sign sin tan trunc]
29
37
  unary_element_wise.each do |function|
30
- define_method(function) { exec_func(function, other: nil, options: {}) }
38
+ define_method(function) do |opts: nil|
39
+ output = exec_func_unary(function, options: opts)
40
+ take_out_element_wise(output)
41
+ end
31
42
  end
43
+ alias_method :is_nil, :is_null
32
44
 
33
- # [Unary element-wise with operator]: vector.func => Vector
45
+ def is_na
46
+ numeric? ? (is_nil | is_nan) : is_nil
47
+ end
48
+
49
+ # [Unary element-wise with operator]: vector.func => vector, op vector
34
50
  unary_element_wise_op = {
51
+ invert: '!',
35
52
  negate: '-@',
36
53
  }
37
54
  unary_element_wise_op.each do |function, operator|
38
- define_method(function) { exec_func(function, other: nil, options: {}) }
39
- define_method(operator) { exec_func(function, other: nil, options: {}) }
55
+ define_method(function) do |opts: nil|
56
+ output = exec_func_unary(function, options: opts)
57
+ take_out_element_wise(output)
58
+ end
59
+
60
+ define_method(operator) do |opts: nil|
61
+ output = exec_func_unary(function, options: opts)
62
+ take_out_element_wise(output)
63
+ end
40
64
  end
65
+ alias_method :not, :invert
41
66
 
42
- # bit_wise_not => '!', invert, round, round_to_multiple
67
+ # option(s) required
68
+ # - round, round_to_multiple
43
69
 
44
70
  # NaN support needed
45
- # %i[acos asin ln log10 log1p log2]
71
+ # - acos asin ln log10 log1p log2
46
72
 
47
- # With numerical range check
48
- # %i[abs_checked acos_checked asin_checked cos_checked ln_checked \
49
- # log10_checked log1p_checked log2_checked sin_checked tan_checked]
73
+ # Functions with numerical range check
74
+ # - abs_checked acos_checked asin_checked cos_checked ln_checked
75
+ # log10_checked log1p_checked log2_checked sin_checked tan_checked
50
76
 
51
- # [Binary element-wise]: vector.func(other) => Vector
52
- binary_element_wise = %i[atan2 and and_kleene and_not and_not_kleene or or_kleene xor]
77
+ # [Binary element-wise]: vector.func(other) => vector
78
+ binary_element_wise =
79
+ %i[atan2 and_not and_not_kleene bit_wise_and bit_wise_or bit_wise_xor]
53
80
  binary_element_wise.each do |function|
54
- define_method(function) do |other|
55
- exec_func(function, other: other, options: {})
81
+ define_method(function) do |other, opts: nil|
82
+ output = exec_func_binary(function, other, options: opts)
83
+ take_out_element_wise(output)
84
+ end
85
+ end
86
+
87
+ # [Logical binary element-wise]: vector.func(other) => vector
88
+ logical_binary_element_wise = {
89
+ '&': :and_kleene,
90
+ and_kleene: :and_kleene,
91
+ and_org: :and,
92
+ '|': :or_kleene,
93
+ or_kleene: :or_kleene,
94
+ or_org: :or,
95
+ }
96
+ logical_binary_element_wise.each do |method, function|
97
+ define_method(method) do |other, opts: nil|
98
+ output = exec_func_binary(function, other, options: opts)
99
+ take_out_element_wise(output)
56
100
  end
57
101
  end
58
102
 
59
103
  # NaN support needed
60
- # logb
104
+ # - logb
61
105
 
62
- # With numerical range check
63
- # %i[add_checked divide_checked logb_checked multiply_checked power_checked subtract_checked \
64
- # shift_left_checked shift_right_checked]
106
+ # Functions with numerical range check
107
+ # - add_checked divide_checked logb_checked multiply_checked power_checked subtract_checked
108
+ # shift_left_checked shift_right_checked
65
109
 
66
- # [Binary element-wise with operator]: vector.func(other) => Vector
110
+ # [Binary element-wise with operator]: vector.func(other) => vector
67
111
  binary_element_wise_op = {
68
112
  add: '+',
69
113
  divide: '/',
@@ -71,9 +115,7 @@ module RedAmber
71
115
  power: '**',
72
116
  subtract: '-',
73
117
 
74
- bit_wise_and: '&',
75
- bit_wise_or: '|',
76
- bit_wise_xor: '^',
118
+ xor: '^',
77
119
  shift_left: '<<',
78
120
  shift_right: '>>',
79
121
 
@@ -85,11 +127,14 @@ module RedAmber
85
127
  not_equal: '!=',
86
128
  }
87
129
  binary_element_wise_op.each do |function, operator|
88
- define_method(function) do |other|
89
- exec_func(function, other: other, options: {})
130
+ define_method(function) do |other, opts: nil|
131
+ output = exec_func_binary(function, other, options: opts)
132
+ take_out_element_wise(output)
90
133
  end
91
- define_method(operator) do |other|
92
- exec_func(function, other: other, options: {})
134
+
135
+ define_method(operator) do |other, opts: nil|
136
+ output = exec_func_binary(function, other, options: opts)
137
+ take_out_element_wise(output)
93
138
  end
94
139
  end
95
140
  alias_method :eq, :equal
@@ -99,8 +144,6 @@ module RedAmber
99
144
  alias_method :lt, :less
100
145
  alias_method :ne, :not_equal
101
146
 
102
- # mod: '%',
103
-
104
147
  # (array functions)
105
148
  # array_filter, array_sort_indices, array_take
106
149
  # dictionary_encode, hash_all, hash_any, hash_approximate_median,
@@ -144,29 +187,37 @@ module RedAmber
144
187
 
145
188
  # (others)
146
189
  # coalesce, drop_null, fill_null_backward, fill_null_forward,
147
- # filter, is_finite, is_in, is_in_meta_binary, is_inf, is_nan, is_null, is_valid,
190
+ # filter, is_in, is_in_meta_binary,
148
191
  # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
149
192
  # max_element_wise, min_element_wise, random, replace_with_mask, select_k_unstable,
150
193
  # sort_indices, struct_field, take
151
194
 
152
195
  private # =======
153
196
 
154
- def exec_func(function, other: nil, options: {})
197
+ def exec_func_unary(function, options: nil)
155
198
  func = Arrow::Function.find(function)
156
- output =
157
- case other
158
- when nil
159
- func.execute([data])
160
- when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Numeric
161
- func.execute([data, other])
162
- when Vector
163
- func.execute([data, other.data])
164
- when Rover::Vector
165
- func.execute([data, other.to_a])
166
- else
167
- raise ArgumentError, "Operand is not supported: #{other.class}"
168
- end
169
- options[:aggregate] ? output.value : Vector.new(output.value)
199
+ func.execute([data], options)
200
+ end
201
+
202
+ def exec_func_binary(function, other, options: nil)
203
+ func = Arrow::Function.find(function)
204
+ case other
205
+ when Vector
206
+ func.execute([data, other.data], options)
207
+ when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric
208
+ func.execute([data, other], options)
209
+ else
210
+ raise ArgumentError, "Operand is not supported: #{other.class}"
211
+ end
212
+ end
213
+
214
+ def take_out_scalar(output)
215
+ output = output.value
216
+ output.is_a?(Arrow::StringScalar) ? output.to_s : output.value
217
+ end
218
+
219
+ def take_out_element_wise(output)
220
+ Vector.new(output.value)
170
221
  end
171
222
  end
172
223
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- VERSION = '0.1.2'
4
+ VERSION = '0.1.3'
5
5
  end
data/red_amber.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
9
9
  spec.email = ['heronshoes877@gmail.com']
10
10
 
11
11
  spec.summary = 'Simple dataframe library for Ruby'
12
- spec.description = 'RedAmber is a simple dataframe library powered by Red Arrow with simple API similar to Rover-df.'
12
+ spec.description = 'RedAmber is a simple dataframe library powered by Red Arrow with API similar to Rover-df.'
13
13
  spec.homepage = 'https://github.com/heronshoes/red_amber'
14
14
  spec.license = 'MIT'
15
15
  spec.required_ruby_version = '>= 2.7'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red_amber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hirokazu SUZUKI (heronshoes)
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-05-08 00:00:00.000000000 Z
11
+ date: 2022-05-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: red-arrow
@@ -52,8 +52,8 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: 0.3.0
55
- description: RedAmber is a simple dataframe library powered by Red Arrow with simple
56
- API similar to Rover-df.
55
+ description: RedAmber is a simple dataframe library powered by Red Arrow with API
56
+ similar to Rover-df.
57
57
  email:
58
58
  - heronshoes877@gmail.com
59
59
  executables: []