red_amber 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -8
- data/CHANGELOG.md +74 -7
- data/Gemfile +3 -0
- data/README.md +47 -13
- data/benchmark/csv_load_penguins.yml +15 -0
- data/benchmark/drop_nil.yml +11 -0
- data/doc/DataFrame.md +185 -35
- data/doc/Vector.md +132 -10
- data/doc/image/dataframe_model.png +0 -0
- data/doc/tdr.md +14 -11
- data/doc/tdr_ja.md +13 -10
- data/lib/red_amber/data_frame.rb +38 -23
- data/lib/red_amber/data_frame_displayable.rb +4 -3
- data/lib/red_amber/data_frame_helper.rb +8 -8
- data/lib/red_amber/data_frame_indexable.rb +38 -0
- data/lib/red_amber/data_frame_observation_operation.rb +13 -2
- data/lib/red_amber/data_frame_selectable.rb +14 -4
- data/lib/red_amber/vector.rb +28 -5
- data/lib/red_amber/vector_compensable.rb +68 -0
- data/lib/red_amber/vector_functions.rb +16 -13
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +5 -0
- data/red_amber.gemspec +3 -6
- metadata +12 -9
- data/doc/image/TDR_operations.pdf +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4d18eedf5de7fd06fe52e8a82ad38fe12d590dc10929c96872e557b9e946f785
|
4
|
+
data.tar.gz: dda93f0af421096410e00ecf2261e8846a236634bd96ae9941d1b5cd49cd5eb2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7c1b1edd6c1f6f3f275ea765c4bc8765327c88a36120a4c5a66dd8afa59f5913db4a5b436d80378554e03403bab823edf7467beea0f44e2803e36f3e9677a065
|
7
|
+
data.tar.gz: 949fd15d2076d4e53fb141375bde282228c7f6566e137047344134c54964fe77fd2f9757b0bdc324eb3cfa14091f2ae928e0e844d28f3ebbcfa17fc7d388bbd0
|
data/.rubocop.yml
CHANGED
@@ -53,12 +53,10 @@ Layout/LineLength:
|
|
53
53
|
# 18..30 unsatisfactory
|
54
54
|
# > 30 dangerous
|
55
55
|
Metrics/AbcSize:
|
56
|
-
Max:
|
56
|
+
Max: 30
|
57
57
|
Exclude:
|
58
58
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 55
|
59
|
-
- 'lib/red_amber/
|
60
|
-
- 'lib/red_amber/data_frame_observation_operation.rb' # Max: 29
|
61
|
-
- 'lib/red_amber/data_frame_variable_operation.rb' # Max: 26
|
59
|
+
- 'lib/red_amber/vector_compensable.rb' # Max: 36
|
62
60
|
|
63
61
|
# Max: 25
|
64
62
|
Metrics/BlockLength:
|
@@ -68,21 +66,21 @@ Metrics/BlockLength:
|
|
68
66
|
|
69
67
|
# Max: 100
|
70
68
|
Metrics/ClassLength:
|
71
|
-
Max:
|
69
|
+
Max: 120
|
72
70
|
Exclude:
|
73
71
|
- 'test/**/*'
|
74
72
|
|
75
73
|
# Max: 7
|
76
74
|
Metrics/CyclomaticComplexity:
|
77
75
|
Max: 12
|
76
|
+
Exclude:
|
77
|
+
- 'lib/red_amber/vector_compensable.rb' # Max: 14
|
78
78
|
|
79
79
|
# Max: 10
|
80
80
|
Metrics/MethodLength:
|
81
|
-
Max:
|
81
|
+
Max: 30
|
82
82
|
Exclude:
|
83
83
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 33
|
84
|
-
- 'lib/red_amber/data_frame_observation_operation.rb' # Max: 21
|
85
|
-
- 'lib/red_amber/data_frame_variable_operation.rb' # Max: 20
|
86
84
|
|
87
85
|
# Max: 100
|
88
86
|
Metrics/ModuleLength:
|
@@ -93,6 +91,8 @@ Metrics/ModuleLength:
|
|
93
91
|
# Max: 8
|
94
92
|
Metrics/PerceivedComplexity:
|
95
93
|
Max: 13
|
94
|
+
Exclude:
|
95
|
+
- 'lib/red_amber/vector_compensable.rb' # Max: 15
|
96
96
|
|
97
97
|
# Necessary to define is_na
|
98
98
|
Naming/PredicateName:
|
data/CHANGELOG.md
CHANGED
@@ -1,19 +1,86 @@
|
|
1
|
-
##
|
1
|
+
## [0.2.0] - unreleased
|
2
2
|
|
3
|
-
-
|
3
|
+
- Document
|
4
|
+
- YARD support
|
5
|
+
|
6
|
+
- DataFrame#join features
|
7
|
+
|
8
|
+
## [0.1.6] - Unreleased
|
9
|
+
|
10
|
+
- Feedback something to Red Data Tools
|
4
11
|
|
5
12
|
- `DataFrame`
|
6
|
-
- Introduce `group_by`
|
7
|
-
- Introduce `summarize`
|
8
13
|
- Introduce `summary` or `describe`
|
14
|
+
- Add `Quantile` by own code?
|
9
15
|
- Improve dataframe obs. manipulation methods to accept float as an index (#10)
|
10
|
-
-
|
16
|
+
- Improve performance based on benchmark checks.
|
11
17
|
|
12
18
|
- `Vector`
|
13
19
|
- Support more functions
|
20
|
+
- Support coerce
|
14
21
|
|
15
|
-
-
|
16
|
-
|
22
|
+
- More examples of frequently needed tasks
|
23
|
+
|
24
|
+
## [0.1.5] - 2022-06-12 (experimental)
|
25
|
+
|
26
|
+
- Bug fixes
|
27
|
+
- Fix DF#tdr to display timestamp type (#19)
|
28
|
+
- Add TZ setting in CI test to pass temporal tests (#19)
|
29
|
+
- Fix example in document of #load(csv_from_URI) (#23)
|
30
|
+
|
31
|
+
- New features and improvements
|
32
|
+
- Improve usability of DataFrame manipulating block (#19)
|
33
|
+
- Add `DataFrame#v` to select a Vector
|
34
|
+
- Add `DataFrame#variables` method
|
35
|
+
- Add `DataFrame#to_arrow`
|
36
|
+
- Add instance variables in DataFrame with lazy initialization
|
37
|
+
- Add `Vector#key` to get key name
|
38
|
+
- Add `Vector#temporal?` to check if temporal type
|
39
|
+
- Refine around DataFrame#variables
|
40
|
+
- Refine init of instance variables
|
41
|
+
- Refine DataFrame#type_classes, Vector#type_class
|
42
|
+
- Refine DataFrame#tdr to shorten temporal data
|
43
|
+
|
44
|
+
- Add supports to make up for missing values (#20)
|
45
|
+
- Add VectorArgumentError
|
46
|
+
- Add `Vector#replace_with`
|
47
|
+
- Add helper function to assert with NaN
|
48
|
+
- To assert NaN == NaN
|
49
|
+
- Add `Vector#fill_nil_backward`, `Vector#fill_nil_forward`
|
50
|
+
- Add `DataFrame#remove_nil` method
|
51
|
+
- Change to accept nil as replacement in Vector#replace_with
|
52
|
+
|
53
|
+
- Introduce index related methods (#22)
|
54
|
+
- Add `Vector#sort_indexes` method
|
55
|
+
- Add `Vector#uniq` method
|
56
|
+
- Add `Vector#tally` and `Vector#value_counts` methods
|
57
|
+
- Add `DataFrame#sort` method
|
58
|
+
- Add `DataFrame#group` method
|
59
|
+
- Change to use DataFrame#map_indices in #[]
|
60
|
+
|
61
|
+
- Add rounding functions with opts (#21)
|
62
|
+
- With options :mode and :n_digits
|
63
|
+
- :n_digits also can be specified with :multiple option in `Vector#round_to_multiple`
|
64
|
+
- `Vector#round`
|
65
|
+
- `Vector#ceil`
|
66
|
+
- `Vector#floor`
|
67
|
+
- `Vector#trunc`
|
68
|
+
|
69
|
+
- Documentation
|
70
|
+
- Update TDR, TDR_ja documents to latest (#18)
|
71
|
+
- Refinement and small fix in DataFrame.md (#18)
|
72
|
+
- Update README to use more effective example (#18)
|
73
|
+
- Delete expired TDR_operations.pdf (#23)
|
74
|
+
- Update README and dataframe_model image (#23)
|
75
|
+
- Update description about rover-df in README (#23)
|
76
|
+
- Add installation of Arrow in README (#23)
|
77
|
+
|
78
|
+
- Others
|
79
|
+
- Tried but cannot use bundler cache in ci test (#17)
|
80
|
+
- Bump up requirements to Arrow 8.0.0 (#25)
|
81
|
+
- Arrow 7.0.0 with Ubuntu 21.04 causes a fatal error in replace_with_mask function.
|
82
|
+
- Update the description of gem (#23)
|
83
|
+
- Add benchmark tests (#26)
|
17
84
|
|
18
85
|
## [0.1.4] - 2022-05-29 (experimental)
|
19
86
|
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -3,18 +3,27 @@
|
|
3
3
|
A simple dataframe library for Ruby (experimental)
|
4
4
|
|
5
5
|
- Powered by [Red Arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow)
|
6
|
-
-
|
6
|
+
- Inspired by the dataframe library [Rover-df](https://github.com/ankane/rover)
|
7
7
|
|
8
8
|
## Requirements
|
9
9
|
|
10
10
|
```ruby
|
11
|
-
gem 'red-arrow', '>=
|
12
|
-
gem 'red-parquet', '>=
|
11
|
+
gem 'red-arrow', '>= 8.0.0'
|
12
|
+
gem 'red-parquet', '>= 8.0.0' # if you use IO from/to parquet
|
13
13
|
gem 'rover-df', '~> 0.3.0' # if you use IO from/to Rover::DataFrame
|
14
14
|
```
|
15
15
|
|
16
16
|
## Installation
|
17
17
|
|
18
|
+
Install requirements before you install Red Amber.
|
19
|
+
|
20
|
+
- Apache Arrow GLib (>= 8.0.0)
|
21
|
+
- Apache Parquet GLib (>= 8.0.0)
|
22
|
+
|
23
|
+
See [Apache Arrow install document](https://arrow.apache.org/install/).
|
24
|
+
|
25
|
+
Minimum installation example for the latest Ubuntu is in the ['Prepare the Apache Arrow' section in ci test](https://github.com/heronshoes/red_amber/blob/master/.github/workflows/test.yml) of Red Amber.
|
26
|
+
|
18
27
|
Add this line to your Gemfile:
|
19
28
|
|
20
29
|
```ruby
|
@@ -41,8 +50,9 @@ Represents a set of data in 2D-shape.
|
|
41
50
|
require 'red_amber'
|
42
51
|
require 'datasets-arrow'
|
43
52
|
|
44
|
-
|
45
|
-
|
53
|
+
arrow = Datasets::Penguins.new.to_arrow
|
54
|
+
penguins = RedAmber::DataFrame.new(arrow)
|
55
|
+
penguins.tdr
|
46
56
|
# =>
|
47
57
|
RedAmber::DataFrame : 344 x 8 Vectors
|
48
58
|
Vectors : 5 numeric, 3 strings
|
@@ -71,12 +81,10 @@ Vector : 1 numeric
|
|
71
81
|
1 :body_mass_g int64 95 [3750, 3800, 3250, nil, 3450, ... ], 2 nils
|
72
82
|
```
|
73
83
|
|
74
|
-
`DataFrame#assign`
|
84
|
+
`DataFrame#assign` creates new variables (column in the table).
|
75
85
|
|
76
86
|
```ruby
|
77
|
-
df.assign
|
78
|
-
{:body_mass_kg => penguins[:body_mass_g] / 1000.0}
|
79
|
-
end
|
87
|
+
df.assign(:body_mass_kg => df[:body_mass_g] / 1000.0)
|
80
88
|
# =>
|
81
89
|
#<RedAmber::DataFrame : 344 x 2 Vectors, 0x000000000000fa28>
|
82
90
|
Vectors : 2 numeric
|
@@ -85,7 +93,33 @@ Vectors : 2 numeric
|
|
85
93
|
2 :body_mass_kg double 95 [3.75, 3.8, 3.25, nil, 3.45, ... ], 2 nils
|
86
94
|
```
|
87
95
|
|
88
|
-
|
96
|
+
DataFrame manipulating methods like `pick`, `drop`, `slice`, `remove`, `rename` and `assign` accept a block.
|
97
|
+
|
98
|
+
This is an example to eliminate observations (row in the table) containing nil.
|
99
|
+
|
100
|
+
```ruby
|
101
|
+
# remove all observations containing nil
|
102
|
+
nil_removed = penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
|
103
|
+
nil_removed.tdr
|
104
|
+
# =>
|
105
|
+
RedAmber::DataFrame : 342 x 8 Vectors
|
106
|
+
Vectors : 5 numeric, 3 strings
|
107
|
+
# key type level data_preview
|
108
|
+
1 :species string 3 {"Adelie"=>151, "Chinstrap"=>68, "Gentoo"=>123}
|
109
|
+
2 :island string 3 {"Torgersen"=>51, "Biscoe"=>167, "Dream"=>124}
|
110
|
+
3 :bill_length_mm double 164 [39.1, 39.5, 40.3, 36.7, 39.3, ... ]
|
111
|
+
4 :bill_depth_mm double 80 [18.7, 17.4, 18.0, 19.3, 20.6, ... ]
|
112
|
+
5 :flipper_length_mm int64 55 [181, 186, 195, 193, 190, ... ]
|
113
|
+
6 :body_mass_g int64 94 [3750, 3800, 3250, 3450, 3650, ... ]
|
114
|
+
7 :sex string 3 {"male"=>168, "female"=>165, ""=>9}
|
115
|
+
8 :year int64 3 {2007=>109, 2008=>114, 2009=>119}
|
116
|
+
```
|
117
|
+
|
118
|
+
For this frequently needed task, we can do it much simpler.
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
penguins.remove_nil # => same result as above
|
122
|
+
```
|
89
123
|
|
90
124
|
See [DataFrame.md](doc/DataFrame.md) for details.
|
91
125
|
|
@@ -95,10 +129,10 @@ See [DataFrame.md](doc/DataFrame.md) for details.
|
|
95
129
|
Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
96
130
|
|
97
131
|
```ruby
|
98
|
-
penguins[:
|
132
|
+
penguins[:bill_length_mm]
|
99
133
|
# =>
|
100
|
-
#<RedAmber::Vector(:
|
101
|
-
[
|
134
|
+
#<RedAmber::Vector(:double, size=344):0x000000000000f8fc>
|
135
|
+
[39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
|
102
136
|
```
|
103
137
|
|
104
138
|
Vectors accept some [functional methods from Arrow](https://arrow.apache.org/docs/cpp/compute.html).
|
@@ -0,0 +1,15 @@
|
|
1
|
+
prelude: |
|
2
|
+
require 'datasets-arrow'
|
3
|
+
require 'rover'
|
4
|
+
require 'red_amber'
|
5
|
+
|
6
|
+
penguins_csv = 'benchmark/cache/penguins.csv'
|
7
|
+
|
8
|
+
unless File.exist?(penguins_csv)
|
9
|
+
arrow = Datasets::Penguins.new.to_arrow
|
10
|
+
RedAmber::DataFrame.new(arrow).save(penguins_csv)
|
11
|
+
end
|
12
|
+
|
13
|
+
benchmark:
|
14
|
+
'penguins by Rover': Rover.read_csv(penguins_csv)
|
15
|
+
'penguins by RedAmber': RedAmber::DataFrame.load(penguins_csv)
|
data/doc/DataFrame.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# DataFrame
|
2
2
|
|
3
|
-
Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
3
|
+
Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists of:
|
4
4
|
- A collection of data which have same data type within. We call it `Vector`.
|
5
5
|
- A label is attached to `Vector`. We call it `key`.
|
6
6
|
- A `Vector` and associated `key` is grouped as a `variable`.
|
@@ -11,13 +11,13 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
11
11
|
|
12
12
|
## Constructors and saving
|
13
13
|
|
14
|
-
### `new` from a
|
14
|
+
### `new` from a Hash
|
15
15
|
|
16
16
|
```ruby
|
17
17
|
RedAmber::DataFrame.new(x: [1, 2, 3])
|
18
18
|
```
|
19
19
|
|
20
|
-
### `new` from a schema (by Hash) and
|
20
|
+
### `new` from a schema (by Hash) and data (by Array)
|
21
21
|
|
22
22
|
```ruby
|
23
23
|
RedAmber::DataFrame.new({:x=>:uint8}, [[1], [2], [3]])
|
@@ -52,7 +52,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
52
52
|
- from a URI
|
53
53
|
|
54
54
|
```ruby
|
55
|
-
uri = URI("https://
|
55
|
+
uri = URI("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv")
|
56
56
|
RedAmber::DataFrame.load(uri)
|
57
57
|
```
|
58
58
|
|
@@ -78,7 +78,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
78
78
|
|
79
79
|
## Properties
|
80
80
|
|
81
|
-
### `table`
|
81
|
+
### `table`, `to_arrow`
|
82
82
|
|
83
83
|
- Reader of Arrow::Table object inside.
|
84
84
|
|
@@ -93,16 +93,53 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
93
93
|
### `shape`
|
94
94
|
|
95
95
|
- Returns shape in an Array[n_rows, n_cols].
|
96
|
-
|
96
|
+
|
97
|
+
### `variables`
|
98
|
+
|
99
|
+
- Returns key names and Vectors pair in a Hash.
|
100
|
+
|
101
|
+
It is convenient to use in a block when both key and vector are required. We will write:
|
102
|
+
|
103
|
+
```ruby
|
104
|
+
# update numeric variables
|
105
|
+
df.assign do
|
106
|
+
variables.select.with_object({}) do |(key, vector), assigner|
|
107
|
+
assigner[key] = vector * -1 if vector.numeric?
|
108
|
+
end
|
109
|
+
end
|
110
|
+
```
|
111
|
+
|
112
|
+
Instead of:
|
113
|
+
```ruby
|
114
|
+
df.assign do
|
115
|
+
assigner = {}
|
116
|
+
vectors.each_with_index do |vector, i|
|
117
|
+
assigner[keys[i]] = vector * -1 if vector.numeric?
|
118
|
+
end
|
119
|
+
assigner
|
120
|
+
end
|
121
|
+
```
|
122
|
+
|
97
123
|
### `keys`, `var_names`, `column_names`
|
98
124
|
|
99
125
|
- Returns key names in an Array.
|
100
126
|
|
127
|
+
When we use it with vectors, Vector#key is useful to get the key inside of DataFrame.
|
128
|
+
|
129
|
+
```ruby
|
130
|
+
# update numeric variables, another solution
|
131
|
+
df.assign do
|
132
|
+
vectors.each_with_object({}) do |vector, assigner|
|
133
|
+
assigner[vector.key] = vector * -1 if vector.numeric?
|
134
|
+
end
|
135
|
+
end
|
136
|
+
```
|
137
|
+
|
101
138
|
### `types`
|
102
139
|
|
103
140
|
- Returns types of vectors in an Array of Symbols.
|
104
141
|
|
105
|
-
### `
|
142
|
+
### `type_classes`
|
106
143
|
|
107
144
|
- Returns types of vector in an Array of `Arrow::DataType`.
|
108
145
|
|
@@ -167,7 +204,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
167
204
|
8 :year uint16 3 {2007=>110, 2008=>114, 2009=>120}
|
168
205
|
```
|
169
206
|
|
170
|
-
- limit:
|
207
|
+
- limit: limit of variables to show. Default value is 10.
|
171
208
|
- tally: max level to use tally mode.
|
172
209
|
- elements: max num of element to show values in each observations.
|
173
210
|
|
@@ -224,7 +261,16 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
224
261
|
#<RedAmber::Vector(:uint8, size=3):0x000000000000f140>
|
225
262
|
[1, 2, 3]
|
226
263
|
```
|
227
|
-
|
264
|
+
Or `#v` method also returns a Vector for a key.
|
265
|
+
|
266
|
+
```ruby
|
267
|
+
df.v(:a)
|
268
|
+
# =>
|
269
|
+
#<RedAmber::Vector(:uint8, size=3):0x000000000000f140>
|
270
|
+
[1, 2, 3]
|
271
|
+
```
|
272
|
+
|
273
|
+
This may be useful to use in a block of DataFrame manipulation verbs. We can write `v(:a)` rather than `self[:a]` or `df[:a]`
|
228
274
|
|
229
275
|
### Select observations (rows in a table) by `[]` as `[index]`, `[range]`, `[array]`
|
230
276
|
|
@@ -267,13 +313,13 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
267
313
|
3 :c double 1 [1.0]
|
268
314
|
```
|
269
315
|
|
270
|
-
### Select rows from top or bottom
|
316
|
+
### Select rows from top or from bottom
|
271
317
|
|
272
318
|
`head(n=5)`, `tail(n=5)`, `first(n=1)`, `last(n=1)`
|
273
319
|
|
274
320
|
## Sub DataFrame manipulations
|
275
321
|
|
276
|
-
### `pick`
|
322
|
+
### `pick ` - pick up variables by key label -
|
277
323
|
|
278
324
|
Pick up some variables (columns) to create a sub DataFrame.
|
279
325
|
|
@@ -313,6 +359,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
313
359
|
`pick {block}` is also acceptable. We can't use both arguments and a block at the same time. The block should return keys, or a boolean Array with the same length as `n_keys`. The block is called in the context of self.
|
314
360
|
|
315
361
|
```ruby
|
362
|
+
# It is ok to write `keys ...` in the block, not `penguins.keys ...`
|
316
363
|
penguins.pick { keys.map { |key| key.end_with?('mm') } }
|
317
364
|
# =>
|
318
365
|
#<RedAmber::DataFrame : 344 x 3 Vectors, 0x000000000000f1cc>
|
@@ -323,7 +370,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
323
370
|
3 :flipper_length_mm int64 56 [181, 186, 195, nil, 193, ... ], 2 nils
|
324
371
|
```
|
325
372
|
|
326
|
-
### `drop`
|
373
|
+
### `drop ` - pick and drop -
|
327
374
|
|
328
375
|
Drop some variables (columns) to create a remainder DataFrame.
|
329
376
|
|
@@ -352,15 +399,10 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
352
399
|
```
|
353
400
|
- Difference between `pick`/`drop` and `[]`
|
354
401
|
|
355
|
-
If `pick` or `drop` will select single variable (column), it returns a `DataFrame` with one variable. In contrast, `[]` returns a `Vector`.
|
402
|
+
If `pick` or `drop` will select a single variable (column), it returns a `DataFrame` with one variable. In contrast, `[]` returns a `Vector`. This behavior may be useful to use in a block of DataFrame manipulations.
|
356
403
|
|
357
404
|
```ruby
|
358
405
|
df = RedAmber::DataFrame.new(a: [1, 2, 3], b: %w[A B C], c: [1.0, 2, 3])
|
359
|
-
df[:a]
|
360
|
-
# =>
|
361
|
-
#<RedAmber::Vector(:uint8, size=3):0x000000000000f258>
|
362
|
-
[1, 2, 3]
|
363
|
-
|
364
406
|
df.pick(:a) # or
|
365
407
|
df.drop(:b, :c)
|
366
408
|
# =>
|
@@ -368,9 +410,14 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
368
410
|
Vector : 1 numeric
|
369
411
|
# key type level data_preview
|
370
412
|
1 :a uint8 3 [1, 2, 3]
|
413
|
+
|
414
|
+
df[:a]
|
415
|
+
# =>
|
416
|
+
#<RedAmber::Vector(:uint8, size=3):0x000000000000f258>
|
417
|
+
[1, 2, 3]
|
371
418
|
```
|
372
419
|
|
373
|
-
### `slice`
|
420
|
+
### `slice ` - to cut vertically is slice -
|
374
421
|
|
375
422
|
Slice and select observations (rows) to create a sub DataFrame.
|
376
423
|
|
@@ -488,17 +535,17 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
488
535
|
removed = penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
|
489
536
|
removed.tdr
|
490
537
|
# =>
|
491
|
-
RedAmber::DataFrame :
|
538
|
+
RedAmber::DataFrame : 333 x 8 Vectors
|
492
539
|
Vectors : 5 numeric, 3 strings
|
493
540
|
# key type level data_preview
|
494
|
-
1 :species string 3 {"Adelie"=>
|
495
|
-
2 :island string 3 {"Torgersen"=>
|
496
|
-
3 :bill_length_mm double
|
497
|
-
4 :bill_depth_mm double
|
498
|
-
5 :flipper_length_mm
|
499
|
-
6 :body_mass_g
|
500
|
-
7 :sex string
|
501
|
-
8 :year
|
541
|
+
1 :species string 3 {"Adelie"=>146, "Chinstrap"=>68, "Gentoo"=>119}
|
542
|
+
2 :island string 3 {"Torgersen"=>47, "Biscoe"=>163, "Dream"=>123}
|
543
|
+
3 :bill_length_mm double 163 [39.1, 39.5, 40.3, 36.7, 39.3, ... ]
|
544
|
+
4 :bill_depth_mm double 79 [18.7, 17.4, 18.0, 19.3, 20.6, ... ]
|
545
|
+
5 :flipper_length_mm uint8 54 [181, 186, 195, 193, 190, ... ]
|
546
|
+
6 :body_mass_g uint16 93 [3750, 3800, 3250, 3450, 3650, ... ]
|
547
|
+
7 :sex string 2 {"male"=>168, "female"=>165}
|
548
|
+
8 :year uint16 3 {2007=>103, 2008=>113, 2009=>117}
|
502
549
|
```
|
503
550
|
|
504
551
|
- Keys or booleans by a block
|
@@ -583,7 +630,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
583
630
|
|
584
631
|
### `assign`
|
585
632
|
|
586
|
-
Assign new variables (columns) and create a updated DataFrame.
|
633
|
+
Assign new or updated variables (columns) and create an updated DataFrame.
|
587
634
|
|
588
635
|
- Variables with new keys will append new variables at bottom (right in the table).
|
589
636
|
- Variables with existing keys will update corresponding vectors.
|
@@ -649,6 +696,14 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
649
696
|
1 :index int8 5 [0, -1, -2, -3, nil], 1 nil
|
650
697
|
2 :float double 5 [-0.0, -1.1, -2.2, NaN, nil], 1 NaN, 1 nil
|
651
698
|
3 :string string 5 ["A", "B", "C", "D", nil], 1 nil
|
699
|
+
|
700
|
+
# Or it's shorter like this:
|
701
|
+
df.assign do
|
702
|
+
variables.select.with_object({}) do |(key, vector), assigner|
|
703
|
+
assigner[key] = vector * -1 if vector.numeric?
|
704
|
+
end
|
705
|
+
end
|
706
|
+
# => same as above
|
652
707
|
```
|
653
708
|
|
654
709
|
- Key type
|
@@ -657,21 +712,116 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
657
712
|
|
658
713
|
## Updating
|
659
714
|
|
660
|
-
|
715
|
+
### `sort`
|
661
716
|
|
662
|
-
|
717
|
+
`sort` accepts parameters as sort_keys thanks to the amazing Red Arrow feature.
|
718
|
+
- :key, "key" or "+key" denotes ascending order
|
719
|
+
- "-key" denotes descending order
|
720
|
+
|
721
|
+
```ruby
|
722
|
+
df = RedAmber::DataFrame.new({
|
723
|
+
index: [1, 1, 0, nil, 0],
|
724
|
+
string: ['C', 'B', nil, 'A', 'B'],
|
725
|
+
bool: [nil, true, false, true, false],
|
726
|
+
})
|
727
|
+
df.sort(:index, '-bool').tdr(tally: 0)
|
728
|
+
# =>
|
729
|
+
RedAmber::DataFrame : 5 x 3 Vectors
|
730
|
+
Vectors : 1 numeric, 1 string, 1 boolean
|
731
|
+
# key type level data_preview
|
732
|
+
1 :index uint8 3 [0, 0, 1, 1, nil], 1 nil
|
733
|
+
2 :string string 4 [nil, "B", "B", "C", "A"], 1 nil
|
734
|
+
3 :bool boolean 3 [false, false, true, nil, true], 1 nil
|
735
|
+
```
|
663
736
|
|
664
|
-
- [ ]
|
737
|
+
- [ ] Clamp
|
665
738
|
|
666
739
|
- [ ] Clear data
|
667
740
|
|
668
741
|
## Treat na data
|
669
742
|
|
670
|
-
|
743
|
+
### `remove_nil`
|
744
|
+
|
745
|
+
Remove any observations containing nil.
|
746
|
+
|
747
|
+
## Grouping
|
748
|
+
|
749
|
+
### `group(aggregating_keys, function, target_keys)`
|
671
750
|
|
672
|
-
|
751
|
+
Create a grouped DataFrame by `aggregating_keys`, apply `function` to each group, and return the results for `target_keys`. Aggregated key names are in `function(key)` style.
|
752
|
+
|
753
|
+
(The current implementation is not intuitive. Needs improvement.)
|
754
|
+
|
755
|
+
```ruby
|
756
|
+
ds = Datasets::Rdatasets.new('dplyr', 'starwars')
|
757
|
+
starwars = RedAmber::DataFrame.new(ds.to_table.to_h)
|
758
|
+
starwars.tdr(11)
|
759
|
+
# =>
|
760
|
+
RedAmber::DataFrame : 87 x 11 Vectors
|
761
|
+
Vectors : 3 numeric, 8 strings
|
762
|
+
# key type level data_preview
|
763
|
+
1 :name string 87 ["Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", "Leia Organa", ... ]
|
764
|
+
2 :height uint16 46 [172, 167, 96, 202, 150, ... ], 6 nils
|
765
|
+
3 :mass double 39 [77.0, 75.0, 32.0, 136.0, 49.0, ... ], 28 nils
|
766
|
+
4 :hair_color string 13 ["blond", nil, nil, "none", "brown", ... ], 5 nils
|
767
|
+
5 :skin_color string 31 ["fair", "gold", "white, blue", "white", "light", ... ]
|
768
|
+
6 :eye_color string 15 ["blue", "yellow", "red", "yellow", "brown", ... ]
|
769
|
+
7 :birth_year double 37 [19.0, 112.0, 33.0, 41.9, 19.0, ... ], 44 nils
|
770
|
+
8 :sex string 5 {"male"=>60, "none"=>6, "female"=>16, "hermaphroditic"=>1, nil=>4}
|
771
|
+
9 :gender string 3 {"masculine"=>66, "feminine"=>17, nil=>4}
|
772
|
+
10 :homeworld string 49 ["Tatooine", "Tatooine", "Naboo", "Tatooine", "Alderaan", ... ], 10 nils
|
773
|
+
11 :species string 38 ["Human", "Droid", "Droid", "Human", "Human", ... ], 4 nils
|
774
|
+
|
775
|
+
grouped = starwars.group(:species, :mean, [:mass, :height])
|
776
|
+
# =>
|
777
|
+
#<RedAmber::DataFrame : 38 x 3 Vectors, 0x000000000000fbf4>
|
778
|
+
Vectors : 2 numeric, 1 string
|
779
|
+
# key type level data_preview
|
780
|
+
1 :"mean(mass)" double 27 [82.78181818181818, 69.75, 124.0, 74.0, 1358.0, ... ], 6 nils
|
781
|
+
2 :"mean(height)" double 32 [176.6451612903226, 131.2, 231.0, 173.0, 175.0, ... ]
|
782
|
+
3 :species string 38 ["Human", "Droid", "Wookiee", "Rodian", "Hutt", ... ], 1 nil
|
783
|
+
|
784
|
+
count = starwars.group(:species, :count, :species)[:"count(species)"]
|
785
|
+
df = grouped.slice(count > 1)
|
786
|
+
# =>
|
787
|
+
#<RedAmber::DataFrame : 8 x 3 Vectors, 0x000000000000fc44>
|
788
|
+
Vectors : 2 numeric, 1 string
|
789
|
+
# key type level data_preview
|
790
|
+
1 :"mean(mass)" double 8 [82.78181818181818, 69.75, 124.0, 74.0, 80.0, ... ]
|
791
|
+
2 :"mean(height)" double 8 [176.6451612903226, 131.2, 231.0, 208.66666666666666, 173.0, ... ]
|
792
|
+
3 :species string 8 ["Human", "Droid", "Wookiee", "Gungan", "Zabrak", ... ]
|
793
|
+
|
794
|
+
df.table
|
795
|
+
# =>
|
796
|
+
#<Arrow::Table:0x1165593c8 ptr=0x7fb3db144c70>
|
797
|
+
mean(mass) mean(height) species
|
798
|
+
0 82.781818 176.645161 Human
|
799
|
+
1 69.750000 131.200000 Droid
|
800
|
+
2 124.000000 231.000000 Wookiee
|
801
|
+
3 74.000000 208.666667 Gungan
|
802
|
+
4 80.000000 173.000000 Zabrak
|
803
|
+
5 55.000000 179.000000 Twi'lek
|
804
|
+
6 53.100000 168.000000 Mirialan
|
805
|
+
7 88.000000 221.000000 Kaminoan
|
806
|
+
```
|
673
807
|
|
674
|
-
|
808
|
+
Available functions are:
|
809
|
+
|
810
|
+
- [ ] all
|
811
|
+
- [ ] any
|
812
|
+
- [ ] approximate_median
|
813
|
+
- ✓ count
|
814
|
+
- [ ] count_distinct
|
815
|
+
- [ ] distinct
|
816
|
+
- ✓ max
|
817
|
+
- ✓ mean
|
818
|
+
- ✓ min
|
819
|
+
- [ ] min_max
|
820
|
+
- ✓ product
|
821
|
+
- ✓ stddev
|
822
|
+
- ✓ sum
|
823
|
+
- [ ] tdigest
|
824
|
+
- ✓ variance
|
675
825
|
|
676
826
|
## Combining DataFrames
|
677
827
|
|