red_amber 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -8
- data/CHANGELOG.md +74 -7
- data/Gemfile +3 -0
- data/README.md +47 -13
- data/benchmark/csv_load_penguins.yml +15 -0
- data/benchmark/drop_nil.yml +11 -0
- data/doc/DataFrame.md +185 -35
- data/doc/Vector.md +132 -10
- data/doc/image/dataframe_model.png +0 -0
- data/doc/tdr.md +14 -11
- data/doc/tdr_ja.md +13 -10
- data/lib/red_amber/data_frame.rb +38 -23
- data/lib/red_amber/data_frame_displayable.rb +4 -3
- data/lib/red_amber/data_frame_helper.rb +8 -8
- data/lib/red_amber/data_frame_indexable.rb +38 -0
- data/lib/red_amber/data_frame_observation_operation.rb +13 -2
- data/lib/red_amber/data_frame_selectable.rb +14 -4
- data/lib/red_amber/vector.rb +28 -5
- data/lib/red_amber/vector_compensable.rb +68 -0
- data/lib/red_amber/vector_functions.rb +16 -13
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +5 -0
- data/red_amber.gemspec +3 -6
- metadata +12 -9
- data/doc/image/TDR_operations.pdf +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4d18eedf5de7fd06fe52e8a82ad38fe12d590dc10929c96872e557b9e946f785
|
4
|
+
data.tar.gz: dda93f0af421096410e00ecf2261e8846a236634bd96ae9941d1b5cd49cd5eb2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7c1b1edd6c1f6f3f275ea765c4bc8765327c88a36120a4c5a66dd8afa59f5913db4a5b436d80378554e03403bab823edf7467beea0f44e2803e36f3e9677a065
|
7
|
+
data.tar.gz: 949fd15d2076d4e53fb141375bde282228c7f6566e137047344134c54964fe77fd2f9757b0bdc324eb3cfa14091f2ae928e0e844d28f3ebbcfa17fc7d388bbd0
|
data/.rubocop.yml
CHANGED
@@ -53,12 +53,10 @@ Layout/LineLength:
|
|
53
53
|
# 18..30 unsatisfactory
|
54
54
|
# > 30 dangerous
|
55
55
|
Metrics/AbcSize:
|
56
|
-
Max:
|
56
|
+
Max: 30
|
57
57
|
Exclude:
|
58
58
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 55
|
59
|
-
- 'lib/red_amber/
|
60
|
-
- 'lib/red_amber/data_frame_observation_operation.rb' # Max: 29
|
61
|
-
- 'lib/red_amber/data_frame_variable_operation.rb' # Max: 26
|
59
|
+
- 'lib/red_amber/vector_compensable.rb' # Max: 36
|
62
60
|
|
63
61
|
# Max: 25
|
64
62
|
Metrics/BlockLength:
|
@@ -68,21 +66,21 @@ Metrics/BlockLength:
|
|
68
66
|
|
69
67
|
# Max: 100
|
70
68
|
Metrics/ClassLength:
|
71
|
-
Max:
|
69
|
+
Max: 120
|
72
70
|
Exclude:
|
73
71
|
- 'test/**/*'
|
74
72
|
|
75
73
|
# Max: 7
|
76
74
|
Metrics/CyclomaticComplexity:
|
77
75
|
Max: 12
|
76
|
+
Exclude:
|
77
|
+
- 'lib/red_amber/vector_compensable.rb' # Max: 14
|
78
78
|
|
79
79
|
# Max: 10
|
80
80
|
Metrics/MethodLength:
|
81
|
-
Max:
|
81
|
+
Max: 30
|
82
82
|
Exclude:
|
83
83
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 33
|
84
|
-
- 'lib/red_amber/data_frame_observation_operation.rb' # Max: 21
|
85
|
-
- 'lib/red_amber/data_frame_variable_operation.rb' # Max: 20
|
86
84
|
|
87
85
|
# Max: 100
|
88
86
|
Metrics/ModuleLength:
|
@@ -93,6 +91,8 @@ Metrics/ModuleLength:
|
|
93
91
|
# Max: 8
|
94
92
|
Metrics/PerceivedComplexity:
|
95
93
|
Max: 13
|
94
|
+
Exclude:
|
95
|
+
- 'lib/red_amber/vector_compensable.rb' # Max: 15
|
96
96
|
|
97
97
|
# Necessary to define is_na
|
98
98
|
Naming/PredicateName:
|
data/CHANGELOG.md
CHANGED
@@ -1,19 +1,86 @@
|
|
1
|
-
##
|
1
|
+
## [0.2.0] - unreleased
|
2
2
|
|
3
|
-
-
|
3
|
+
- Document
|
4
|
+
- YARD support
|
5
|
+
|
6
|
+
- DataFrame#join features
|
7
|
+
|
8
|
+
## [0.1.6] - Unreleased
|
9
|
+
|
10
|
+
- Feedback something to Red Data Tools
|
4
11
|
|
5
12
|
- `DataFrame`
|
6
|
-
- Introduce `group_by`
|
7
|
-
- Introduce `summarize`
|
8
13
|
- Introduce `summary` or `describe`
|
14
|
+
- Add `Quantile` by own code?
|
9
15
|
- Improve dataframe obs. manipulation methods to accept float as an index (#10)
|
10
|
-
-
|
16
|
+
- Improve as more performant by benchmark check.
|
11
17
|
|
12
18
|
- `Vector`
|
13
19
|
- Support more functions
|
20
|
+
- Support coerce
|
14
21
|
|
15
|
-
-
|
16
|
-
|
22
|
+
- More examples of frequently needed tasks
|
23
|
+
|
24
|
+
## [0.1.5] - 2022-06-12 (experimental)
|
25
|
+
|
26
|
+
- Bug fixes
|
27
|
+
- Fix DF#tdr to display timestamp type (#19)
|
28
|
+
- Add TZ setting in CI test to pass temporal tests (#19)
|
29
|
+
- Fix example in document of #load(csv_from_URI) (#23)
|
30
|
+
|
31
|
+
- New features and improvements
|
32
|
+
- Improve usability of DataFrame manipulating block (#19)
|
33
|
+
- Add `DataFrame#v` to select a Vector
|
34
|
+
- Add `DataFrame#variables` method
|
35
|
+
- Add `DataFrame#to_arrow`
|
36
|
+
- Add instance variables in DataFrame with lazy initialization
|
37
|
+
- Add `Vector#key` to get key name
|
38
|
+
- Add `Vector#temporal?` to check if temporal type
|
39
|
+
- Refine around DataFrame#variables
|
40
|
+
- Refine init of instance variables
|
41
|
+
- Refine DataFrame#type_classes, Vector#type_class
|
42
|
+
- Refine DataFrame#tdr to shorten temporal data
|
43
|
+
|
44
|
+
- Add supports to make up for missing values (#20)
|
45
|
+
- Add VectorArgumentError
|
46
|
+
- Add `Vector#replace_with`
|
47
|
+
- Add helper function to assert with NaN
|
48
|
+
- To assert NaN == NaN
|
49
|
+
- Add `Vector#fill_nil_backward`, `Vector#fill_nil_forward`
|
50
|
+
- Add `DataFrame#remove_nil` method
|
51
|
+
- Change to accept nil as replacement in Vector#replace_with
|
52
|
+
|
53
|
+
- Introduce index related methods (#22)
|
54
|
+
- Add `Vector#sort_indexes` method
|
55
|
+
- Add `Vector#uniq` method
|
56
|
+
- Add `Vector#tally` and `Vector#value_counts` methods
|
57
|
+
- Add `DataFrame#sort` method
|
58
|
+
- Add `DataFrame#group` method
|
59
|
+
- Change to use DataFrame#map_indices in #[]
|
60
|
+
|
61
|
+
- Add rounding functions with opts (#21)
|
62
|
+
- With options :mode and :n_digits
|
63
|
+
- :n_digits also can be specified with :multiple option in `Vector#round_to_multiple`
|
64
|
+
- `Vector#round`
|
65
|
+
- `Vector#ceil`
|
66
|
+
- `Vector#floor`
|
67
|
+
- `Vector#trunc`
|
68
|
+
|
69
|
+
- Documentation
|
70
|
+
- Update TDR, TDR_ja documents to latest (#18)
|
71
|
+
- Refinement and small fix in DataFrame.md (#18)
|
72
|
+
- Update README to use more effective example (#18)
|
73
|
+
- Delete expired TDR_operations.pdf (#23)
|
74
|
+
- Update README and dataframe_model image (#23)
|
75
|
+
- Update description about rover-df in README (#23)
|
76
|
+
- Add installation of Arrow in README (#23)
|
77
|
+
|
78
|
+
- Others
|
79
|
+
- Tried but cannot use bundler cache in ci test (#17)
|
80
|
+
- Bump up requirements to Arrow 8.0.0 (#25)
|
81
|
+
- Arrow 7.0.0 with Ubuntu 21.04 causes a fatal error in replace_with_mask function.
|
82
|
+
- Update the description of gem (#23)
|
83
|
+
- Add benchmark tests (#26)
|
17
84
|
|
18
85
|
## [0.1.4] - 2022-05-29 (experimental)
|
19
86
|
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -3,18 +3,27 @@
|
|
3
3
|
A simple dataframe library for Ruby (experimental)
|
4
4
|
|
5
5
|
- Powered by [Red Arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow)
|
6
|
-
-
|
6
|
+
- Inspired by the dataframe library [Rover-df](https://github.com/ankane/rover)
|
7
7
|
|
8
8
|
## Requirements
|
9
9
|
|
10
10
|
```ruby
|
11
|
-
gem 'red-arrow', '>=
|
12
|
-
gem 'red-parquet', '>=
|
11
|
+
gem 'red-arrow', '>= 8.0.0'
|
12
|
+
gem 'red-parquet', '>= 8.0.0' # if you use IO from/to parquet
|
13
13
|
gem 'rover-df', '~> 0.3.0' # if you use IO from/to Rover::DataFrame
|
14
14
|
```
|
15
15
|
|
16
16
|
## Installation
|
17
17
|
|
18
|
+
Install requirements before you install Red Amber.
|
19
|
+
|
20
|
+
- Apache Arrow GLib (>= 8.0.0)
|
21
|
+
- Apache Parquet GLib (>= 8.0.0)
|
22
|
+
|
23
|
+
See [Apache Arrow install document](https://arrow.apache.org/install/).
|
24
|
+
|
25
|
+
Minimum installation example for the latest Ubuntu is in the ['Prepare the Apache Arrow' section in ci test](https://github.com/heronshoes/red_amber/blob/master/.github/workflows/test.yml) of Red Amber.
|
26
|
+
|
18
27
|
Add this line to your Gemfile:
|
19
28
|
|
20
29
|
```ruby
|
@@ -41,8 +50,9 @@ Represents a set of data in 2D-shape.
|
|
41
50
|
require 'red_amber'
|
42
51
|
require 'datasets-arrow'
|
43
52
|
|
44
|
-
|
45
|
-
|
53
|
+
arrow = Datasets::Penguins.new.to_arrow
|
54
|
+
penguins = RedAmber::DataFrame.new(arrow)
|
55
|
+
penguins.tdr
|
46
56
|
# =>
|
47
57
|
RedAmber::DataFrame : 344 x 8 Vectors
|
48
58
|
Vectors : 5 numeric, 3 strings
|
@@ -71,12 +81,10 @@ Vector : 1 numeric
|
|
71
81
|
1 :body_mass_g int64 95 [3750, 3800, 3250, nil, 3450, ... ], 2 nils
|
72
82
|
```
|
73
83
|
|
74
|
-
`DataFrame#assign`
|
84
|
+
`DataFrame#assign` creates new variables (column in the table).
|
75
85
|
|
76
86
|
```ruby
|
77
|
-
df.assign
|
78
|
-
{:body_mass_kg => penguins[:body_mass_g] / 1000.0}
|
79
|
-
end
|
87
|
+
df.assign(:body_mass_kg => df[:body_mass_g] / 1000.0)
|
80
88
|
# =>
|
81
89
|
#<RedAmber::DataFrame : 344 x 2 Vectors, 0x000000000000fa28>
|
82
90
|
Vectors : 2 numeric
|
@@ -85,7 +93,33 @@ Vectors : 2 numeric
|
|
85
93
|
2 :body_mass_kg double 95 [3.75, 3.8, 3.25, nil, 3.45, ... ], 2 nils
|
86
94
|
```
|
87
95
|
|
88
|
-
|
96
|
+
DataFrame manipulating methods like `pick`, `drop`, `slice`, `remove`, `rename` and `assign` accept a block.
|
97
|
+
|
98
|
+
This is an example to eliminate observations (row in the table) containing nil.
|
99
|
+
|
100
|
+
```ruby
|
101
|
+
# remove all observations containing nil
|
102
|
+
nil_removed = penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
|
103
|
+
nil_removed.tdr
|
104
|
+
# =>
|
105
|
+
RedAmber::DataFrame : 342 x 8 Vectors
|
106
|
+
Vectors : 5 numeric, 3 strings
|
107
|
+
# key type level data_preview
|
108
|
+
1 :species string 3 {"Adelie"=>151, "Chinstrap"=>68, "Gentoo"=>123}
|
109
|
+
2 :island string 3 {"Torgersen"=>51, "Biscoe"=>167, "Dream"=>124}
|
110
|
+
3 :bill_length_mm double 164 [39.1, 39.5, 40.3, 36.7, 39.3, ... ]
|
111
|
+
4 :bill_depth_mm double 80 [18.7, 17.4, 18.0, 19.3, 20.6, ... ]
|
112
|
+
5 :flipper_length_mm int64 55 [181, 186, 195, 193, 190, ... ]
|
113
|
+
6 :body_mass_g int64 94 [3750, 3800, 3250, 3450, 3650, ... ]
|
114
|
+
7 :sex string 3 {"male"=>168, "female"=>165, ""=>9}
|
115
|
+
8 :year int64 3 {2007=>109, 2008=>114, 2009=>119}
|
116
|
+
```
|
117
|
+
|
118
|
+
For this frequently needed task, we can do it much simpler.
|
119
|
+
|
120
|
+
```ruby
|
121
|
+
penguins.remove_nil # => same result as above
|
122
|
+
```
|
89
123
|
|
90
124
|
See [DataFrame.md](doc/DataFrame.md) for details.
|
91
125
|
|
@@ -95,10 +129,10 @@ See [DataFrame.md](doc/DataFrame.md) for details.
|
|
95
129
|
Class `RedAmber::Vector` represents a series of data in the DataFrame.
|
96
130
|
|
97
131
|
```ruby
|
98
|
-
penguins[:
|
132
|
+
penguins[:bill_length_mm]
|
99
133
|
# =>
|
100
|
-
#<RedAmber::Vector(:
|
101
|
-
[
|
134
|
+
#<RedAmber::Vector(:double, size=344):0x000000000000f8fc>
|
135
|
+
[39.1, 39.5, 40.3, nil, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, 37.8, 37.8, 41.1, ... ]
|
102
136
|
```
|
103
137
|
|
104
138
|
Vectors accept some [functional methods from Arrow](https://arrow.apache.org/docs/cpp/compute.html).
|
@@ -0,0 +1,15 @@
|
|
1
|
+
prelude: |
|
2
|
+
require 'datasets-arrow'
|
3
|
+
require 'rover'
|
4
|
+
require 'red_amber'
|
5
|
+
|
6
|
+
penguins_csv = 'benchmark/cache/penguins.csv'
|
7
|
+
|
8
|
+
unless File.exist?(penguins_csv)
|
9
|
+
arrow = Datasets::Penguins.new.to_arrow
|
10
|
+
RedAmber::DataFrame.new(arrow).save(penguins_csv)
|
11
|
+
end
|
12
|
+
|
13
|
+
benchmark:
|
14
|
+
'penguins by Rover': Rover.read_csv(penguins_csv)
|
15
|
+
'penguins by RedAmber': RedAmber::DataFrame.load(penguins_csv)
|
data/doc/DataFrame.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# DataFrame
|
2
2
|
|
3
|
-
Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
3
|
+
Class `RedAmber::DataFrame` represents 2D-data. A `DataFrame` consists of:
|
4
4
|
- A collection of data which have same data type within. We call it `Vector`.
|
5
5
|
- A label is attached to `Vector`. We call it `key`.
|
6
6
|
- A `Vector` and associated `key` is grouped as a `variable`.
|
@@ -11,13 +11,13 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
11
11
|
|
12
12
|
## Constructors and saving
|
13
13
|
|
14
|
-
### `new` from a
|
14
|
+
### `new` from a Hash
|
15
15
|
|
16
16
|
```ruby
|
17
17
|
RedAmber::DataFrame.new(x: [1, 2, 3])
|
18
18
|
```
|
19
19
|
|
20
|
-
### `new` from a schema (by Hash) and
|
20
|
+
### `new` from a schema (by Hash) and data (by Array)
|
21
21
|
|
22
22
|
```ruby
|
23
23
|
RedAmber::DataFrame.new({:x=>:uint8}, [[1], [2], [3]])
|
@@ -52,7 +52,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
52
52
|
- from a URI
|
53
53
|
|
54
54
|
```ruby
|
55
|
-
uri = URI("https://
|
55
|
+
uri = URI("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv")
|
56
56
|
RedAmber::DataFrame.load(uri)
|
57
57
|
```
|
58
58
|
|
@@ -78,7 +78,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
78
78
|
|
79
79
|
## Properties
|
80
80
|
|
81
|
-
### `table`
|
81
|
+
### `table`, `to_arrow`
|
82
82
|
|
83
83
|
- Reader of Arrow::Table object inside.
|
84
84
|
|
@@ -93,16 +93,53 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
93
93
|
### `shape`
|
94
94
|
|
95
95
|
- Returns shape in an Array[n_rows, n_cols].
|
96
|
-
|
96
|
+
|
97
|
+
### `variables`
|
98
|
+
|
99
|
+
- Returns key names and Vectors pair in a Hash.
|
100
|
+
|
101
|
+
It is convenient to use in a block when both key and vector are required. We will write:
|
102
|
+
|
103
|
+
```ruby
|
104
|
+
# update numeric variables
|
105
|
+
df.assign do
|
106
|
+
variables.select.with_object({}) do |(key, vector), assigner|
|
107
|
+
assigner[key] = vector * -1 if vector.numeric?
|
108
|
+
end
|
109
|
+
end
|
110
|
+
```
|
111
|
+
|
112
|
+
Instead of:
|
113
|
+
```ruby
|
114
|
+
df.assign do
|
115
|
+
assigner = {}
|
116
|
+
vectors.each_with_index do |vector, i|
|
117
|
+
assigner[keys[i]] = vector * -1 if vector.numeric?
|
118
|
+
end
|
119
|
+
assigner
|
120
|
+
end
|
121
|
+
```
|
122
|
+
|
97
123
|
### `keys`, `var_names`, `column_names`
|
98
124
|
|
99
125
|
- Returns key names in an Array.
|
100
126
|
|
127
|
+
When we use it with vectors, Vector#key is useful to get the key inside of DataFrame.
|
128
|
+
|
129
|
+
```ruby
|
130
|
+
# update numeric variables, another solution
|
131
|
+
df.assign do
|
132
|
+
vectors.each_with_object({}) do |vector, assigner|
|
133
|
+
assigner[vector.key] = vector * -1 if vector.numeric?
|
134
|
+
end
|
135
|
+
end
|
136
|
+
```
|
137
|
+
|
101
138
|
### `types`
|
102
139
|
|
103
140
|
- Returns types of vectors in an Array of Symbols.
|
104
141
|
|
105
|
-
### `
|
142
|
+
### `type_classes`
|
106
143
|
|
107
144
|
- Returns types of vector in an Array of `Arrow::DataType`.
|
108
145
|
|
@@ -167,7 +204,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
167
204
|
8 :year uint16 3 {2007=>110, 2008=>114, 2009=>120}
|
168
205
|
```
|
169
206
|
|
170
|
-
- limit:
|
207
|
+
- limit: limit of variables to show. Default value is 10.
|
171
208
|
- tally: max level to use tally mode.
|
172
209
|
- elements: max num of element to show values in each observations.
|
173
210
|
|
@@ -224,7 +261,16 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
224
261
|
#<RedAmber::Vector(:uint8, size=3):0x000000000000f140>
|
225
262
|
[1, 2, 3]
|
226
263
|
```
|
227
|
-
|
264
|
+
Or `#v` method also returns a Vector for a key.
|
265
|
+
|
266
|
+
```ruby
|
267
|
+
df.v(:a)
|
268
|
+
# =>
|
269
|
+
#<RedAmber::Vector(:uint8, size=3):0x000000000000f140>
|
270
|
+
[1, 2, 3]
|
271
|
+
```
|
272
|
+
|
273
|
+
This may be useful to use in a block of DataFrame manipulation verbs. We can write `v(:a)` rather than `self[:a]` or `df[:a]`
|
228
274
|
|
229
275
|
### Select observations (rows in a table) by `[]` as `[index]`, `[range]`, `[array]`
|
230
276
|
|
@@ -267,13 +313,13 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
267
313
|
3 :c double 1 [1.0]
|
268
314
|
```
|
269
315
|
|
270
|
-
### Select rows from top or bottom
|
316
|
+
### Select rows from top or from bottom
|
271
317
|
|
272
318
|
`head(n=5)`, `tail(n=5)`, `first(n=1)`, `last(n=1)`
|
273
319
|
|
274
320
|
## Sub DataFrame manipulations
|
275
321
|
|
276
|
-
### `pick`
|
322
|
+
### `pick ` - pick up variables by key label -
|
277
323
|
|
278
324
|
Pick up some variables (columns) to create a sub DataFrame.
|
279
325
|
|
@@ -313,6 +359,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
313
359
|
`pick {block}` is also acceptable. We can't use both arguments and a block at the same time. The block should return keys, or a boolean Array with a same length as `n_keys`. Block is called in the context of self.
|
314
360
|
|
315
361
|
```ruby
|
362
|
+
# It is ok to write `keys ...` in the block, not `penguins.keys ...`
|
316
363
|
penguins.pick { keys.map { |key| key.end_with?('mm') } }
|
317
364
|
# =>
|
318
365
|
#<RedAmber::DataFrame : 344 x 3 Vectors, 0x000000000000f1cc>
|
@@ -323,7 +370,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
323
370
|
3 :flipper_length_mm int64 56 [181, 186, 195, nil, 193, ... ], 2 nils
|
324
371
|
```
|
325
372
|
|
326
|
-
### `drop`
|
373
|
+
### `drop ` - pick and drop -
|
327
374
|
|
328
375
|
Drop some variables (columns) to create a remainder DataFrame.
|
329
376
|
|
@@ -352,15 +399,10 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
352
399
|
```
|
353
400
|
- Difference between `pick`/`drop` and `[]`
|
354
401
|
|
355
|
-
If `pick` or `drop` will select single variable (column), it returns a `DataFrame` with one variable. In contrast, `[]` returns a `Vector`.
|
402
|
+
If `pick` or `drop` will select a single variable (column), it returns a `DataFrame` with one variable. In contrast, `[]` returns a `Vector`. This behavior may be useful to use in a block of DataFrame manipulations.
|
356
403
|
|
357
404
|
```ruby
|
358
405
|
df = RedAmber::DataFrame.new(a: [1, 2, 3], b: %w[A B C], c: [1.0, 2, 3])
|
359
|
-
df[:a]
|
360
|
-
# =>
|
361
|
-
#<RedAmber::Vector(:uint8, size=3):0x000000000000f258>
|
362
|
-
[1, 2, 3]
|
363
|
-
|
364
406
|
df.pick(:a) # or
|
365
407
|
df.drop(:b, :c)
|
366
408
|
# =>
|
@@ -368,9 +410,14 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
368
410
|
Vector : 1 numeric
|
369
411
|
# key type level data_preview
|
370
412
|
1 :a uint8 3 [1, 2, 3]
|
413
|
+
|
414
|
+
df[:a]
|
415
|
+
# =>
|
416
|
+
#<RedAmber::Vector(:uint8, size=3):0x000000000000f258>
|
417
|
+
[1, 2, 3]
|
371
418
|
```
|
372
419
|
|
373
|
-
### `slice`
|
420
|
+
### `slice ` - to cut vertically is slice -
|
374
421
|
|
375
422
|
Slice and select observations (rows) to create a sub DataFrame.
|
376
423
|
|
@@ -488,17 +535,17 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
488
535
|
removed = penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
|
489
536
|
removed.tdr
|
490
537
|
# =>
|
491
|
-
RedAmber::DataFrame :
|
538
|
+
RedAmber::DataFrame : 333 x 8 Vectors
|
492
539
|
Vectors : 5 numeric, 3 strings
|
493
540
|
# key type level data_preview
|
494
|
-
1 :species string 3 {"Adelie"=>
|
495
|
-
2 :island string 3 {"Torgersen"=>
|
496
|
-
3 :bill_length_mm double
|
497
|
-
4 :bill_depth_mm double
|
498
|
-
5 :flipper_length_mm
|
499
|
-
6 :body_mass_g
|
500
|
-
7 :sex string
|
501
|
-
8 :year
|
541
|
+
1 :species string 3 {"Adelie"=>146, "Chinstrap"=>68, "Gentoo"=>119}
|
542
|
+
2 :island string 3 {"Torgersen"=>47, "Biscoe"=>163, "Dream"=>123}
|
543
|
+
3 :bill_length_mm double 163 [39.1, 39.5, 40.3, 36.7, 39.3, ... ]
|
544
|
+
4 :bill_depth_mm double 79 [18.7, 17.4, 18.0, 19.3, 20.6, ... ]
|
545
|
+
5 :flipper_length_mm uint8 54 [181, 186, 195, 193, 190, ... ]
|
546
|
+
6 :body_mass_g uint16 93 [3750, 3800, 3250, 3450, 3650, ... ]
|
547
|
+
7 :sex string 2 {"male"=>168, "female"=>165}
|
548
|
+
8 :year uint16 3 {2007=>103, 2008=>113, 2009=>117}
|
502
549
|
```
|
503
550
|
|
504
551
|
- Keys or booleans by a block
|
@@ -583,7 +630,7 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
583
630
|
|
584
631
|
### `assign`
|
585
632
|
|
586
|
-
Assign new variables (columns) and create a updated DataFrame.
|
633
|
+
Assign new or updated variables (columns) and create an updated DataFrame.
|
587
634
|
|
588
635
|
- Variables with new keys will append new variables at bottom (right in the table).
|
589
636
|
- Variables with existing keys will update corresponding vectors.
|
@@ -649,6 +696,14 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
649
696
|
1 :index int8 5 [0, -1, -2, -3, nil], 1 nil
|
650
697
|
2 :float double 5 [-0.0, -1.1, -2.2, NaN, nil], 1 NaN, 1 nil
|
651
698
|
3 :string string 5 ["A", "B", "C", "D", nil], 1 nil
|
699
|
+
|
700
|
+
# Or it's shorter like this:
|
701
|
+
df.assign do
|
702
|
+
variables.select.with_object({}) do |(key, vector), assigner|
|
703
|
+
assigner[key] = vector * -1 if vector.numeric?
|
704
|
+
end
|
705
|
+
end
|
706
|
+
# => same as above
|
652
707
|
```
|
653
708
|
|
654
709
|
- Key type
|
@@ -657,21 +712,116 @@ Class `RedAmber::DataFrame` represents 2D-data. `DataFrame` consists with:
|
|
657
712
|
|
658
713
|
## Updating
|
659
714
|
|
660
|
-
|
715
|
+
### `sort`
|
661
716
|
|
662
|
-
|
717
|
+
`sort` accepts parameters as sort_keys thanks to the amazing Red Arrow feature.
|
718
|
+
- :key, "key" or "+key" denotes ascending order
|
719
|
+
- "-key" denotes descending order
|
720
|
+
|
721
|
+
```ruby
|
722
|
+
df = RedAmber::DataFrame.new({
|
723
|
+
index: [1, 1, 0, nil, 0],
|
724
|
+
string: ['C', 'B', nil, 'A', 'B'],
|
725
|
+
bool: [nil, true, false, true, false],
|
726
|
+
})
|
727
|
+
df.sort(:index, '-bool').tdr(tally: 0)
|
728
|
+
# =>
|
729
|
+
RedAmber::DataFrame : 5 x 3 Vectors
|
730
|
+
Vectors : 1 numeric, 1 string, 1 boolean
|
731
|
+
# key type level data_preview
|
732
|
+
1 :index uint8 3 [0, 0, 1, 1, nil], 1 nil
|
733
|
+
2 :string string 4 [nil, "B", "B", "C", "A"], 1 nil
|
734
|
+
3 :bool boolean 3 [false, false, true, nil, true], 1 nil
|
735
|
+
```
|
663
736
|
|
664
|
-
- [ ]
|
737
|
+
- [ ] Clamp
|
665
738
|
|
666
739
|
- [ ] Clear data
|
667
740
|
|
668
741
|
## Treat na data
|
669
742
|
|
670
|
-
|
743
|
+
### `remove_nil`
|
744
|
+
|
745
|
+
Remove any observations containing nil.
|
746
|
+
|
747
|
+
## Grouping
|
748
|
+
|
749
|
+
### `group(aggregating_keys, function, target_keys)`
|
671
750
|
|
672
|
-
|
751
|
+
Creates a grouped dataframe by `aggregating_keys`, applies `function` to each group and returns the result in `target_keys`. Aggregated key name is `function(key)` style.
|
752
|
+
|
753
|
+
(The current implementation is not intuitive. Needs improvement.)
|
754
|
+
|
755
|
+
```ruby
|
756
|
+
ds = Datasets::Rdatasets.new('dplyr', 'starwars')
|
757
|
+
starwars = RedAmber::DataFrame.new(ds.to_table.to_h)
|
758
|
+
starwars.tdr(11)
|
759
|
+
# =>
|
760
|
+
RedAmber::DataFrame : 87 x 11 Vectors
|
761
|
+
Vectors : 3 numeric, 8 strings
|
762
|
+
# key type level data_preview
|
763
|
+
1 :name string 87 ["Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", "Leia Organa", ... ]
|
764
|
+
2 :height uint16 46 [172, 167, 96, 202, 150, ... ], 6 nils
|
765
|
+
3 :mass double 39 [77.0, 75.0, 32.0, 136.0, 49.0, ... ], 28 nils
|
766
|
+
4 :hair_color string 13 ["blond", nil, nil, "none", "brown", ... ], 5 nils
|
767
|
+
5 :skin_color string 31 ["fair", "gold", "white, blue", "white", "light", ... ]
|
768
|
+
6 :eye_color string 15 ["blue", "yellow", "red", "yellow", "brown", ... ]
|
769
|
+
7 :birth_year double 37 [19.0, 112.0, 33.0, 41.9, 19.0, ... ], 44 nils
|
770
|
+
8 :sex string 5 {"male"=>60, "none"=>6, "female"=>16, "hermaphroditic"=>1, nil=>4}
|
771
|
+
9 :gender string 3 {"masculine"=>66, "feminine"=>17, nil=>4}
|
772
|
+
10 :homeworld string 49 ["Tatooine", "Tatooine", "Naboo", "Tatooine", "Alderaan", ... ], 10 nils
|
773
|
+
11 :species string 38 ["Human", "Droid", "Droid", "Human", "Human", ... ], 4 nils
|
774
|
+
|
775
|
+
grouped = starwars.group(:species, :mean, [:mass, :height])
|
776
|
+
# =>
|
777
|
+
#<RedAmber::DataFrame : 38 x 3 Vectors, 0x000000000000fbf4>
|
778
|
+
Vectors : 2 numeric, 1 string
|
779
|
+
# key type level data_preview
|
780
|
+
1 :"mean(mass)" double 27 [82.78181818181818, 69.75, 124.0, 74.0, 1358.0, ... ], 6 nils
|
781
|
+
2 :"mean(height)" double 32 [176.6451612903226, 131.2, 231.0, 173.0, 175.0, ... ]
|
782
|
+
3 :species string 38 ["Human", "Droid", "Wookiee", "Rodian", "Hutt", ... ], 1 nil
|
783
|
+
|
784
|
+
count = starwars.group(:species, :count, :species)[:"count(species)"]
|
785
|
+
df = grouped.slice(count > 1)
|
786
|
+
# =>
|
787
|
+
#<RedAmber::DataFrame : 8 x 3 Vectors, 0x000000000000fc44>
|
788
|
+
Vectors : 2 numeric, 1 string
|
789
|
+
# key type level data_preview
|
790
|
+
1 :"mean(mass)" double 8 [82.78181818181818, 69.75, 124.0, 74.0, 80.0, ... ]
|
791
|
+
2 :"mean(height)" double 8 [176.6451612903226, 131.2, 231.0, 208.66666666666666, 173.0, ... ]
|
792
|
+
3 :species string 8 ["Human", "Droid", "Wookiee", "Gungan", "Zabrak", ... ]
|
793
|
+
|
794
|
+
df.table
|
795
|
+
# =>
|
796
|
+
#<Arrow::Table:0x1165593c8 ptr=0x7fb3db144c70>
|
797
|
+
mean(mass) mean(height) species
|
798
|
+
0 82.781818 176.645161 Human
|
799
|
+
1 69.750000 131.200000 Droid
|
800
|
+
2 124.000000 231.000000 Wookiee
|
801
|
+
3 74.000000 208.666667 Gungan
|
802
|
+
4 80.000000 173.000000 Zabrak
|
803
|
+
5 55.000000 179.000000 Twi'lek
|
804
|
+
6 53.100000 168.000000 Mirialan
|
805
|
+
7 88.000000 221.000000 Kaminoan
|
806
|
+
```
|
673
807
|
|
674
|
-
|
808
|
+
Available functions are:
|
809
|
+
|
810
|
+
- [ ] all
|
811
|
+
- [ ] any
|
812
|
+
- [ ] approximate_median
|
813
|
+
- ✓ count
|
814
|
+
- [ ] count_distinct
|
815
|
+
- [ ] distinct
|
816
|
+
- ✓ max
|
817
|
+
- ✓ mean
|
818
|
+
- ✓ min
|
819
|
+
- [ ] min_max
|
820
|
+
- ✓ product
|
821
|
+
- ✓ stddev
|
822
|
+
- ✓ sum
|
823
|
+
- [ ] tdigest
|
824
|
+
- ✓ variance
|
675
825
|
|
676
826
|
## Combining DataFrames
|
677
827
|
|