red_amber 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +12 -0
- data/CHANGELOG.md +114 -31
- data/Gemfile +4 -2
- data/README.md +41 -25
- data/benchmark/basic.yml +79 -0
- data/benchmark/combine.yml +63 -0
- data/benchmark/drop_nil.yml +15 -3
- data/benchmark/group.yml +33 -0
- data/benchmark/reshape.yml +27 -0
- data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
- data/benchmark/rover/flights.yml +23 -0
- data/benchmark/rover/penguins.yml +23 -0
- data/benchmark/rover/planes.yml +23 -0
- data/benchmark/rover/weather.yml +23 -0
- data/doc/DataFrame.md +332 -53
- data/doc/Vector.md +3 -0
- data/doc/image/dataframe/join.png +0 -0
- data/doc/image/dataframe/set_and_bind.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/lib/red_amber/data_frame.rb +6 -5
- data/lib/red_amber/data_frame_combinable.rb +283 -0
- data/lib/red_amber/data_frame_displayable.rb +2 -0
- data/lib/red_amber/data_frame_selectable.rb +9 -9
- data/lib/red_amber/data_frame_variable_operation.rb +4 -4
- data/lib/red_amber/group.rb +99 -18
- data/lib/red_amber/helper.rb +1 -13
- data/lib/red_amber/vector.rb +7 -0
- data/lib/red_amber/vector_functions.rb +0 -8
- data/lib/red_amber/vector_updatable.rb +60 -65
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +1 -0
- data/red_amber.gemspec +1 -1
- metadata +21 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6f70451aad21c0750fb2a0bfe165baf5862ac3837541135cf9d58df4ecd732ac
|
4
|
+
data.tar.gz: c31f143278bf3792bc03e0e727e12df72fca5a001b3d6c098c3f028df456e2f0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c5026422e8f0c0b1b1b25f6baa97b540287937a3c0d224a16ce13c17c16a11ccd54682682f17fdf5e176190f7db40bfda7bbe5659e89a212eed8f1bf9b3567e9
|
7
|
+
data.tar.gz: 82b500a1570b8fc8925a7c988bd7d3f3db677588537ee9f8d75f965ca98f5d730a0a55be5fa6637e3beeb4aed11b55a9afd3e4dfd52aee687e123f390f0d8d2b
|
data/.rubocop.yml
CHANGED
@@ -60,6 +60,7 @@ Layout/LineLength:
|
|
60
60
|
Metrics/AbcSize:
|
61
61
|
Max: 30
|
62
62
|
Exclude:
|
63
|
+
- 'lib/red_amber/data_frame_combinable.rb' # Max: 43
|
63
64
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 55
|
64
65
|
- 'lib/red_amber/data_frame_reshaping.rb' # Max 40.91
|
65
66
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 51
|
@@ -80,11 +81,13 @@ Metrics/ClassLength:
|
|
80
81
|
- 'test/**/*'
|
81
82
|
- 'lib/red_amber/data_frame.rb' #Max: 131
|
82
83
|
- 'lib/red_amber/vector.rb' #Max: 102
|
84
|
+
- 'lib/red_amber/group.rb' #Max: 103
|
83
85
|
|
84
86
|
# Max: 7
|
85
87
|
Metrics/CyclomaticComplexity:
|
86
88
|
Max: 12
|
87
89
|
Exclude:
|
90
|
+
- 'lib/red_amber/data_frame_combinable.rb' # Max: 15
|
88
91
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 18
|
89
92
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 14
|
90
93
|
- 'lib/red_amber/helper.rb' # Max: 15
|
@@ -95,6 +98,7 @@ Metrics/CyclomaticComplexity:
|
|
95
98
|
Metrics/MethodLength:
|
96
99
|
Max: 30
|
97
100
|
Exclude:
|
101
|
+
- 'lib/red_amber/data_frame_combinable.rb' # Max: 38
|
98
102
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 33
|
99
103
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 38
|
100
104
|
- 'lib/red_amber/data_frame_variable_operation.rb' # Max: 35
|
@@ -103,15 +107,23 @@ Metrics/MethodLength:
|
|
103
107
|
Metrics/ModuleLength:
|
104
108
|
Max: 100
|
105
109
|
Exclude:
|
110
|
+
- 'lib/red_amber/data_frame_combinable.rb' # Max: 108
|
106
111
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 132
|
107
112
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 141
|
108
113
|
- 'lib/red_amber/data_frame_variable_operation.rb' # Max: 110
|
109
114
|
- 'lib/red_amber/vector_functions.rb' # Max: 114
|
110
115
|
|
116
|
+
# Max: 5
|
117
|
+
Metrics/ParameterLists:
|
118
|
+
Max: 6
|
119
|
+
# Exclude:
|
120
|
+
# - 'lib/red_amber/data_frame_combinable.rb' # Max: 6
|
121
|
+
|
111
122
|
# Max: 8
|
112
123
|
Metrics/PerceivedComplexity:
|
113
124
|
Max: 13
|
114
125
|
Exclude:
|
126
|
+
- 'lib/red_amber/data_frame_combinable.rb' # Max: 14
|
115
127
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 14
|
116
128
|
- 'lib/red_amber/helper.rb' # Max: 15
|
117
129
|
- 'lib/red_amber/vector_updatable.rb' # Max: 15
|
data/CHANGELOG.md
CHANGED
@@ -1,35 +1,137 @@
|
|
1
|
+
## [0.2.4] - 2022-12-25 (unreleased)
|
2
|
+
|
3
|
+
## [0.2.3] - 2022-11-16
|
4
|
+
|
5
|
+
- Bug fixes
|
6
|
+
|
7
|
+
- Fix DataFrame#to_s when DataFrame.size == 0 (#125)
|
8
|
+
- Remove unused lines in funcs (#128)
|
9
|
+
- Remove unused methods in helper (#128)
|
10
|
+
- Add test for invalid arg in DataFrame.new (#128)
|
11
|
+
- Add test for Vector#shift(0) (#128)
|
12
|
+
- Fix bugs for DataFrame#[], #pick and #drop with Range of Symbols and Symbol (#135)
|
13
|
+
|
14
|
+
- New features and improvements
|
15
|
+
|
16
|
+
- Upgrade dependency to Arrow 10.0.0 (#132)
|
17
|
+
|
18
|
+
It is possible to initialize with objects that respond to `to_arrow` since 0.2.3.
|
19
|
+
Arrays in Numo::NArray respond to `to_arrow` with Red Arrow Numo::NArray 0.0.6.
|
20
|
+
This feature is proposed by the Red Data Tools member @kojix2 and implemented by @kou.
|
21
|
+
I also made Vector respond to `to_arrow` and `to_arrow_array`.
|
22
|
+
It becomes a member of ducks ('quack quack'). Thanks!
|
23
|
+
|
24
|
+
- Change dev dependency to red-dataset-arrow (#117)
|
25
|
+
- Add dev dependency for red-arrow-numo-narray (#132)
|
26
|
+
- Support Numo::NArray in Vector.new (#132)
|
27
|
+
- Support Vector#to_arrow_array (#132)
|
28
|
+
|
29
|
+
- Update group (#118)
|
30
|
+
- Introduce new DataFrame group support (experimental)
|
31
|
+
|
32
|
+
This additional API will treat a grouped DataFrame as a list of DataFrames.
|
33
|
+
I think this API has pros such as:
|
34
|
+
- API is easy to understand and flexible.
|
35
|
+
- It has good compatibility with Ruby's primitive Enumerables.
|
36
|
+
- We can only use non hash-ed aggregation functions.
|
37
|
+
- Do not need grouped DataFrame state, nor `#ungroup` method.
|
38
|
+
- May be useful for concurrent operations.
|
39
|
+
|
40
|
+
This feature is implemented by Ruby, so it is pretty slow and experimental.
|
41
|
+
Use original Group API for practical purpose.
|
42
|
+
|
43
|
+
- `include Enumerable` to Group (experimental)
|
44
|
+
- Add Group#each, #inspect
|
45
|
+
- Refactor Group to align with Arrow
|
46
|
+
|
47
|
+
- Introduce DataFrame combining methods (#125)
|
48
|
+
- Introduce DataFrame#concatenate method
|
49
|
+
- Add DataFrame#merge method
|
50
|
+
- Add DataFrame#inner_join method
|
51
|
+
- Add DataFrame#full_join method
|
52
|
+
- Add DataFrame#left_join method
|
53
|
+
- Add DataFrame#right_join method
|
54
|
+
- Add DataFrame#semi_join method
|
55
|
+
- Add DataFrame#anti_join method
|
56
|
+
- Add DataFrame#intersect method
|
57
|
+
- Add DataFrame#union method
|
58
|
+
- Add DataFrame#setdiff method
|
59
|
+
- Rename #setdiff to #difference
|
60
|
+
- Support natural join in DataFrame#join
|
61
|
+
- Support partial join_key and renaming
|
62
|
+
- Fix DataFrame#join to merge key columns
|
63
|
+
- Add DataFrame#set_operable? method
|
64
|
+
- Add join/set/bind image to DataFrame.md
|
65
|
+
- Fix DataFrame#join, #right_semi, #right_anti (#128)
|
66
|
+
|
67
|
+
- Miscellaneous
|
68
|
+
- Return Vector in DataFrame#indices (#118)
|
69
|
+
|
70
|
+
- Improve tests/ci
|
71
|
+
|
72
|
+
- Improve CI
|
73
|
+
- Add CI test on macOS (#133)
|
74
|
+
- Enable bundler-cache on macOS (#128)
|
75
|
+
- Add install gobject introspection prior to glib in CI (#133)
|
76
|
+
This will stabilize CI system installation especially with cache.
|
77
|
+
|
78
|
+
- Rename workflows/test.yml to ci.yml (#133)
|
79
|
+
- Fix link in CI badge of README.md (#118)
|
80
|
+
|
81
|
+
- Add github action for coverage (#128)
|
82
|
+
|
83
|
+
- Add benchmark
|
84
|
+
- Add benchmarks with Rover (#118)
|
85
|
+
- Introduce benchmark suite (#134)
|
86
|
+
- Add benchmark for combining operations (#134)
|
87
|
+
|
88
|
+
- Measuring test coverage
|
89
|
+
- Add test coverage measurement (#128)
|
90
|
+
|
91
|
+
- Refactoring
|
92
|
+
|
93
|
+
- Remove redundant string escape in `test_vector_function` (#132)
|
94
|
+
- Refine tests to use `assert_equal_array` (#128)
|
95
|
+
- Rewrite Vector#replace (#128)
|
96
|
+
|
97
|
+
- Documentation
|
98
|
+
|
99
|
+
- Update README.md for installation (#126)
|
100
|
+
- Add clause that keys must be unique in doc. (#126)
|
101
|
+
- Rows should be called as 'records' (#126)
|
102
|
+
- Update Jupyter Notebook `83 examples of RedAmber` (#135)
|
103
|
+
|
104
|
+
- GitHub site
|
105
|
+
|
106
|
+
- Update Jupyter notebooks in Binder
|
107
|
+
- Change default branch name from 'master' to 'main' (#127)
|
108
|
+
|
109
|
+
- Thanks
|
110
|
+
|
111
|
+
Ruby Association Grant committee
|
112
|
+
It is a great honor that RedAmber was selected as a project of the Ruby Association Grant 2022.
|
113
|
+
|
114
|
+
|
1
115
|
## [0.2.2] - 2022-10-04
|
2
116
|
|
3
117
|
- Bug fixes
|
4
118
|
|
5
119
|
- Return self when no replacement happen in Vector#replace. (#92)
|
6
|
-
|
7
120
|
- Limit n-digits in to_iruby. (#111)
|
8
|
-
|
9
121
|
- Fix displaying space in to_iruby. (#111)
|
10
|
-
|
11
122
|
- Raise error if key is duplicated. (#113)
|
12
|
-
|
13
123
|
- Fix DataFrame#pick/#drop with endless Range. (#113)
|
14
|
-
|
15
124
|
- Change type from dictionary to string in DataFrame reshaping methods. (#113)
|
16
|
-
|
17
125
|
- Fix arguments parser to accept Enumerator. (#114)
|
18
126
|
|
19
127
|
- New features and improvements
|
20
128
|
|
21
129
|
- Support to make a data frame from a to_arrow-responsible object. (#106) [Patch by Kenta Murata]
|
22
|
-
|
23
130
|
- Introduce DataFrame#auto_cast (experimental feature) (#105)
|
24
|
-
|
25
131
|
- Change default name in DataFrame#transpose, #to_long, #to_wide. (#110)
|
26
|
-
|
27
132
|
- Add Vector#dictionary? method. (#113)
|
28
|
-
|
29
133
|
- Add display mode 'Plain' and 'Minimum'. (#113)
|
30
|
-
|
31
134
|
- Refactor code
|
32
|
-
|
33
135
|
- Refine test_vector_selectable. (#92)
|
34
136
|
- Refine test_vector_updatable. (#92)
|
35
137
|
- Refine Vector.new. (#113)
|
@@ -38,7 +140,6 @@
|
|
38
140
|
- Documents
|
39
141
|
|
40
142
|
- Update images. (#90, #105, #113)
|
41
|
-
|
42
143
|
- Update README to use simpler examples. (#112)
|
43
144
|
- Update README with a new screenshot example. (#113)
|
44
145
|
|
@@ -61,39 +162,27 @@
|
|
61
162
|
|
62
163
|
- Fix `Vector#each` with block (#66)
|
63
164
|
`Vector#each` will return value of each element with block.
|
64
|
-
|
65
165
|
- Fix table format at size == 9 (#67)
|
66
|
-
|
67
166
|
- Fix to support Vector in `DataFrame#assign` (#77)
|
68
|
-
|
69
167
|
- Add `assert_delta` functionality for `assert_with_NaN` (#78)
|
70
|
-
|
71
168
|
- Fix Vector#is_in when self is chunked (#79)
|
72
|
-
|
73
169
|
- Fix Array type error (uint/int) (#79)
|
74
170
|
|
75
171
|
- New features and improvements
|
76
172
|
|
77
173
|
- Refine `DataFrame#indices` method (#67)
|
78
|
-
|
79
174
|
- Update DataFrame reshaping methods (#73)
|
80
|
-
|
81
175
|
- Change default option value of DataFrame reshaping
|
82
|
-
|
83
176
|
- Change the order of import_cars example
|
84
177
|
|
85
178
|
- Add `DataFrame#method_missing` to get column vector by method (#75)
|
86
|
-
|
87
179
|
- Add `DataFrame#method_missing` to get column (#75)
|
88
180
|
|
89
181
|
- Accept both args and block in `DataFrame#assign` (#75)
|
90
|
-
|
91
182
|
- Accept indices in `DataFrame#pick` and `DataFrame#drop` (#76)
|
92
183
|
|
93
184
|
- Add `DataFrame#slice_by` method (#77)
|
94
|
-
|
95
185
|
- Add new Vector functions (#78)
|
96
|
-
|
97
186
|
- Add inverse trigonometric function for Vector
|
98
187
|
- `acos`
|
99
188
|
- `asin`
|
@@ -123,25 +212,19 @@
|
|
123
212
|
- Bug fixes
|
124
213
|
|
125
214
|
- Fix order of multiple group keys (#55)
|
126
|
-
|
127
215
|
Only 1 group key comes to left. Other keys remain in right.
|
128
216
|
|
129
217
|
- Remove optional `require` for rover (#55)
|
130
|
-
|
131
218
|
Fix DataFrame.new for argument with Rover::DataFrame.
|
132
|
-
|
133
219
|
- Fix occasional failure in CI (#59)
|
134
|
-
|
135
220
|
Sometimes the CI test fails. I added -dev dependency
|
136
221
|
in Arrow install by apt, not doing in bundler.
|
137
222
|
|
138
223
|
- Fix calling :take in V#[] (#56)
|
139
|
-
|
140
224
|
Fixed to call Arrow function :take instead of :array_take in Vector#take_by_vector. This will prevent the error below
|
141
225
|
when called with Arrow::ChunkedArray.
|
142
226
|
|
143
227
|
- Raise error renaming non existing key (#61)
|
144
|
-
|
145
228
|
Add error when specified key does not exist.
|
146
229
|
|
147
230
|
- Fix DataFrame#rename #assign by array (#65)
|
data/Gemfile
CHANGED
@@ -7,7 +7,7 @@ gemspec
|
|
7
7
|
group :test do
|
8
8
|
gem 'rake'
|
9
9
|
|
10
|
-
gem 'red-parquet', '
|
10
|
+
gem 'red-parquet', '~> 10.0.0'
|
11
11
|
gem 'rover-df', '~> 0.3.0'
|
12
12
|
|
13
13
|
gem 'rubocop'
|
@@ -21,5 +21,7 @@ group :test do
|
|
21
21
|
gem 'yard'
|
22
22
|
|
23
23
|
gem 'benchmark_driver'
|
24
|
-
gem 'red-
|
24
|
+
gem 'red-arrow-numo-narray'
|
25
|
+
gem 'red-datasets-arrow'
|
26
|
+
gem 'simplecov'
|
25
27
|
end
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# RedAmber
|
2
2
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/red_amber.svg)](https://badge.fury.io/rb/red_amber)
|
4
|
-
[![Ruby](https://github.com/heronshoes/red_amber/actions/workflows/
|
4
|
+
[![Ruby](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml/badge.svg)](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml)
|
5
5
|
[![Discussions](https://img.shields.io/github/discussions/heronshoes/red_amber)](https://github.com/heronshoes/red_amber/discussions)
|
6
6
|
|
7
7
|
A simple dataframe library for Ruby.
|
@@ -20,9 +20,9 @@ I recommend Ruby 3 for performance.
|
|
20
20
|
|
21
21
|
```ruby
|
22
22
|
# Libraries required
|
23
|
-
gem 'red-arrow', '
|
23
|
+
gem 'red-arrow', '~> 10.0.0' # Requires Apache Arrow (see installation below)
|
24
24
|
|
25
|
-
gem 'red-parquet', '
|
25
|
+
gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
|
26
26
|
gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
27
27
|
```
|
28
28
|
|
@@ -30,37 +30,52 @@ gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
|
30
30
|
|
31
31
|
Install requirements before you install Red Amber.
|
32
32
|
|
33
|
-
- Apache Arrow
|
34
|
-
|
35
|
-
- Apache Parquet GLib (
|
33
|
+
- Apache Arrow (~> 10.0.0)
|
34
|
+
- Apache Arrow GLib (~> 10.0.0)
|
35
|
+
- Apache Parquet GLib (~> 10.0.0) # If you use IO from/to parquet
|
36
36
|
|
37
37
|
See [Apache Arrow install document](https://arrow.apache.org/install/).
|
38
38
|
|
39
|
-
Minimum installation example for the latest Ubuntu
|
40
|
-
|
41
|
-
|
39
|
+
- Minimum installation example for the latest Ubuntu:
|
40
|
+
```
|
41
|
+
sudo apt update
|
42
|
+
sudo apt install -y -V ca-certificates lsb-release wget
|
43
|
+
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
|
44
|
+
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
|
45
|
+
sudo apt update
|
46
|
+
sudo apt install -y -V libarrow-dev
|
47
|
+
sudo apt install -y -V libarrow-glib-dev
|
48
|
+
```
|
49
|
+
- On macOS, you can install Apache Arrow C++ library using Homebrew:
|
50
|
+
|
51
|
+
```
|
52
|
+
brew install apache-arrow
|
53
|
+
```
|
54
|
+
|
55
|
+
and GLib (C) package with:
|
56
|
+
|
57
|
+
```
|
58
|
+
brew install apache-arrow-glib
|
59
|
+
```
|
60
|
+
|
61
|
+
If you prepared Apache Arrow, add these lines to your Gemfile:
|
42
62
|
|
43
63
|
```ruby
|
64
|
+
gem 'red-arrow', '~> 10.0.0'
|
44
65
|
gem 'red_amber'
|
66
|
+
gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
|
67
|
+
gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
68
|
+
gem 'red-datasets-arrow' # Optional, recommended if you use Red Datasets
|
69
|
+
gem 'red-arrow-numo-narray' # Optional, recommended if you use inputs from Numo::NArray
|
45
70
|
```
|
46
71
|
|
47
|
-
And then execute
|
48
|
-
|
49
|
-
```shell
|
50
|
-
bundle install
|
51
|
-
```
|
52
|
-
|
53
|
-
Or install it yourself as:
|
54
|
-
|
55
|
-
```shell
|
56
|
-
gem install red_amber
|
57
|
-
```
|
72
|
+
And then execute `bundle install` or install it yourself as `gem install red_amber`.
|
58
73
|
|
59
74
|
## Docker image and Jupyter Notebook
|
60
75
|
|
61
76
|
[RubyData Docker Stacks](https://github.com/RubyData/docker-stacks) is available as a ready-to-run Docker image containing Jupyter and useful data tools as well as RedAmber (Thanks to @mrkn).
|
62
77
|
|
63
|
-
Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=
|
78
|
+
Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb).
|
64
79
|
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb)
|
65
80
|
|
66
81
|
|
@@ -71,7 +86,7 @@ The entity is a Red Arrow's Table object.
|
|
71
86
|
|
72
87
|
![dataframe model of RedAmber](doc/image/dataframe_model.png)
|
73
88
|
|
74
|
-
|
89
|
+
Let's load the library and try some examples.
|
75
90
|
|
76
91
|
```ruby
|
77
92
|
require 'red_amber' # require 'red-amber' is also OK.
|
@@ -101,7 +116,7 @@ diamonds = DataFrame.new(dataset) # from v0.2.2, should be `dataset.to_arrow` if
|
|
101
116
|
53939 0.75 Ideal D SI2 62.2 55.0 2757 5.83 ... 3.64
|
102
117
|
```
|
103
118
|
|
104
|
-
For example, we can compute mean prices per
|
119
|
+
For example, we can compute mean prices per cut for the data larger than 1 carat.
|
105
120
|
|
106
121
|
```ruby
|
107
122
|
df = diamonds
|
@@ -125,7 +140,7 @@ Arrow data is immutable, so these methods always return new objects.
|
|
125
140
|
Next example will rename a column and create a new column by simple calculation.
|
126
141
|
|
127
142
|
```ruby
|
128
|
-
usdjpy = 110.0
|
143
|
+
usdjpy = 110.0 # when the yen was stronger
|
129
144
|
|
130
145
|
df.rename('mean(price)': :mean_price_USD)
|
131
146
|
.assign(:mean_price_JPY) { mean_price_USD * usdjpy }
|
@@ -181,7 +196,8 @@ See [Vector.md](doc/Vector.md) for details.
|
|
181
196
|
|
182
197
|
## Jupyter notebook
|
183
198
|
|
184
|
-
[
|
199
|
+
[83 Examples of Red Amber](https://github.com/heronshoes/docker-stacks/blob/RedAmber-binder/binder/examples_of_red_amber.ipynb)
|
200
|
+
([raw file](https://raw.githubusercontent.com/heronshoes/docker-stacks/RedAmber-binder/binder/examples_of_red_amber.ipynb)) shows more examples in jupyter notebook.
|
185
201
|
|
186
202
|
You can try this notebook on [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb).
|
187
203
|
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb)
|
data/benchmark/basic.yml
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
contexts:
|
2
|
+
- name: HEAD
|
3
|
+
prelude: |
|
4
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
5
|
+
- gems:
|
6
|
+
red_amber: 0.2.0
|
7
|
+
- gems:
|
8
|
+
red_amber: 0.1.5
|
9
|
+
|
10
|
+
prelude: |
|
11
|
+
require 'red_amber'
|
12
|
+
require 'datasets-arrow'
|
13
|
+
|
14
|
+
ds = Datasets::Rdatasets.new('nycflights13', 'flights')
|
15
|
+
df = RedAmber::DataFrame.new(ds.to_arrow)
|
16
|
+
|
17
|
+
slicer = df[:distance] > 1000
|
18
|
+
distance_km = df[:distance] * 1.852
|
19
|
+
|
20
|
+
benchmark:
|
21
|
+
'B01: Pick([]) by a key name': |
|
22
|
+
df[:flight]
|
23
|
+
|
24
|
+
'B02: Pick by index': |
|
25
|
+
df[df.keys[9]]
|
26
|
+
|
27
|
+
'B03: Pick by key names': |
|
28
|
+
df.pick(:carrier, :flight)
|
29
|
+
|
30
|
+
'B04: Drop by key names': |
|
31
|
+
df.drop(:year, :month, :day)
|
32
|
+
|
33
|
+
'B05: Pick by booleans': |
|
34
|
+
df.pick(df.vectors.map(&:string?))
|
35
|
+
|
36
|
+
'B06: Pick by a block': |
|
37
|
+
df.pick { keys.map { |key| key.end_with?('time') } }
|
38
|
+
|
39
|
+
'B07: Slice([]) by a index': |
|
40
|
+
df[877]
|
41
|
+
|
42
|
+
'B08: Slice by indices': |
|
43
|
+
df.slice(0...5, -5..-1)
|
44
|
+
|
45
|
+
'B09: Slice([]) by booleans': |
|
46
|
+
df[slicer]
|
47
|
+
|
48
|
+
'B10: Slice by booleans': |
|
49
|
+
df.slice(slicer)
|
50
|
+
|
51
|
+
'B11: Remove by booleans': |
|
52
|
+
df.remove(slicer)
|
53
|
+
|
54
|
+
'B12: Slice by a block': |
|
55
|
+
df.slice { slicer }
|
56
|
+
|
57
|
+
'B13: Rename by Hash': |
|
58
|
+
df.rename(distance: :distance_mile)
|
59
|
+
|
60
|
+
'B14: Assign an existing variable': |
|
61
|
+
df.assign(distance: distance_km)
|
62
|
+
|
63
|
+
'B15: Assign a new variable': |
|
64
|
+
df.assign(distance_km: distance_km)
|
65
|
+
|
66
|
+
'B16: Sort by a key': |
|
67
|
+
df.sort(:distance)
|
68
|
+
|
69
|
+
'B17: Sort by keys': |
|
70
|
+
df.sort(:origin, '-distance')
|
71
|
+
|
72
|
+
'B18: Convert to a Hash': |
|
73
|
+
df.to_h
|
74
|
+
|
75
|
+
'B19: Output in TDR style': |
|
76
|
+
df.tdr
|
77
|
+
|
78
|
+
'B20: Inspect': |
|
79
|
+
df.inspect
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# --repeat-count 3
|
2
|
+
|
3
|
+
loop_count: 3
|
4
|
+
|
5
|
+
contexts:
|
6
|
+
- name: HEAD
|
7
|
+
prelude: |
|
8
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
9
|
+
# - gems:
|
10
|
+
# red_amber: 0.2.3
|
11
|
+
|
12
|
+
prelude: |
|
13
|
+
require 'red_amber'
|
14
|
+
include RedAmber
|
15
|
+
require 'datasets-arrow'
|
16
|
+
|
17
|
+
package = 'nycflights13'
|
18
|
+
airlines = DataFrame.new(Datasets::Rdatasets.new(package, 'airlines'))
|
19
|
+
airports = DataFrame.new(Datasets::Rdatasets.new(package, 'airports'))
|
20
|
+
flights = DataFrame.new(Datasets::Rdatasets.new(package, 'flights'))
|
21
|
+
.pick(%i[month day carrier flight tailnum origin dest air_time distance])
|
22
|
+
planes = DataFrame.new(Datasets::Rdatasets.new(package, 'planes'))
|
23
|
+
weather = DataFrame.new(Datasets::Rdatasets.new(package, 'weather'))
|
24
|
+
|
25
|
+
flights_Q1 = flights.slice { month <= 3 }
|
26
|
+
flights_Q2 = flights.slice { month > 3 }
|
27
|
+
|
28
|
+
flights_1_2 = flights_Q1.slice { month.is_in(1, 2) }
|
29
|
+
flights_1_3 = flights_Q1.slice { month.is_in(1, 3) }
|
30
|
+
|
31
|
+
flights_left = flights_Q1.pick(...5)
|
32
|
+
flights_right = flights_Q1.pick(5..)
|
33
|
+
|
34
|
+
benchmark:
|
35
|
+
'C01: Inner join on flights_Q1 by carrier': |
|
36
|
+
flights_Q1.inner_join(airlines, :carrier)
|
37
|
+
|
38
|
+
'C02: Full join on flights_Q1 by planes': |
|
39
|
+
flights_Q1.full_join(planes, :tailnum)
|
40
|
+
|
41
|
+
'C03: Left join on flights_Q1 by planes': |
|
42
|
+
flights_Q1.left_join(planes, :tailnum)
|
43
|
+
|
44
|
+
'C04: Semi join on flights_Q1 by planes': |
|
45
|
+
flights_Q1.semi_join(planes, :tailnum)
|
46
|
+
|
47
|
+
'C05: Anti join on flights_Q1 by planes': |
|
48
|
+
flights_Q1.anti_join(planes, :tailnum)
|
49
|
+
|
50
|
+
'C06: Intersection of flights_1_2 and flights_1_3': |
|
51
|
+
flights_1_2.intersect(flights_1_3)
|
52
|
+
|
53
|
+
'C07: Union of flights_1_2 and flights_1_3': |
|
54
|
+
flights_1_2.union(flights_1_3)
|
55
|
+
|
56
|
+
'C08: Difference between flights_1_2 and flights_1_3': |
|
57
|
+
flights_1_2.difference(flights_1_3)
|
58
|
+
|
59
|
+
'C09: Concatenate flight_Q1 on flight_Q2': |
|
60
|
+
flights_Q1.concatenate(flights_Q2)
|
61
|
+
|
62
|
+
'C10: Merge flights_Q1_right on flights_Q1_left': |
|
63
|
+
flights_left.merge(flights_right)
|
data/benchmark/drop_nil.yml
CHANGED
@@ -1,11 +1,23 @@
|
|
1
|
+
contexts:
|
2
|
+
- gems:
|
3
|
+
red_amber: 0.1.8
|
4
|
+
- gems:
|
5
|
+
red_amber: 0.2.2
|
6
|
+
- name: HEAD
|
7
|
+
prelude: |
|
8
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
9
|
+
require 'red_amber'
|
10
|
+
|
1
11
|
prelude: |
|
2
12
|
require 'datasets-arrow'
|
3
13
|
require 'red_amber'
|
4
14
|
|
5
15
|
penguins = RedAmber::DataFrame.new(Datasets::Penguins.new.to_arrow)
|
6
16
|
|
7
|
-
def
|
8
|
-
penguins.remove { vectors.map
|
17
|
+
def remove_nil(penguins)
|
18
|
+
penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
|
9
19
|
end
|
10
20
|
|
11
|
-
benchmark:
|
21
|
+
benchmark:
|
22
|
+
'Remove and reduce': remove_nil(penguins)
|
23
|
+
'remove_nil method': penguins.remove_nil
|
data/benchmark/group.yml
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
contexts:
|
2
|
+
- name: HEAD
|
3
|
+
prelude: |
|
4
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
5
|
+
- gems:
|
6
|
+
red_amber: 0.2.2
|
7
|
+
|
8
|
+
prelude: |
|
9
|
+
require 'red_amber'
|
10
|
+
require 'datasets-arrow'
|
11
|
+
|
12
|
+
ds = Datasets::Rdatasets.new('nycflights13', 'flights')
|
13
|
+
df = RedAmber::DataFrame.new(ds.to_arrow)
|
14
|
+
.assign(:flight) { flight.map(&:to_s) }
|
15
|
+
|
16
|
+
slicer = df[:distance] > 1000
|
17
|
+
distance_km = df[:distance] * 1.852
|
18
|
+
|
19
|
+
benchmark:
|
20
|
+
'G01: sum distance by destination': |
|
21
|
+
df.group(:dest).sum(:distance)
|
22
|
+
|
23
|
+
'G02: sum arr_delay by month and day': |
|
24
|
+
df.group(:month, :day).sum(:arr_delay)
|
25
|
+
|
26
|
+
'G03: sum arr_delay, mean distance by flight': |
|
27
|
+
df.group(:flight) { [sum(:arr_delay), mean(:distance)] }
|
28
|
+
|
29
|
+
'G04: mean air_time, distance by flight': |
|
30
|
+
df.group(:flight).mean(:air_time, :distance)
|
31
|
+
|
32
|
+
'G05: sum dep_delay, arr_delay by carrier': |
|
33
|
+
df.group(:carrier).sum(:dep_delay, :arr_delay)
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# --repeat-count 3
|
2
|
+
|
3
|
+
contexts:
|
4
|
+
- name: HEAD
|
5
|
+
prelude: |
|
6
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
7
|
+
- gems:
|
8
|
+
red_amber: 0.2.2
|
9
|
+
|
10
|
+
prelude: |
|
11
|
+
require 'red_amber'
|
12
|
+
require 'datasets-arrow'
|
13
|
+
|
14
|
+
ds = Datasets::Rdatasets.new('tidyr', 'billboard')
|
15
|
+
df = RedAmber::DataFrame.new(ds.to_arrow)
|
16
|
+
sub_df = df.pick(:track, df.keys.select{ |k| k.start_with? 'wk' })
|
17
|
+
long_df = df.to_long(:artist, :track, :'date.entered', name: :week, value: :rank)
|
18
|
+
|
19
|
+
benchmark:
|
20
|
+
'R01: Transpose a DataFrame': |
|
21
|
+
sub_df.transpose(name: :week)
|
22
|
+
|
23
|
+
'R02: Reshape to longer DataFrame': |
|
24
|
+
df.to_long(:artist, :track, :'date.entered', name: :week, value: :rank)
|
25
|
+
|
26
|
+
'R03: Reshape to wider DataFrame': |
|
27
|
+
long_df.to_wide(name: :week, value: :rank)
|
@@ -2,12 +2,12 @@ prelude: |
|
|
2
2
|
require 'rover'
|
3
3
|
require 'red_amber'
|
4
4
|
|
5
|
-
penguins_csv = '
|
5
|
+
penguins_csv = 'tmp/penguins.csv'
|
6
6
|
|
7
7
|
unless File.exist?(penguins_csv)
|
8
8
|
require 'datasets-arrow'
|
9
|
-
|
10
|
-
RedAmber::DataFrame.new(
|
9
|
+
ds = Datasets::Penguins.new
|
10
|
+
RedAmber::DataFrame.new(ds).save(penguins_csv)
|
11
11
|
end
|
12
12
|
|
13
13
|
benchmark:
|