red_amber 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +12 -0
- data/CHANGELOG.md +114 -31
- data/Gemfile +4 -2
- data/README.md +41 -25
- data/benchmark/basic.yml +79 -0
- data/benchmark/combine.yml +63 -0
- data/benchmark/drop_nil.yml +15 -3
- data/benchmark/group.yml +33 -0
- data/benchmark/reshape.yml +27 -0
- data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
- data/benchmark/rover/flights.yml +23 -0
- data/benchmark/rover/penguins.yml +23 -0
- data/benchmark/rover/planes.yml +23 -0
- data/benchmark/rover/weather.yml +23 -0
- data/doc/DataFrame.md +332 -53
- data/doc/Vector.md +3 -0
- data/doc/image/dataframe/join.png +0 -0
- data/doc/image/dataframe/set_and_bind.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/lib/red_amber/data_frame.rb +6 -5
- data/lib/red_amber/data_frame_combinable.rb +283 -0
- data/lib/red_amber/data_frame_displayable.rb +2 -0
- data/lib/red_amber/data_frame_selectable.rb +9 -9
- data/lib/red_amber/data_frame_variable_operation.rb +4 -4
- data/lib/red_amber/group.rb +99 -18
- data/lib/red_amber/helper.rb +1 -13
- data/lib/red_amber/vector.rb +7 -0
- data/lib/red_amber/vector_functions.rb +0 -8
- data/lib/red_amber/vector_updatable.rb +60 -65
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +1 -0
- data/red_amber.gemspec +1 -1
- metadata +21 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6f70451aad21c0750fb2a0bfe165baf5862ac3837541135cf9d58df4ecd732ac
|
4
|
+
data.tar.gz: c31f143278bf3792bc03e0e727e12df72fca5a001b3d6c098c3f028df456e2f0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c5026422e8f0c0b1b1b25f6baa97b540287937a3c0d224a16ce13c17c16a11ccd54682682f17fdf5e176190f7db40bfda7bbe5659e89a212eed8f1bf9b3567e9
|
7
|
+
data.tar.gz: 82b500a1570b8fc8925a7c988bd7d3f3db677588537ee9f8d75f965ca98f5d730a0a55be5fa6637e3beeb4aed11b55a9afd3e4dfd52aee687e123f390f0d8d2b
|
data/.rubocop.yml
CHANGED
@@ -60,6 +60,7 @@ Layout/LineLength:
|
|
60
60
|
Metrics/AbcSize:
|
61
61
|
Max: 30
|
62
62
|
Exclude:
|
63
|
+
- 'lib/red_amber/data_frame_combinable.rb' # Max: 43
|
63
64
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 55
|
64
65
|
- 'lib/red_amber/data_frame_reshaping.rb' # Max 40.91
|
65
66
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 51
|
@@ -80,11 +81,13 @@ Metrics/ClassLength:
|
|
80
81
|
- 'test/**/*'
|
81
82
|
- 'lib/red_amber/data_frame.rb' #Max: 131
|
82
83
|
- 'lib/red_amber/vector.rb' #Max: 102
|
84
|
+
- 'lib/red_amber/group.rb' #Max: 103
|
83
85
|
|
84
86
|
# Max: 7
|
85
87
|
Metrics/CyclomaticComplexity:
|
86
88
|
Max: 12
|
87
89
|
Exclude:
|
90
|
+
- 'lib/red_amber/data_frame_combinable.rb' # Max: 15
|
88
91
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 18
|
89
92
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 14
|
90
93
|
- 'lib/red_amber/helper.rb' # Max: 15
|
@@ -95,6 +98,7 @@ Metrics/CyclomaticComplexity:
|
|
95
98
|
Metrics/MethodLength:
|
96
99
|
Max: 30
|
97
100
|
Exclude:
|
101
|
+
- 'lib/red_amber/data_frame_combinable.rb' # Max: 38
|
98
102
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 33
|
99
103
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 38
|
100
104
|
- 'lib/red_amber/data_frame_variable_operation.rb' # Max: 35
|
@@ -103,15 +107,23 @@ Metrics/MethodLength:
|
|
103
107
|
Metrics/ModuleLength:
|
104
108
|
Max: 100
|
105
109
|
Exclude:
|
110
|
+
- 'lib/red_amber/data_frame_combinable.rb' # Max: 108
|
106
111
|
- 'lib/red_amber/data_frame_displayable.rb' # Max: 132
|
107
112
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 141
|
108
113
|
- 'lib/red_amber/data_frame_variable_operation.rb' # Max: 110
|
109
114
|
- 'lib/red_amber/vector_functions.rb' # Max: 114
|
110
115
|
|
116
|
+
# Max: 5
|
117
|
+
Metrics/ParameterLists:
|
118
|
+
Max: 6
|
119
|
+
# Exclude:
|
120
|
+
# - 'lib/red_amber/data_frame_combinable.rb' # Max: 6
|
121
|
+
|
111
122
|
# Max: 8
|
112
123
|
Metrics/PerceivedComplexity:
|
113
124
|
Max: 13
|
114
125
|
Exclude:
|
126
|
+
- 'lib/red_amber/data_frame_combinable.rb' # Max: 14
|
115
127
|
- 'lib/red_amber/data_frame_selectable.rb' # Max: 14
|
116
128
|
- 'lib/red_amber/helper.rb' # Max: 15
|
117
129
|
- 'lib/red_amber/vector_updatable.rb' # Max: 15
|
data/CHANGELOG.md
CHANGED
@@ -1,35 +1,137 @@
|
|
1
|
+
## [0.2.4] - 2022-12-25 (unreleased)
|
2
|
+
|
3
|
+
## [0.2.3] - 2022-11-16
|
4
|
+
|
5
|
+
- Bug fixes
|
6
|
+
|
7
|
+
- Fix DataFrame#to_s when DataFrame.size == 0 (#125)
|
8
|
+
- Remove unused lines in funcs (#128)
|
9
|
+
- Remove unused methods in helper (#128)
|
10
|
+
- Add test for invalid arg in DataFrame.new (#128)
|
11
|
+
- Add test for Vector#shift(0) (#128)
|
12
|
+
- Fix bugs for DataFrame#[], #pick and #drop with Range of Symbols and Symbol (#135)
|
13
|
+
|
14
|
+
- New features and improvements
|
15
|
+
|
16
|
+
- Upgrade dependency to Arrow 10.0.0 (#132)
|
17
|
+
|
18
|
+
It is possible to initialize from objects that respond to `to_arrow` since 0.2.3.
|
19
|
+
Arrays in Numo::NArray respond to `to_arrow` with Red Arrow Numo::NArray 0.0.6.
|
20
|
+
This feature is proposed by the Red Data Tools member @kojix2 and implemented by @kou.
|
21
|
+
I also made Vector respond to `to_arrow` and `to_arrow_array`.
|
22
|
+
It becomes a member of ducks ('quack quack'). Thanks!
|
23
|
+
|
24
|
+
- Change dev dependency to red-dataset-arrow (#117)
|
25
|
+
- Add dev dependency for red-arrow-numo-narray (#132)
|
26
|
+
- Support Numo::NArray in Vector.new (#132)
|
27
|
+
- Support Vector#to_arrow_array (#132)
|
28
|
+
|
29
|
+
- Update group (#118)
|
30
|
+
- Introduce new DataFrame group support (experimental)
|
31
|
+
|
32
|
+
This additional API will treat a grouped DataFrame as a list of DataFrames.
|
33
|
+
I think this API has pros such as:
|
34
|
+
- API is easy to understand and flexible.
|
35
|
+
- It has good compatibility with Ruby's primitive Enumerables.
|
36
|
+
- We can only use non hash-ed aggregation functions.
|
37
|
+
- Do not need grouped DataFrame state, nor `#ungroup` method.
|
38
|
+
- May be useful for concurrent operations.
|
39
|
+
|
40
|
+
This feature is implemented by Ruby, so it is pretty slow and experimental.
|
41
|
+
Use original Group API for practical purpose.
|
42
|
+
|
43
|
+
- `include Enumerable` to Group (experimental)
|
44
|
+
- Add Group#each, #inspect
|
45
|
+
- Refactor Group to align with Arrow
|
46
|
+
|
47
|
+
- Introduce DataFrame combining methods (#125)
|
48
|
+
- Introduce DataFrame#concatenate method
|
49
|
+
- Add DataFrame#merge method
|
50
|
+
- Add DataFrame#inner_join method
|
51
|
+
- Add DataFrame#full_join method
|
52
|
+
- Add DataFrame#left_join method
|
53
|
+
- Add DataFrame#right_join method
|
54
|
+
- Add DataFrame#semi_join method
|
55
|
+
- Add DataFrame#anti_join method
|
56
|
+
- Add DataFrame#intersect method
|
57
|
+
- Add DataFrame#union method
|
58
|
+
- Add DataFrame#setdiff method
|
59
|
+
- Rename #setdiff to #difference
|
60
|
+
- Support natural join in DataFrame#join
|
61
|
+
- Support partial join_key and renaming
|
62
|
+
- Fix DataFrame#join to merge key columns
|
63
|
+
- Add DataFrame#set_operable? method
|
64
|
+
- Add join/set/bind image to DataFrame.md
|
65
|
+
- Fix DataFrame#join, #right_semi, #right_anti (#128)
|
66
|
+
|
67
|
+
- Miscellaneous
|
68
|
+
- Return Vector in DataFrame#indices (#118)
|
69
|
+
|
70
|
+
- Improve tests/ci
|
71
|
+
|
72
|
+
- Improve CI
|
73
|
+
- Add CI test on macOS (#133)
|
74
|
+
- Enable bundler-cache on macOS (#128)
|
75
|
+
- Add install gobject introspection prior to glib in CI (#133)
|
76
|
+
This will stabilize CI system installation especially with cache.
|
77
|
+
|
78
|
+
- Rename workflows/test.yml to ci.yml (#133)
|
79
|
+
- Fix link in CI badge of README.md (#118)
|
80
|
+
|
81
|
+
- Add github action for coverage (#128)
|
82
|
+
|
83
|
+
- Add benchmark
|
84
|
+
- Add benchmarks with Rover (#118)
|
85
|
+
- Introduce benchmark suite (#134)
|
86
|
+
- Add benchmark for combining operations (#134)
|
87
|
+
|
88
|
+
- Measuring test coverage
|
89
|
+
- Add test coverage measurement (#128)
|
90
|
+
|
91
|
+
- Refactoring
|
92
|
+
|
93
|
+
- Remove redundant string escape in `test_vector_function` (#132)
|
94
|
+
- Refine tests to use `assert_equal_array` (#128)
|
95
|
+
- Rewrite Vector#replace (#128)
|
96
|
+
|
97
|
+
- Documentation
|
98
|
+
|
99
|
+
- Update README.md for installation (#126)
|
100
|
+
- Add clause that keys must be unique in doc. (#126)
|
101
|
+
- Rows should be called as 'records' (#126)
|
102
|
+
- Update Jupyter Notebook `83 examples of RedAmber` (#135)
|
103
|
+
|
104
|
+
- GitHub site
|
105
|
+
|
106
|
+
- Update Jupyter notebooks in Binder
|
107
|
+
- Change default branch name from 'master' to 'main' (#127)
|
108
|
+
|
109
|
+
- Thanks
|
110
|
+
|
111
|
+
Ruby Association Grant committee
|
112
|
+
It is a great honor that RedAmber was selected as a project of the Ruby Association Grant 2022.
|
113
|
+
|
114
|
+
|
1
115
|
## [0.2.2] - 2022-10-04
|
2
116
|
|
3
117
|
- Bug fixes
|
4
118
|
|
5
119
|
- Return self when no replacement happen in Vector#replace. (#92)
|
6
|
-
|
7
120
|
- Limit n-digits in to_iruby. (#111)
|
8
|
-
|
9
121
|
- Fix displaying space in to_iruby. (#111)
|
10
|
-
|
11
122
|
- Raise error if key is duplicated. (#113)
|
12
|
-
|
13
123
|
- Fix DataFrame#pick/#drop with endless Range. (#113)
|
14
|
-
|
15
124
|
- Change type from dictionary to string in DataFrame reshaping methods. (#113)
|
16
|
-
|
17
125
|
- Fix arguments parser to accept Enumerator. (#114)
|
18
126
|
|
19
127
|
- New features and improvements
|
20
128
|
|
21
129
|
- Support to make a data frame from a to_arrow-responsible object. (#106) [Patch by Kenta Murata]
|
22
|
-
|
23
130
|
- Introduce DataFrame#auto_cast (experimental feature) (#105)
|
24
|
-
|
25
131
|
- Change default name in DataFrame#transpose, #to_long, #to_wide. (#110)
|
26
|
-
|
27
132
|
- Add Vector#dictionary? method. (#113)
|
28
|
-
|
29
133
|
- Add display mode 'Plain' and 'Minimum'. (#113)
|
30
|
-
|
31
134
|
- Refactor code
|
32
|
-
|
33
135
|
- Refine test_vector_selectable. (#92)
|
34
136
|
- Refine test_vector_updatable. (#92)
|
35
137
|
- Refine Vector.new. (#113)
|
@@ -38,7 +140,6 @@
|
|
38
140
|
- Documents
|
39
141
|
|
40
142
|
- Update images. (#90, #105, #113)
|
41
|
-
|
42
143
|
- Update README to use simpler examples. (#112)
|
43
144
|
- Update README with a new screenshot example. (#113)
|
44
145
|
|
@@ -61,39 +162,27 @@
|
|
61
162
|
|
62
163
|
- Fix `Vector#each` with block (#66)
|
63
164
|
`Vector#each` will return value of each element with block.
|
64
|
-
|
65
165
|
- Fix table format at size == 9 (#67)
|
66
|
-
|
67
166
|
- Fix to support Vector in `DataFrame#assign` (#77)
|
68
|
-
|
69
167
|
- Add `assert_delta` functionality for `assert_with_NaN` (#78)
|
70
|
-
|
71
168
|
- Fix Vector#is_in when self is chunked (#79)
|
72
|
-
|
73
169
|
- Fix Array type error (uint/int) (#79)
|
74
170
|
|
75
171
|
- New features and improvements
|
76
172
|
|
77
173
|
- Refine `DataFrame#indices` method (#67)
|
78
|
-
|
79
174
|
- Update DataFrame reshaping methods (#73)
|
80
|
-
|
81
175
|
- Change default option value of DataFrame reshaping
|
82
|
-
|
83
176
|
- Change the order of import_cars example
|
84
177
|
|
85
178
|
- Add `DataFrame#method_missing` to get column vector by method (#75)
|
86
|
-
|
87
179
|
- Add `DataFrame#method_missing` to get column (#75)
|
88
180
|
|
89
181
|
- Accept both args and block in `DataFrame#assign` (#75)
|
90
|
-
|
91
182
|
- Accept indices in `DataFrame#pick` and `DataFrame#drop` (#76)
|
92
183
|
|
93
184
|
- Add `DataFrame#slice_by` method (#77)
|
94
|
-
|
95
185
|
- Add new Vector functions (#78)
|
96
|
-
|
97
186
|
- Add inverse trigonometric function for Vector
|
98
187
|
- `acos`
|
99
188
|
- `asin`
|
@@ -123,25 +212,19 @@
|
|
123
212
|
- Bug fixes
|
124
213
|
|
125
214
|
- Fix order of multiple group keys (#55)
|
126
|
-
|
127
215
|
Only 1 group key comes to left. Other keys remain in right.
|
128
216
|
|
129
217
|
- Remove optional `require` for rover (#55)
|
130
|
-
|
131
218
|
Fix DataFrame.new for argument with Rover::DataFrame.
|
132
|
-
|
133
219
|
- Fix occasional failure in CI (#59)
|
134
|
-
|
135
220
|
Sometimes the CI test fails. I added -dev dependency
|
136
221
|
in Arrow install by apt, not doing in bundler.
|
137
222
|
|
138
223
|
- Fix calling :take in V#[] (#56)
|
139
|
-
|
140
224
|
Fixed to call Arrow function :take instead of :array_take in Vector#take_by_vector. This will prevent the error below
|
141
225
|
when called with Arrow::ChunkedArray.
|
142
226
|
|
143
227
|
- Raise error renaming non existing key (#61)
|
144
|
-
|
145
228
|
Add error when specified key does not exist.
|
146
229
|
|
147
230
|
- Fix DataFrame#rename #assign by array (#65)
|
data/Gemfile
CHANGED
@@ -7,7 +7,7 @@ gemspec
|
|
7
7
|
group :test do
|
8
8
|
gem 'rake'
|
9
9
|
|
10
|
-
gem 'red-parquet', '
|
10
|
+
gem 'red-parquet', '~> 10.0.0'
|
11
11
|
gem 'rover-df', '~> 0.3.0'
|
12
12
|
|
13
13
|
gem 'rubocop'
|
@@ -21,5 +21,7 @@ group :test do
|
|
21
21
|
gem 'yard'
|
22
22
|
|
23
23
|
gem 'benchmark_driver'
|
24
|
-
gem 'red-
|
24
|
+
gem 'red-arrow-numo-narray'
|
25
|
+
gem 'red-datasets-arrow'
|
26
|
+
gem 'simplecov'
|
25
27
|
end
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# RedAmber
|
2
2
|
|
3
3
|
[](https://badge.fury.io/rb/red_amber)
|
4
|
-
[](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml)
|
5
5
|
[](https://github.com/heronshoes/red_amber/discussions)
|
6
6
|
|
7
7
|
A simple dataframe library for Ruby.
|
@@ -20,9 +20,9 @@ I recommend Ruby 3 for performance.
|
|
20
20
|
|
21
21
|
```ruby
|
22
22
|
# Libraries required
|
23
|
-
gem 'red-arrow', '
|
23
|
+
gem 'red-arrow', '~> 10.0.0' # Requires Apache Arrow (see installation below)
|
24
24
|
|
25
|
-
gem 'red-parquet', '
|
25
|
+
gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
|
26
26
|
gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
27
27
|
```
|
28
28
|
|
@@ -30,37 +30,52 @@ gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
|
30
30
|
|
31
31
|
Install requirements before you install Red Amber.
|
32
32
|
|
33
|
-
- Apache Arrow
|
34
|
-
|
35
|
-
- Apache Parquet GLib (
|
33
|
+
- Apache Arrow (~> 10.0.0)
|
34
|
+
- Apache Arrow GLib (~> 10.0.0)
|
35
|
+
- Apache Parquet GLib (~> 10.0.0) # If you use IO from/to parquet
|
36
36
|
|
37
37
|
See [Apache Arrow install document](https://arrow.apache.org/install/).
|
38
38
|
|
39
|
-
Minimum installation example for the latest Ubuntu
|
40
|
-
|
41
|
-
|
39
|
+
- Minimum installation example for the latest Ubuntu:
|
40
|
+
```
|
41
|
+
sudo apt update
|
42
|
+
sudo apt install -y -V ca-certificates lsb-release wget
|
43
|
+
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
|
44
|
+
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
|
45
|
+
sudo apt update
|
46
|
+
sudo apt install -y -V libarrow-dev
|
47
|
+
sudo apt install -y -V libarrow-glib-dev
|
48
|
+
```
|
49
|
+
- On macOS, you can install Apache Arrow C++ library using Homebrew:
|
50
|
+
|
51
|
+
```
|
52
|
+
brew install apache-arrow
|
53
|
+
```
|
54
|
+
|
55
|
+
and GLib (C) package with:
|
56
|
+
|
57
|
+
```
|
58
|
+
brew install apache-arrow-glib
|
59
|
+
```
|
60
|
+
|
61
|
+
If you prepared Apache Arrow, add these lines to your Gemfile:
|
42
62
|
|
43
63
|
```ruby
|
64
|
+
gem 'red-arrow', '~> 10.0.0'
|
44
65
|
gem 'red_amber'
|
66
|
+
gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
|
67
|
+
gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
|
68
|
+
gem 'red-datasets-arrow' # Optional, recommended if you use Red Datasets
|
69
|
+
gem 'red-arrow-numo-narray' # Optional, recommended if you use inputs from Numo::NArray
|
45
70
|
```
|
46
71
|
|
47
|
-
And then execute
|
48
|
-
|
49
|
-
```shell
|
50
|
-
bundle install
|
51
|
-
```
|
52
|
-
|
53
|
-
Or install it yourself as:
|
54
|
-
|
55
|
-
```shell
|
56
|
-
gem install red_amber
|
57
|
-
```
|
72
|
+
And then execute `bundle install` or install it yourself as `gem install red_amber`.
|
58
73
|
|
59
74
|
## Docker image and Jupyter Notebook
|
60
75
|
|
61
76
|
[RubyData Docker Stacks](https://github.com/RubyData/docker-stacks) is available as a ready-to-run Docker image containing Jupyter and useful data tools as well as RedAmber (Thanks to @mrkn).
|
62
77
|
|
63
|
-
Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=
|
78
|
+
Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb).
|
64
79
|
[](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb)
|
65
80
|
|
66
81
|
|
@@ -71,7 +86,7 @@ The entity is a Red Arrow's Table object.
|
|
71
86
|
|
72
87
|

|
73
88
|
|
74
|
-
|
89
|
+
Let's load the library and try some examples.
|
75
90
|
|
76
91
|
```ruby
|
77
92
|
require 'red_amber' # require 'red-amber' is also OK.
|
@@ -101,7 +116,7 @@ diamonds = DataFrame.new(dataset) # from v0.2.2, should be `dataset.to_arrow` if
|
|
101
116
|
53939 0.75 Ideal D SI2 62.2 55.0 2757 5.83 ... 3.64
|
102
117
|
```
|
103
118
|
|
104
|
-
For example, we can compute mean prices per
|
119
|
+
For example, we can compute mean prices per cut for the data larger than 1 carat.
|
105
120
|
|
106
121
|
```ruby
|
107
122
|
df = diamonds
|
@@ -125,7 +140,7 @@ Arrow data is immutable, so these methods always return new objects.
|
|
125
140
|
The next example will rename a column and create a new column by a simple calculation.
|
126
141
|
|
127
142
|
```ruby
|
128
|
-
usdjpy = 110.0
|
143
|
+
usdjpy = 110.0 # when the yen was stronger
|
129
144
|
|
130
145
|
df.rename('mean(price)': :mean_price_USD)
|
131
146
|
.assign(:mean_price_JPY) { mean_price_USD * usdjpy }
|
@@ -181,7 +196,8 @@ See [Vector.md](doc/Vector.md) for details.
|
|
181
196
|
|
182
197
|
## Jupyter notebook
|
183
198
|
|
184
|
-
[
|
199
|
+
[83 Examples of Red Amber](https://github.com/heronshoes/docker-stacks/blob/RedAmber-binder/binder/examples_of_red_amber.ipynb)
|
200
|
+
([raw file](https://raw.githubusercontent.com/heronshoes/docker-stacks/RedAmber-binder/binder/examples_of_red_amber.ipynb)) shows more examples in jupyter notebook.
|
185
201
|
|
186
202
|
You can try this notebook on [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb).
|
187
203
|
[](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb)
|
data/benchmark/basic.yml
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
contexts:
|
2
|
+
- name: HEAD
|
3
|
+
prelude: |
|
4
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
5
|
+
- gems:
|
6
|
+
red_amber: 0.2.0
|
7
|
+
- gems:
|
8
|
+
red_amber: 0.1.5
|
9
|
+
|
10
|
+
prelude: |
|
11
|
+
require 'red_amber'
|
12
|
+
require 'datasets-arrow'
|
13
|
+
|
14
|
+
ds = Datasets::Rdatasets.new('nycflights13', 'flights')
|
15
|
+
df = RedAmber::DataFrame.new(ds.to_arrow)
|
16
|
+
|
17
|
+
slicer = df[:distance] > 1000
|
18
|
+
distance_km = df[:distance] * 1.852
|
19
|
+
|
20
|
+
benchmark:
|
21
|
+
'B01: Pick([]) by a key name': |
|
22
|
+
df[:flight]
|
23
|
+
|
24
|
+
'B02: Pick by index': |
|
25
|
+
df[df.keys[9]]
|
26
|
+
|
27
|
+
'B03: Pick by key names': |
|
28
|
+
df.pick(:carrier, :flight)
|
29
|
+
|
30
|
+
'B04: Drop by key names': |
|
31
|
+
df.drop(:year, :month, :day)
|
32
|
+
|
33
|
+
'B05: Pick by booleans': |
|
34
|
+
df.pick(df.vectors.map(&:string?))
|
35
|
+
|
36
|
+
'B06: Pick by a block': |
|
37
|
+
df.pick { keys.map { |key| key.end_with?('time') } }
|
38
|
+
|
39
|
+
'B07: Slice([]) by an index': |
|
40
|
+
df[877]
|
41
|
+
|
42
|
+
'B08: Slice by indices': |
|
43
|
+
df.slice(0...5, -5..-1)
|
44
|
+
|
45
|
+
'B09: Slice([]) by booleans': |
|
46
|
+
df[slicer]
|
47
|
+
|
48
|
+
'B10: Slice by booleans': |
|
49
|
+
df.slice(slicer)
|
50
|
+
|
51
|
+
'B11: Remove by booleans': |
|
52
|
+
df.remove(slicer)
|
53
|
+
|
54
|
+
'B12: Slice by a block': |
|
55
|
+
df.slice { slicer }
|
56
|
+
|
57
|
+
'B13: Rename by Hash': |
|
58
|
+
df.rename(distance: :distance_mile)
|
59
|
+
|
60
|
+
'B14: Assign an existing variable': |
|
61
|
+
df.assign(distance: distance_km)
|
62
|
+
|
63
|
+
'B15: Assign a new variable': |
|
64
|
+
df.assign(distance_km: distance_km)
|
65
|
+
|
66
|
+
'B16: Sort by a key': |
|
67
|
+
df.sort(:distance)
|
68
|
+
|
69
|
+
'B17: Sort by keys': |
|
70
|
+
df.sort(:origin, '-distance')
|
71
|
+
|
72
|
+
'B18: Convert to a Hash': |
|
73
|
+
df.to_h
|
74
|
+
|
75
|
+
'B19: Output in TDR style': |
|
76
|
+
df.tdr
|
77
|
+
|
78
|
+
'B20: Inspect': |
|
79
|
+
df.inspect
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# --repeat-count 3
|
2
|
+
|
3
|
+
loop_count: 3
|
4
|
+
|
5
|
+
contexts:
|
6
|
+
- name: HEAD
|
7
|
+
prelude: |
|
8
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
9
|
+
# - gems:
|
10
|
+
# red_amber: 0.2.3
|
11
|
+
|
12
|
+
prelude: |
|
13
|
+
require 'red_amber'
|
14
|
+
include RedAmber
|
15
|
+
require 'datasets-arrow'
|
16
|
+
|
17
|
+
package = 'nycflights13'
|
18
|
+
airlines = DataFrame.new(Datasets::Rdatasets.new(package, 'airlines'))
|
19
|
+
airports = DataFrame.new(Datasets::Rdatasets.new(package, 'airports'))
|
20
|
+
flights = DataFrame.new(Datasets::Rdatasets.new(package, 'flights'))
|
21
|
+
.pick(%i[month day carrier flight tailnum origin dest air_time distance])
|
22
|
+
planes = DataFrame.new(Datasets::Rdatasets.new(package, 'planes'))
|
23
|
+
weather = DataFrame.new(Datasets::Rdatasets.new(package, 'weather'))
|
24
|
+
|
25
|
+
flights_Q1 = flights.slice { month <= 3 }
|
26
|
+
flights_Q2 = flights.slice { month > 3 }
|
27
|
+
|
28
|
+
flights_1_2 = flights_Q1.slice { month.is_in(1, 2) }
|
29
|
+
flights_1_3 = flights_Q1.slice { month.is_in(1, 3) }
|
30
|
+
|
31
|
+
flights_left = flights_Q1.pick(...5)
|
32
|
+
flights_right = flights_Q1.pick(5..)
|
33
|
+
|
34
|
+
benchmark:
|
35
|
+
'C01: Inner join on flights_Q1 by carrier': |
|
36
|
+
flights_Q1.inner_join(airlines, :carrier)
|
37
|
+
|
38
|
+
'C02: Full join on flights_Q1 by planes': |
|
39
|
+
flights_Q1.full_join(planes, :tailnum)
|
40
|
+
|
41
|
+
'C03: Left join on flights_Q1 by planes': |
|
42
|
+
flights_Q1.left_join(planes, :tailnum)
|
43
|
+
|
44
|
+
'C04: Semi join on flights_Q1 by planes': |
|
45
|
+
flights_Q1.semi_join(planes, :tailnum)
|
46
|
+
|
47
|
+
'C05: Anti join on flights_Q1 by planes': |
|
48
|
+
flights_Q1.anti_join(planes, :tailnum)
|
49
|
+
|
50
|
+
'C06: Intersection of flights_1_2 and flights_1_3': |
|
51
|
+
flights_1_2.intersect(flights_1_3)
|
52
|
+
|
53
|
+
'C07: Union of flights_1_2 and flights_1_3': |
|
54
|
+
flights_1_2.union(flights_1_3)
|
55
|
+
|
56
|
+
'C08: Difference between flights_1_2 and flights_1_3': |
|
57
|
+
flights_1_2.difference(flights_1_3)
|
58
|
+
|
59
|
+
'C09: Concatenate flight_Q1 on flight_Q2': |
|
60
|
+
flights_Q1.concatenate(flights_Q2)
|
61
|
+
|
62
|
+
'C10: Merge flights_Q1_right on flights_Q1_left': |
|
63
|
+
flights_left.merge(flights_right)
|
data/benchmark/drop_nil.yml
CHANGED
@@ -1,11 +1,23 @@
|
|
1
|
+
contexts:
|
2
|
+
- gems:
|
3
|
+
red_amber: 0.1.8
|
4
|
+
- gems:
|
5
|
+
red_amber: 0.2.2
|
6
|
+
- name: HEAD
|
7
|
+
prelude: |
|
8
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
9
|
+
require 'red_amber'
|
10
|
+
|
1
11
|
prelude: |
|
2
12
|
require 'datasets-arrow'
|
3
13
|
require 'red_amber'
|
4
14
|
|
5
15
|
penguins = RedAmber::DataFrame.new(Datasets::Penguins.new.to_arrow)
|
6
16
|
|
7
|
-
def
|
8
|
-
penguins.remove { vectors.map
|
17
|
+
def remove_nil(penguins)
|
18
|
+
penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
|
9
19
|
end
|
10
20
|
|
11
|
-
benchmark:
|
21
|
+
benchmark:
|
22
|
+
'Remove and reduce': remove_nil(penguins)
|
23
|
+
'remove_nil method': penguins.remove_nil
|
data/benchmark/group.yml
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
contexts:
|
2
|
+
- name: HEAD
|
3
|
+
prelude: |
|
4
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
5
|
+
- gems:
|
6
|
+
red_amber: 0.2.2
|
7
|
+
|
8
|
+
prelude: |
|
9
|
+
require 'red_amber'
|
10
|
+
require 'datasets-arrow'
|
11
|
+
|
12
|
+
ds = Datasets::Rdatasets.new('nycflights13', 'flights')
|
13
|
+
df = RedAmber::DataFrame.new(ds.to_arrow)
|
14
|
+
.assign(:flight) { flight.map(&:to_s) }
|
15
|
+
|
16
|
+
slicer = df[:distance] > 1000
|
17
|
+
distance_km = df[:distance] * 1.852
|
18
|
+
|
19
|
+
benchmark:
|
20
|
+
'G01: sum distance by destination': |
|
21
|
+
df.group(:dest).sum(:distance)
|
22
|
+
|
23
|
+
'G02: sum arr_delay by month and day': |
|
24
|
+
df.group(:month, :day).sum(:arr_delay)
|
25
|
+
|
26
|
+
'G03: sum arr_delay, mean distance by flight': |
|
27
|
+
df.group(:flight) { [sum(:arr_delay), mean(:distance)] }
|
28
|
+
|
29
|
+
'G04: mean air_time, distance by flight': |
|
30
|
+
df.group(:flight).mean(:air_time, :distance)
|
31
|
+
|
32
|
+
'G05: sum dep_delay, arr_delay by carrier': |
|
33
|
+
df.group(:carrier).sum(:dep_delay, :arr_delay)
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# --repeat-count 3
|
2
|
+
|
3
|
+
contexts:
|
4
|
+
- name: HEAD
|
5
|
+
prelude: |
|
6
|
+
$LOAD_PATH.unshift(File.expand_path('lib'))
|
7
|
+
- gems:
|
8
|
+
red_amber: 0.2.2
|
9
|
+
|
10
|
+
prelude: |
|
11
|
+
require 'red_amber'
|
12
|
+
require 'datasets-arrow'
|
13
|
+
|
14
|
+
ds = Datasets::Rdatasets.new('tidyr', 'billboard')
|
15
|
+
df = RedAmber::DataFrame.new(ds.to_arrow)
|
16
|
+
sub_df = df.pick(:track, df.keys.select{ |k| k.start_with? 'wk' })
|
17
|
+
long_df = df.to_long(:artist, :track, :'date.entered', name: :week, value: :rank)
|
18
|
+
|
19
|
+
benchmark:
|
20
|
+
'R01: Transpose a DataFrame': |
|
21
|
+
sub_df.transpose(name: :week)
|
22
|
+
|
23
|
+
'R02: Reshape to longer DataFrame': |
|
24
|
+
df.to_long(:artist, :track, :'date.entered', name: :week, value: :rank)
|
25
|
+
|
26
|
+
'R03: Reshape to wider DataFrame': |
|
27
|
+
long_df.to_wide(name: :week, value: :rank)
|
@@ -2,12 +2,12 @@ prelude: |
|
|
2
2
|
require 'rover'
|
3
3
|
require 'red_amber'
|
4
4
|
|
5
|
-
penguins_csv = '
|
5
|
+
penguins_csv = 'tmp/penguins.csv'
|
6
6
|
|
7
7
|
unless File.exist?(penguins_csv)
|
8
8
|
require 'datasets-arrow'
|
9
|
-
|
10
|
-
RedAmber::DataFrame.new(
|
9
|
+
ds = Datasets::Penguins.new
|
10
|
+
RedAmber::DataFrame.new(ds).save(penguins_csv)
|
11
11
|
end
|
12
12
|
|
13
13
|
benchmark:
|