red_amber 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a16699a945f41bf98790f698998126cc6b4a5e916eccb805e78448ec029f9310
4
- data.tar.gz: 5e7fa732f64567fd85e5a74b046e80861824f13d15dc910278b6c62359db9a22
3
+ metadata.gz: 6f70451aad21c0750fb2a0bfe165baf5862ac3837541135cf9d58df4ecd732ac
4
+ data.tar.gz: c31f143278bf3792bc03e0e727e12df72fca5a001b3d6c098c3f028df456e2f0
5
5
  SHA512:
6
- metadata.gz: 6ae7a6e3a8015b6b9736fb934526d9dc96b43830f0890ccbc16e175e539a8df1053432a63dde84a31dbd3a170aa6256b681127c510117723427bce815568c981
7
- data.tar.gz: a0e7d86a7bdc6be7ec493ef5331ced5ecf4e6b89458f4252f208435905a7e4e80a088a718098073fb0c65c86d76297c70c978cd4dec28b1eb1a0d915bb7e3608
6
+ metadata.gz: c5026422e8f0c0b1b1b25f6baa97b540287937a3c0d224a16ce13c17c16a11ccd54682682f17fdf5e176190f7db40bfda7bbe5659e89a212eed8f1bf9b3567e9
7
+ data.tar.gz: 82b500a1570b8fc8925a7c988bd7d3f3db677588537ee9f8d75f965ca98f5d730a0a55be5fa6637e3beeb4aed11b55a9afd3e4dfd52aee687e123f390f0d8d2b
data/.rubocop.yml CHANGED
@@ -60,6 +60,7 @@ Layout/LineLength:
60
60
  Metrics/AbcSize:
61
61
  Max: 30
62
62
  Exclude:
63
+ - 'lib/red_amber/data_frame_combinable.rb' # Max: 43
63
64
  - 'lib/red_amber/data_frame_displayable.rb' # Max: 55
64
65
  - 'lib/red_amber/data_frame_reshaping.rb' # Max 40.91
65
66
  - 'lib/red_amber/data_frame_selectable.rb' # Max: 51
@@ -80,11 +81,13 @@ Metrics/ClassLength:
80
81
  - 'test/**/*'
81
82
  - 'lib/red_amber/data_frame.rb' #Max: 131
82
83
  - 'lib/red_amber/vector.rb' #Max: 102
84
+ - 'lib/red_amber/group.rb' #Max: 103
83
85
 
84
86
  # Max: 7
85
87
  Metrics/CyclomaticComplexity:
86
88
  Max: 12
87
89
  Exclude:
90
+ - 'lib/red_amber/data_frame_combinable.rb' # Max: 15
88
91
  - 'lib/red_amber/data_frame_displayable.rb' # Max: 18
89
92
  - 'lib/red_amber/data_frame_selectable.rb' # Max: 14
90
93
  - 'lib/red_amber/helper.rb' # Max: 15
@@ -95,6 +98,7 @@ Metrics/CyclomaticComplexity:
95
98
  Metrics/MethodLength:
96
99
  Max: 30
97
100
  Exclude:
101
+ - 'lib/red_amber/data_frame_combinable.rb' # Max: 38
98
102
  - 'lib/red_amber/data_frame_displayable.rb' # Max: 33
99
103
  - 'lib/red_amber/data_frame_selectable.rb' # Max: 38
100
104
  - 'lib/red_amber/data_frame_variable_operation.rb' # Max: 35
@@ -103,15 +107,23 @@ Metrics/MethodLength:
103
107
  Metrics/ModuleLength:
104
108
  Max: 100
105
109
  Exclude:
110
+ - 'lib/red_amber/data_frame_combinable.rb' # Max: 108
106
111
  - 'lib/red_amber/data_frame_displayable.rb' # Max: 132
107
112
  - 'lib/red_amber/data_frame_selectable.rb' # Max: 141
108
113
  - 'lib/red_amber/data_frame_variable_operation.rb' # Max: 110
109
114
  - 'lib/red_amber/vector_functions.rb' # Max: 114
110
115
 
116
+ # Max: 5
117
+ Metrics/ParameterLists:
118
+ Max: 6
119
+ # Exclude:
120
+ # - 'lib/red_amber/data_frame_combinable.rb' # Max: 6
121
+
111
122
  # Max: 8
112
123
  Metrics/PerceivedComplexity:
113
124
  Max: 13
114
125
  Exclude:
126
+ - 'lib/red_amber/data_frame_combinable.rb' # Max: 14
115
127
  - 'lib/red_amber/data_frame_selectable.rb' # Max: 14
116
128
  - 'lib/red_amber/helper.rb' # Max: 15
117
129
  - 'lib/red_amber/vector_updatable.rb' # Max: 15
data/CHANGELOG.md CHANGED
@@ -1,35 +1,137 @@
1
+ ## [0.2.4] - 2022-12-25 (unreleased)
2
+
3
+ ## [0.2.3] - 2022-11-16
4
+
5
+ - Bug fixes
6
+
7
+ - Fix DataFrame#to_s when DataFrame.size == 0 (#125)
8
+ - Remove unused lines in funcs (#128)
9
+ - Remove unused methods in helper (#128)
10
+ - Add test for invalid arg in DataFrame.new (#128)
11
+ - Add test for Vector#shift(0) (#128)
12
+ - Fix bugs for DataFrame#[], #pick and #drop with Range of Symbols and Symbol (#135)
13
+
14
+ - New features and improvements
15
+
16
+ - Upgrade dependency to Arrow 10.0.0 (#132)
17
+
18
+ Since 0.2.3, it is possible to initialize a DataFrame from objects that respond to `to_arrow`.
19
+ Arrays in Numo::NArray respond to `to_arrow` with Red Arrow Numo::NArray 0.0.6.
20
+ This feature is proposed by the Red Data Tools member @kojix2 and implemented by @kou.
21
+ I also made Vector respond to `to_arrow` and `to_arrow_array`.
22
+ It becomes a member of ducks ('quack quack'). Thanks!
23
+
24
+ - Change dev dependency to red-dataset-arrow (#117)
25
+ - Add dev dependency for red-arrow-numo-narray (#132)
26
+ - Support Numo::NArray in Vector.new (#132)
27
+ - Support Vector#to_arrow_array (#132)
28
+
29
+ - Update group (#118)
30
+ - Introduce new DataFrame group support (experimental)
31
+
32
+ This additional API will treat a grouped DataFrame as a list of DataFrames.
33
+ I think this API has pros such as:
34
+ - API is easy to understand and flexible.
35
+ - It has good compatibility with Ruby's primitive Enumerables.
36
+ - We can only use non hash-ed aggregation functions.
37
+ - Do not need grouped DataFrame state, nor `#ungroup` method.
38
+ - May be useful for concurrent operations.
39
+
40
+ This feature is implemented in Ruby, so it is pretty slow and experimental.
41
+ Use the original Group API for practical purposes.
42
+
43
+ - `include Enumerable` to Group (experimental)
44
+ - Add Group#each, #inspect
45
+ - Refactor Group to align with Arrow
46
+
47
+ - Introduce DataFrame combining methods (#125)
48
+ - Introduce DataFrame#concatenate method
49
+ - Add DataFrame#merge method
50
+ - Add DataFrame#inner_join method
51
+ - Add DataFrame#full_join method
52
+ - Add DataFrame#left_join method
53
+ - Add DataFrame#right_join method
54
+ - Add DataFrame#semi_join method
55
+ - Add DataFrame#anti_join method
56
+ - Add DataFrame#intersect method
57
+ - Add DataFrame#union method
58
+ - Add DataFrame#setdiff method
59
+ - Rename #setdiff to #difference
60
+ - Support natural join in DataFrame#join
61
+ - Support partial join_key and renaming
62
+ - Fix DataFrame#join to merge key columns
63
+ - Add DataFrame#set_operable? method
64
+ - Add join/set/bind image to DataFrame.md
65
+ - Fix DataFrame#join, #right_semi, #right_anti (#128)
66
+
67
+ - Miscellaneous
68
+ - Return Vector in DataFrame#indices (#118)
69
+
70
+ - Improve tests/ci
71
+
72
+ - Improve CI
73
+ - Add CI test on macOS (#133)
74
+ - Enable bundler-cache on macOS (#128)
75
+ - Add install gobject introspection prior to glib in CI (#133)
76
+ This will stabilize CI system installation especially with cache.
77
+
78
+ - Rename workflows/test.yml to ci.yml (#133)
79
+ - Fix link in CI badge of README.md (#118)
80
+
81
+ - Add github action for coverage (#128)
82
+
83
+ - Add benchmark
84
+ - Add benchmarks with Rover (#118)
85
+ - Introduce benchmark suite (#134)
86
+ - Add benchmark for combining operations (#134)
87
+
88
+ - Measuring test coverage
89
+ - Add test coverage measurement (#128)
90
+
91
+ - Refactoring
92
+
93
+ - Remove redundant string escape in `test_vector_function` (#132)
94
+ - Refine tests to use `assert_equal_array` (#128)
95
+ - Rewrite Vector#replace (#128)
96
+
97
+ - Documentation
98
+
99
+ - Update README.md for installation (#126)
100
+ - Add clause that keys must be unique in doc. (#126)
101
+ - Rows should be called as 'records' (#126)
102
+ - Update Jupyter Notebook `83 examples of RedAmber` (#135)
103
+
104
+ - GitHub site
105
+
106
+ - Update Jupyter notebooks in Binder
107
+ - Change default branch name from 'master' to 'main' (#127)
108
+
109
+ - Thanks
110
+
111
+ Ruby Association Grant committee
112
+ It is a great honor that RedAmber was selected as a project of the Ruby Association Grant 2022.
113
+
114
+
1
115
  ## [0.2.2] - 2022-10-04
2
116
 
3
117
  - Bug fixes
4
118
 
5
119
  - Return self when no replacement happen in Vector#replace. (#92)
6
-
7
120
  - Limit n-digits in to_iruby. (#111)
8
-
9
121
  - Fix displaying space in to_iruby. (#111)
10
-
11
122
  - Raise error if key is duplicated. (#113)
12
-
13
123
  - Fix DataFrame#pick/#drop with endless Range. (#113)
14
-
15
124
  - Change type from dictionary to string in DataFrame reshaping methods. (#113)
16
-
17
125
  - Fix arguments parser to accept Enumerator. (#114)
18
126
 
19
127
  - New features and improvements
20
128
 
21
129
  - Support to make a data frame from a to_arrow-responsible object. (#106) [Patch by Kenta Murata]
22
-
23
130
  - Introduce DataFrame#auto_cast (experimental feature) (#105)
24
-
25
131
  - Change default name in DataFrame#transpose, #to_long, #to_wide. (#110)
26
-
27
132
  - Add Vector#dictionary? method. (#113)
28
-
29
133
  - Add display mode 'Plain' and 'Minimum'. (#113)
30
-
31
134
  - Refactor code
32
-
33
135
  - Refine test_vector_selectable. (#92)
34
136
  - Refine test_vector_updatable. (#92)
35
137
  - Refine Vector.new. (#113)
@@ -38,7 +140,6 @@
38
140
  - Documents
39
141
 
40
142
  - Update images. (#90, #105, #113)
41
-
42
143
  - Update README to use simpler examples. (#112)
43
144
  - Update README with a new screenshot example. (#113)
44
145
 
@@ -61,39 +162,27 @@
61
162
 
62
163
  - Fix `Vector#each` with block (#66)
63
164
  `Vector#each` will return value of each element with block.
64
-
65
165
  - Fix table format at size == 9 (#67)
66
-
67
166
  - Fix to support Vector in `DataFrame#assign` (#77)
68
-
69
167
  - Add `assert_delta` functionality for `assert_with_NaN` (#78)
70
-
71
168
  - Fix Vector#is_in when self is chunked (#79)
72
-
73
169
  - Fix Array type error (uint/int) (#79)
74
170
 
75
171
  - New features and improvements
76
172
 
77
173
  - Refine `DataFrame#indices` method (#67)
78
-
79
174
  - Update DataFrame reshaping methods (#73)
80
-
81
175
  - Change default option value of DataFrame reshaping
82
-
83
176
  - Change the order of import_cars example
84
177
 
85
178
  - Add `DataFrame#method_missing` to get column vector by method (#75)
86
-
87
179
  - Add `DataFrame#method_missing` to get column (#75)
88
180
 
89
181
  - Accept both args and block in `DataFrame#assign` (#75)
90
-
91
182
  - Accept indices in `DataFrame#pick` and `DataFrame#drop` (#76)
92
183
 
93
184
  - Add `DataFrame#slice_by` method (#77)
94
-
95
185
  - Add new Vector functions (#78)
96
-
97
186
  - Add inverse trigonometric function for Vector
98
187
  - `acos`
99
188
  - `asin`
@@ -123,25 +212,19 @@
123
212
  - Bug fixes
124
213
 
125
214
  - Fix order of multiple group keys (#55)
126
-
127
215
  Only 1 group key comes to left. Other keys remain in right.
128
216
 
129
217
  - Remove optional `require` for rover (#55)
130
-
131
218
  Fix DataFrame.new for argument with Rover::DataFrame.
132
-
133
219
  - Fix occasional failure in CI (#59)
134
-
135
220
  Sometimes the CI test fails. I added -dev dependency
136
221
  in Arrow install by apt, not doing in bundler.
137
222
 
138
223
  - Fix calling :take in V#[] (#56)
139
-
140
224
  Fixed to call Arrow function :take instead of :array_take in Vector#take_by_vector. This will prevent the error below
141
225
  when called with Arrow::ChunkedArray.
142
226
 
143
227
  - Raise error renaming non existing key (#61)
144
-
145
228
  Raise an error when the specified key does not exist.
146
229
 
147
230
  - Fix DataFrame#rename #assign by array (#65)
data/Gemfile CHANGED
@@ -7,7 +7,7 @@ gemspec
7
7
  group :test do
8
8
  gem 'rake'
9
9
 
10
- gem 'red-parquet', '>= 9.0.0'
10
+ gem 'red-parquet', '~> 10.0.0'
11
11
  gem 'rover-df', '~> 0.3.0'
12
12
 
13
13
  gem 'rubocop'
@@ -21,5 +21,7 @@ group :test do
21
21
  gem 'yard'
22
22
 
23
23
  gem 'benchmark_driver'
24
- gem 'red-datasets'
24
+ gem 'red-arrow-numo-narray'
25
+ gem 'red-datasets-arrow'
26
+ gem 'simplecov'
25
27
  end
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # RedAmber
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/red_amber.svg)](https://badge.fury.io/rb/red_amber)
4
- [![Ruby](https://github.com/heronshoes/red_amber/actions/workflows/test.yml/badge.svg)](https://github.com/heronshoes/red_amber/actions/workflows/test.yml)
4
+ [![Ruby](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml/badge.svg)](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml)
5
5
  [![Discussions](https://img.shields.io/github/discussions/heronshoes/red_amber)](https://github.com/heronshoes/red_amber/discussions)
6
6
 
7
7
  A simple dataframe library for Ruby.
@@ -20,9 +20,9 @@ I recommend Ruby 3 for performance.
20
20
 
21
21
  ```ruby
22
22
  # Libraries required
23
- gem 'red-arrow', '>= 9.0.0'
23
+ gem 'red-arrow', '~> 10.0.0' # Requires Apache Arrow (see installation below)
24
24
 
25
- gem 'red-parquet', '>= 9.0.0' # Optional, if you use IO from/to parquet
25
+ gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
26
26
  gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
27
27
  ```
28
28
 
@@ -30,37 +30,52 @@ gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
30
30
 
31
31
  Install requirements before you install Red Amber.
32
32
 
33
- - Apache Arrow GLib (>= 9.0.0)
34
-
35
- - Apache Parquet GLib (>= 9.0.0) # If you use IO from/to parquet
33
+ - Apache Arrow (~> 10.0.0)
34
+ - Apache Arrow GLib (~> 10.0.0)
35
+ - Apache Parquet GLib (~> 10.0.0) # If you use IO from/to parquet
36
36
 
37
37
  See [Apache Arrow install document](https://arrow.apache.org/install/).
38
38
 
39
- Minimum installation example for the latest Ubuntu is in the ['Prepare the Apache Arrow' section in ci test](https://github.com/heronshoes/red_amber/blob/master/.github/workflows/test.yml) of Red Amber.
40
-
41
- Add this line to your Gemfile:
39
+ - Minimum installation example for the latest Ubuntu:
40
+ ```
41
+ sudo apt update
42
+ sudo apt install -y -V ca-certificates lsb-release wget
43
+ wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
44
+ sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
45
+ sudo apt update
46
+ sudo apt install -y -V libarrow-dev
47
+ sudo apt install -y -V libarrow-glib-dev
48
+ ```
49
+ - On macOS, you can install Apache Arrow C++ library using Homebrew:
50
+
51
+ ```
52
+ brew install apache-arrow
53
+ ```
54
+
55
+ and GLib (C) package with:
56
+
57
+ ```
58
+ brew install apache-arrow-glib
59
+ ```
60
+
61
+ If you prepared Apache Arrow, add these lines to your Gemfile:
42
62
 
43
63
  ```ruby
64
+ gem 'red-arrow', '~> 10.0.0'
44
65
  gem 'red_amber'
66
+ gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
67
+ gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
68
+ gem 'red-datasets-arrow' # Optional, recommended if you use Red Datasets
69
+ gem 'red-arrow-numo-narray' # Optional, recommended if you use inputs from Numo::NArray
45
70
  ```
46
71
 
47
- And then execute:
48
-
49
- ```shell
50
- bundle install
51
- ```
52
-
53
- Or install it yourself as:
54
-
55
- ```shell
56
- gem install red_amber
57
- ```
72
+ And then execute `bundle install` or install it yourself as `gem install red_amber`.
58
73
 
59
74
  ## Docker image and Jupyter Notebook
60
75
 
61
76
  [RubyData Docker Stacks](https://github.com/RubyData/docker-stacks) is available as a ready-to-run Docker image containing Jupyter and useful data tools as well as RedAmber (Thanks to @mrkn).
62
77
 
63
- Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=README.ipynb).
78
+ Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb).
64
79
  [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb)
65
80
 
66
81
 
@@ -71,7 +86,7 @@ The entity is a Red Arrow's Table object.
71
86
 
72
87
  ![dataframe model of RedAmber](doc/image/dataframe_model.png)
73
88
 
74
- Load the library.
89
+ Let's load the library and try some examples.
75
90
 
76
91
  ```ruby
77
92
  require 'red_amber' # require 'red-amber' is also OK.
@@ -101,7 +116,7 @@ diamonds = DataFrame.new(dataset) # from v0.2.2, should be `dataset.to_arrow` if
101
116
  53939 0.75 Ideal D SI2 62.2 55.0 2757 5.83 ... 3.64
102
117
  ```
103
118
 
104
- For example, we can compute mean prices per 'cut' for the data larger than 1 carat.
119
+ For example, we can compute mean prices per cut for the data larger than 1 carat.
105
120
 
106
121
  ```ruby
107
122
  df = diamonds
@@ -125,7 +140,7 @@ Arrow data is immutable, so these methods always return new objects.
125
140
  The next example will rename a column and create a new column by a simple calculation.
126
141
 
127
142
  ```ruby
128
- usdjpy = 110.0
143
+ usdjpy = 110.0 # when the yen was stronger
129
144
 
130
145
  df.rename('mean(price)': :mean_price_USD)
131
146
  .assign(:mean_price_JPY) { mean_price_USD * usdjpy }
@@ -181,7 +196,8 @@ See [Vector.md](doc/Vector.md) for details.
181
196
 
182
197
  ## Jupyter notebook
183
198
 
184
- [73 Examples of Red Amber](binder/examples_of_red_amber.ipynb) shows more examples in jupyter notebook.
199
+ [83 Examples of Red Amber](https://github.com/heronshoes/docker-stacks/blob/RedAmber-binder/binder/examples_of_red_amber.ipynb)
200
+ ([raw file](https://raw.githubusercontent.com/heronshoes/docker-stacks/RedAmber-binder/binder/examples_of_red_amber.ipynb)) shows more examples in jupyter notebook.
185
201
 
186
202
  You can try this notebook on [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb).
187
203
  [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb)
@@ -0,0 +1,79 @@
1
+ contexts:
2
+ - name: HEAD
3
+ prelude: |
4
+ $LOAD_PATH.unshift(File.expand_path('lib'))
5
+ - gems:
6
+ red_amber: 0.2.0
7
+ - gems:
8
+ red_amber: 0.1.5
9
+
10
+ prelude: |
11
+ require 'red_amber'
12
+ require 'datasets-arrow'
13
+
14
+ ds = Datasets::Rdatasets.new('nycflights13', 'flights')
15
+ df = RedAmber::DataFrame.new(ds.to_arrow)
16
+
17
+ slicer = df[:distance] > 1000
18
+ distance_km = df[:distance] * 1.852
19
+
20
+ benchmark:
21
+ 'B01: Pick([]) by a key name': |
22
+ df[:flight]
23
+
24
+ 'B02: Pick by index': |
25
+ df[df.keys[9]]
26
+
27
+ 'B03: Pick by key names': |
28
+ df.pick(:carrier, :flight)
29
+
30
+ 'B04: Drop by key names': |
31
+ df.drop(:year, :month, :day)
32
+
33
+ 'B05: Pick by booleans': |
34
+ df.pick(df.vectors.map(&:string?))
35
+
36
+ 'B06: Pick by a block': |
37
+ df.pick { keys.map { |key| key.end_with?('time') } }
38
+
39
+ 'B07: Slice([]) by an index': |
40
+ df[877]
41
+
42
+ 'B08: Slice by indices': |
43
+ df.slice(0...5, -5..-1)
44
+
45
+ 'B09: Slice([]) by booleans': |
46
+ df[slicer]
47
+
48
+ 'B10: Slice by booleans': |
49
+ df.slice(slicer)
50
+
51
+ 'B11: Remove by booleans': |
52
+ df.remove(slicer)
53
+
54
+ 'B12: Slice by a block': |
55
+ df.slice { slicer }
56
+
57
+ 'B13: Rename by Hash': |
58
+ df.rename(distance: :distance_mile)
59
+
60
+ 'B14: Assign an existing variable': |
61
+ df.assign(distance: distance_km)
62
+
63
+ 'B15: Assign a new variable': |
64
+ df.assign(distance_km: distance_km)
65
+
66
+ 'B16: Sort by a key': |
67
+ df.sort(:distance)
68
+
69
+ 'B17: Sort by keys': |
70
+ df.sort(:origin, '-distance')
71
+
72
+ 'B18: Convert to a Hash': |
73
+ df.to_h
74
+
75
+ 'B19: Output in TDR style': |
76
+ df.tdr
77
+
78
+ 'B20: Inspect': |
79
+ df.inspect
@@ -0,0 +1,63 @@
1
+ # --repeat-count 3
2
+
3
+ loop_count: 3
4
+
5
+ contexts:
6
+ - name: HEAD
7
+ prelude: |
8
+ $LOAD_PATH.unshift(File.expand_path('lib'))
9
+ # - gems:
10
+ # red_amber: 0.2.3
11
+
12
+ prelude: |
13
+ require 'red_amber'
14
+ include RedAmber
15
+ require 'datasets-arrow'
16
+
17
+ package = 'nycflights13'
18
+ airlines = DataFrame.new(Datasets::Rdatasets.new(package, 'airlines'))
19
+ airports = DataFrame.new(Datasets::Rdatasets.new(package, 'airports'))
20
+ flights = DataFrame.new(Datasets::Rdatasets.new(package, 'flights'))
21
+ .pick(%i[month day carrier flight tailnum origin dest air_time distance])
22
+ planes = DataFrame.new(Datasets::Rdatasets.new(package, 'planes'))
23
+ weather = DataFrame.new(Datasets::Rdatasets.new(package, 'weather'))
24
+
25
+ flights_Q1 = flights.slice { month <= 3 }
26
+ flights_Q2 = flights.slice { month > 3 }
27
+
28
+ flights_1_2 = flights_Q1.slice { month.is_in(1, 2) }
29
+ flights_1_3 = flights_Q1.slice { month.is_in(1, 3) }
30
+
31
+ flights_left = flights_Q1.pick(...5)
32
+ flights_right = flights_Q1.pick(5..)
33
+
34
+ benchmark:
35
+ 'C01: Inner join on flights_Q1 by carrier': |
36
+ flights_Q1.inner_join(airlines, :carrier)
37
+
38
+ 'C02: Full join on flights_Q1 by planes': |
39
+ flights_Q1.full_join(planes, :tailnum)
40
+
41
+ 'C03: Left join on flights_Q1 by planes': |
42
+ flights_Q1.left_join(planes, :tailnum)
43
+
44
+ 'C04: Semi join on flights_Q1 by planes': |
45
+ flights_Q1.semi_join(planes, :tailnum)
46
+
47
+ 'C05: Anti join on flights_Q1 by planes': |
48
+ flights_Q1.anti_join(planes, :tailnum)
49
+
50
+ 'C06: Intersection of flights_1_2 and flights_1_3': |
51
+ flights_1_2.intersect(flights_1_3)
52
+
53
+ 'C07: Union of flights_1_2 and flights_1_3': |
54
+ flights_1_2.union(flights_1_3)
55
+
56
+ 'C08: Difference between flights_1_2 and flights_1_3': |
57
+ flights_1_2.difference(flights_1_3)
58
+
59
+ 'C09: Concatenate flight_Q1 on flight_Q2': |
60
+ flights_Q1.concatenate(flights_Q2)
61
+
62
+ 'C10: Merge flights_Q1_right on flights_Q1_left': |
63
+ flights_left.merge(flights_right)
@@ -1,11 +1,23 @@
1
+ contexts:
2
+ - gems:
3
+ red_amber: 0.1.8
4
+ - gems:
5
+ red_amber: 0.2.2
6
+ - name: HEAD
7
+ prelude: |
8
+ $LOAD_PATH.unshift(File.expand_path('lib'))
9
+ require 'red_amber'
10
+
1
11
  prelude: |
2
12
  require 'datasets-arrow'
3
13
  require 'red_amber'
4
14
 
5
15
  penguins = RedAmber::DataFrame.new(Datasets::Penguins.new.to_arrow)
6
16
 
7
- def drop_nil(penguins)
8
- penguins.remove { vectors.map { |v| v.is_nil} }
17
+ def remove_nil(penguins)
18
+ penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
9
19
  end
10
20
 
11
- benchmark: drop_nil(penguins)
21
+ benchmark:
22
+ 'Remove and reduce': remove_nil(penguins)
23
+ 'remove_nil method': penguins.remove_nil
@@ -0,0 +1,33 @@
1
+ contexts:
2
+ - name: HEAD
3
+ prelude: |
4
+ $LOAD_PATH.unshift(File.expand_path('lib'))
5
+ - gems:
6
+ red_amber: 0.2.2
7
+
8
+ prelude: |
9
+ require 'red_amber'
10
+ require 'datasets-arrow'
11
+
12
+ ds = Datasets::Rdatasets.new('nycflights13', 'flights')
13
+ df = RedAmber::DataFrame.new(ds.to_arrow)
14
+ .assign(:flight) { flight.map(&:to_s) }
15
+
16
+ slicer = df[:distance] > 1000
17
+ distance_km = df[:distance] * 1.852
18
+
19
+ benchmark:
20
+ 'G01: sum distance by destination': |
21
+ df.group(:dest).sum(:distance)
22
+
23
+ 'G02: sum arr_delay by month and day': |
24
+ df.group(:month, :day).sum(:arr_delay)
25
+
26
+ 'G03: sum arr_delay, mean distance by flight': |
27
+ df.group(:flight) { [sum(:arr_delay), mean(:distance)] }
28
+
29
+ 'G04: mean air_time, distance by flight': |
30
+ df.group(:flight).mean(:air_time, :distance)
31
+
32
+ 'G05: sum dep_delay, arr_delay by carrier': |
33
+ df.group(:carrier).sum(:dep_delay, :arr_delay)
@@ -0,0 +1,27 @@
1
+ # --repeat-count 3
2
+
3
+ contexts:
4
+ - name: HEAD
5
+ prelude: |
6
+ $LOAD_PATH.unshift(File.expand_path('lib'))
7
+ - gems:
8
+ red_amber: 0.2.2
9
+
10
+ prelude: |
11
+ require 'red_amber'
12
+ require 'datasets-arrow'
13
+
14
+ ds = Datasets::Rdatasets.new('tidyr', 'billboard')
15
+ df = RedAmber::DataFrame.new(ds.to_arrow)
16
+ sub_df = df.pick(:track, df.keys.select{ |k| k.start_with? 'wk' })
17
+ long_df = df.to_long(:artist, :track, :'date.entered', name: :week, value: :rank)
18
+
19
+ benchmark:
20
+ 'R01: Transpose a DataFrame': |
21
+ sub_df.transpose(name: :week)
22
+
23
+ 'R02: Reshape to longer DataFrame': |
24
+ df.to_long(:artist, :track, :'date.entered', name: :week, value: :rank)
25
+
26
+ 'R03: Reshape to wider DataFrame': |
27
+ long_df.to_wide(name: :week, value: :rank)
@@ -2,12 +2,12 @@ prelude: |
2
2
  require 'rover'
3
3
  require 'red_amber'
4
4
 
5
- penguins_csv = 'benchmark/cache/penguins.csv'
5
+ penguins_csv = 'tmp/penguins.csv'
6
6
 
7
7
  unless File.exist?(penguins_csv)
8
8
  require 'datasets-arrow'
9
- arrow = Datasets::Penguins.new.to_arrow
10
- RedAmber::DataFrame.new(arrow).save(penguins_csv)
9
+ ds = Datasets::Penguins.new
10
+ RedAmber::DataFrame.new(ds).save(penguins_csv)
11
11
  end
12
12
 
13
13
  benchmark: