red_amber 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a16699a945f41bf98790f698998126cc6b4a5e916eccb805e78448ec029f9310
4
- data.tar.gz: 5e7fa732f64567fd85e5a74b046e80861824f13d15dc910278b6c62359db9a22
3
+ metadata.gz: 6f70451aad21c0750fb2a0bfe165baf5862ac3837541135cf9d58df4ecd732ac
4
+ data.tar.gz: c31f143278bf3792bc03e0e727e12df72fca5a001b3d6c098c3f028df456e2f0
5
5
  SHA512:
6
- metadata.gz: 6ae7a6e3a8015b6b9736fb934526d9dc96b43830f0890ccbc16e175e539a8df1053432a63dde84a31dbd3a170aa6256b681127c510117723427bce815568c981
7
- data.tar.gz: a0e7d86a7bdc6be7ec493ef5331ced5ecf4e6b89458f4252f208435905a7e4e80a088a718098073fb0c65c86d76297c70c978cd4dec28b1eb1a0d915bb7e3608
6
+ metadata.gz: c5026422e8f0c0b1b1b25f6baa97b540287937a3c0d224a16ce13c17c16a11ccd54682682f17fdf5e176190f7db40bfda7bbe5659e89a212eed8f1bf9b3567e9
7
+ data.tar.gz: 82b500a1570b8fc8925a7c988bd7d3f3db677588537ee9f8d75f965ca98f5d730a0a55be5fa6637e3beeb4aed11b55a9afd3e4dfd52aee687e123f390f0d8d2b
data/.rubocop.yml CHANGED
@@ -60,6 +60,7 @@ Layout/LineLength:
60
60
  Metrics/AbcSize:
61
61
  Max: 30
62
62
  Exclude:
63
+ - 'lib/red_amber/data_frame_combinable.rb' # Max: 43
63
64
  - 'lib/red_amber/data_frame_displayable.rb' # Max: 55
64
65
  - 'lib/red_amber/data_frame_reshaping.rb' # Max: 40.91
65
66
  - 'lib/red_amber/data_frame_selectable.rb' # Max: 51
@@ -80,11 +81,13 @@ Metrics/ClassLength:
80
81
  - 'test/**/*'
81
82
  - 'lib/red_amber/data_frame.rb' #Max: 131
82
83
  - 'lib/red_amber/vector.rb' #Max: 102
84
+ - 'lib/red_amber/group.rb' #Max: 103
83
85
 
84
86
  # Max: 7
85
87
  Metrics/CyclomaticComplexity:
86
88
  Max: 12
87
89
  Exclude:
90
+ - 'lib/red_amber/data_frame_combinable.rb' # Max: 15
88
91
  - 'lib/red_amber/data_frame_displayable.rb' # Max: 18
89
92
  - 'lib/red_amber/data_frame_selectable.rb' # Max: 14
90
93
  - 'lib/red_amber/helper.rb' # Max: 15
@@ -95,6 +98,7 @@ Metrics/CyclomaticComplexity:
95
98
  Metrics/MethodLength:
96
99
  Max: 30
97
100
  Exclude:
101
+ - 'lib/red_amber/data_frame_combinable.rb' # Max: 38
98
102
  - 'lib/red_amber/data_frame_displayable.rb' # Max: 33
99
103
  - 'lib/red_amber/data_frame_selectable.rb' # Max: 38
100
104
  - 'lib/red_amber/data_frame_variable_operation.rb' # Max: 35
@@ -103,15 +107,23 @@ Metrics/MethodLength:
103
107
  Metrics/ModuleLength:
104
108
  Max: 100
105
109
  Exclude:
110
+ - 'lib/red_amber/data_frame_combinable.rb' # Max: 108
106
111
  - 'lib/red_amber/data_frame_displayable.rb' # Max: 132
107
112
  - 'lib/red_amber/data_frame_selectable.rb' # Max: 141
108
113
  - 'lib/red_amber/data_frame_variable_operation.rb' # Max: 110
109
114
  - 'lib/red_amber/vector_functions.rb' # Max: 114
110
115
 
116
+ # Max: 5
117
+ Metrics/ParameterLists:
118
+ Max: 6
119
+ # Exclude:
120
+ # - 'lib/red_amber/data_frame_combinable.rb' # Max: 6
121
+
111
122
  # Max: 8
112
123
  Metrics/PerceivedComplexity:
113
124
  Max: 13
114
125
  Exclude:
126
+ - 'lib/red_amber/data_frame_combinable.rb' # Max: 14
115
127
  - 'lib/red_amber/data_frame_selectable.rb' # Max: 14
116
128
  - 'lib/red_amber/helper.rb' # Max: 15
117
129
  - 'lib/red_amber/vector_updatable.rb' # Max: 15
data/CHANGELOG.md CHANGED
@@ -1,35 +1,137 @@
1
+ ## [0.2.4] - 2022-12-25 (unreleased)
2
+
3
+ ## [0.2.3] - 2022-11-16
4
+
5
+ - Bug fixes
6
+
7
+ - Fix DataFrame#to_s when DataFrame.size == 0 (#125)
8
+ - Remove unused lines in funcs (#128)
9
+ - Remove unused methods in helper (#128)
10
+ - Add test for invalid arg in DataFrame.new (#128)
11
+ - Add test for Vector#shift(0) (#128)
12
+ - Fix bugs for DataFrame#[], #pick and #drop with Range of Symbols and Symbol (#135)
13
+
14
+ - New features and improvements
15
+
16
+ - Upgrade dependency to Arrow 10.0.0 (#132)
17
+
18
+ Since 0.2.3, it is possible to initialize from objects that respond to `to_arrow`.
19
+ Arrays in Numo::NArray respond to `to_arrow` with Red Arrow Numo::NArray 0.0.6.
20
+ This feature is proposed by the Red Data Tools member @kojix2 and implemented by @kou.
21
+ I also made Vector respond to `to_arrow` and `to_arrow_array`.
22
+ It becomes a member of ducks ('quack quack'). Thanks!
23
+
24
+ - Change dev dependency to red-dataset-arrow (#117)
25
+ - Add dev dependency for red-arrow-numo-narray (#132)
26
+ - Support Numo::NArray in Vector.new (#132)
27
+ - Support Vector#to_arrow_array (#132)
28
+
29
+ - Update group (#118)
30
+ - Introduce new DataFrame group support (experimental)
31
+
32
+ This additional API will treat a grouped DataFrame as a list of DataFrames.
33
+ I think this API has pros such as:
34
+ - API is easy to understand and flexible.
35
+ - It has good compatibility with Ruby's primitive Enumerables.
36
+ - We can only use non hash-ed aggregation functions.
37
+ - Do not need grouped DataFrame state, nor `#ungroup` method.
38
+ - May be useful for concurrent operations.
39
+
40
+ This feature is implemented in Ruby, so it is pretty slow and experimental.
41
+ Use the original Group API for practical purposes.
42
+
43
+ - `include Enumerable` to Group (experimental)
44
+ - Add Group#each, #inspect
45
+ - Refactor Group to align with Arrow
46
+
47
+ - Introduce DataFrame combining methods (#125)
48
+ - Introduce DataFrame#concatenate method
49
+ - Add DataFrame#merge method
50
+ - Add DataFrame#inner_join method
51
+ - Add DataFrame#full_join method
52
+ - Add DataFrame#left_join method
53
+ - Add DataFrame#right_join method
54
+ - Add DataFrame#semi_join method
55
+ - Add DataFrame#anti_join method
56
+ - Add DataFrame#intersect method
57
+ - Add DataFrame#union method
58
+ - Add DataFrame#setdiff method
59
+ - Rename #setdiff to #difference
60
+ - Support natural join in DataFrame#join
61
+ - Support partial join_key and renaming
62
+ - Fix DataFrame#join to merge key columns
63
+ - Add DataFrame#set_operable? method
64
+ - Add join/set/bind image to DataFrame.md
65
+ - Fix DataFrame#join, #right_semi, #right_anti (#128)
66
+
67
+ - Miscellaneous
68
+ - Return Vector in DataFrame#indices (#118)
69
+
70
+ - Improve tests/ci
71
+
72
+ - Improve CI
73
+ - Add CI test on macOS (#133)
74
+ - Enable bundler-cache on macOS (#128)
75
+ - Add install gobject introspection prior to glib in CI (#133)
76
+ This will stabilize CI system installation especially with cache.
77
+
78
+ - Rename workflows/test.yml to ci.yml (#133)
79
+ - Fix link in CI badge of README.md (#118)
80
+
81
+ - Add github action for coverage (#128)
82
+
83
+ - Add benchmark
84
+ - Add benchmarks with Rover (#118)
85
+ - Introduce benchmark suite (#134)
86
+ - Add benchmark for combining operations (#134)
87
+
88
+ - Measuring test coverage
89
+ - Add test coverage measurement (#128)
90
+
91
+ - Refactoring
92
+
93
+ - Remove redundant string escape in `test_vector_function` (#132)
94
+ - Refine tests to use `assert_equal_array` (#128)
95
+ - Rewrite Vector#replace (#128)
96
+
97
+ - Documentation
98
+
99
+ - Update README.md for installation (#126)
100
+ - Add clause that keys must be unique in doc. (#126)
101
+ - Rows should be called as 'records' (#126)
102
+ - Update Jupyter Notebook `83 examples of RedAmber` (#135)
103
+
104
+ - GitHub site
105
+
106
+ - Update Jupyter notebooks in Binder
107
+ - Change default branch name from 'master' to 'main' (#127)
108
+
109
+ - Thanks
110
+
111
+ Ruby Association Grant committee
112
+ It is a great honor that RedAmber was selected as a project of the Ruby Association Grant 2022.
113
+
114
+
1
115
  ## [0.2.2] - 2022-10-04
2
116
 
3
117
  - Bug fixes
4
118
 
5
119
  - Return self when no replacement happens in Vector#replace. (#92)
6
-
7
120
  - Limit n-digits in to_iruby. (#111)
8
-
9
121
  - Fix displaying space in to_iruby. (#111)
10
-
11
122
  - Raise error if key is duplicated. (#113)
12
-
13
123
  - Fix DataFrame#pick/#drop with endless Range. (#113)
14
-
15
124
  - Change type from dictionary to string in DataFrame reshaping methods. (#113)
16
-
17
125
  - Fix arguments parser to accept Enumerator. (#114)
18
126
 
19
127
  - New features and improvements
20
128
 
21
129
  - Support to make a data frame from a to_arrow-responsible object. (#106) [Patch by Kenta Murata]
22
-
23
130
  - Introduce DataFrame#auto_cast (experimental feature) (#105)
24
-
25
131
  - Change default name in DataFrame#transpose, #to_long, #to_wide. (#110)
26
-
27
132
  - Add Vector#dictionary? method. (#113)
28
-
29
133
  - Add display mode 'Plain' and 'Minimum'. (#113)
30
-
31
134
  - Refactor code
32
-
33
135
  - Refine test_vector_selectable. (#92)
34
136
  - Refine test_vector_updatable. (#92)
35
137
  - Refine Vector.new. (#113)
@@ -38,7 +140,6 @@
38
140
  - Documents
39
141
 
40
142
  - Update images. (#90, #105, #113)
41
-
42
143
  - Update README to use simpler examples. (#112)
43
144
  - Update README with a new screenshot example. (#113)
44
145
 
@@ -61,39 +162,27 @@
61
162
 
62
163
  - Fix `Vector#each` with block (#66)
63
164
  `Vector#each` will return value of each element with block.
64
-
65
165
  - Fix table format at size == 9 (#67)
66
-
67
166
  - Fix to support Vector in `DataFrame#assign` (#77)
68
-
69
167
  - Add `assert_delta` functionality for `assert_with_NaN` (#78)
70
-
71
168
  - Fix Vector#is_in when self is chunked (#79)
72
-
73
169
  - Fix Array type error (uint/int) (#79)
74
170
 
75
171
  - New features and improvements
76
172
 
77
173
  - Refine `DataFrame#indices` method (#67)
78
-
79
174
  - Update DataFrame reshaping methods (#73)
80
-
81
175
  - Change default option value of DataFrame reshaping
82
-
83
176
  - Change the order of import_cars example
84
177
 
85
178
  - Add `DataFrame#method_missing` to get column vector by method (#75)
86
-
87
179
  - Add `DataFrame#method_missing` to get column (#75)
88
180
 
89
181
  - Accept both args and block in `DataFrame#assign` (#75)
90
-
91
182
  - Accept indices in `DataFrame#pick` and `DataFrame#drop` (#76)
92
183
 
93
184
  - Add `DataFrame#slice_by` method (#77)
94
-
95
185
  - Add new Vector functions (#78)
96
-
97
186
  - Add inverse trigonometric function for Vector
98
187
  - `acos`
99
188
  - `asin`
@@ -123,25 +212,19 @@
123
212
  - Bug fixes
124
213
 
125
214
  - Fix order of multiple group keys (#55)
126
-
127
215
  Only 1 group key comes to left. Other keys remain in right.
128
216
 
129
217
  - Remove optional `require` for rover (#55)
130
-
131
218
  Fix DataFrame.new for argument with Rover::DataFrame.
132
-
133
219
  - Fix occasional failure in CI (#59)
134
-
135
220
  Sometimes the CI test fails. I added -dev dependency
136
221
  in Arrow install by apt, not doing in bundler.
137
222
 
138
223
  - Fix calling :take in V#[] (#56)
139
-
140
224
  Fixed to call Arrow function :take instead of :array_take in Vector#take_by_vector. This will prevent the error below
141
225
  when called with Arrow::ChunkedArray.
142
226
 
143
227
  - Raise error renaming non existing key (#61)
144
-
145
228
  Add error when specified key does not exist.
146
229
 
147
230
  - Fix DataFrame#rename #assign by array (#65)
data/Gemfile CHANGED
@@ -7,7 +7,7 @@ gemspec
7
7
  group :test do
8
8
  gem 'rake'
9
9
 
10
- gem 'red-parquet', '>= 9.0.0'
10
+ gem 'red-parquet', '~> 10.0.0'
11
11
  gem 'rover-df', '~> 0.3.0'
12
12
 
13
13
  gem 'rubocop'
@@ -21,5 +21,7 @@ group :test do
21
21
  gem 'yard'
22
22
 
23
23
  gem 'benchmark_driver'
24
- gem 'red-datasets'
24
+ gem 'red-arrow-numo-narray'
25
+ gem 'red-datasets-arrow'
26
+ gem 'simplecov'
25
27
  end
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # RedAmber
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/red_amber.svg)](https://badge.fury.io/rb/red_amber)
4
- [![Ruby](https://github.com/heronshoes/red_amber/actions/workflows/test.yml/badge.svg)](https://github.com/heronshoes/red_amber/actions/workflows/test.yml)
4
+ [![Ruby](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml/badge.svg)](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml)
5
5
  [![Discussions](https://img.shields.io/github/discussions/heronshoes/red_amber)](https://github.com/heronshoes/red_amber/discussions)
6
6
 
7
7
  A simple dataframe library for Ruby.
@@ -20,9 +20,9 @@ I recommend Ruby 3 for performance.
20
20
 
21
21
  ```ruby
22
22
  # Libraries required
23
- gem 'red-arrow', '>= 9.0.0'
23
+ gem 'red-arrow', '~> 10.0.0' # Requires Apache Arrow (see installation below)
24
24
 
25
- gem 'red-parquet', '>= 9.0.0' # Optional, if you use IO from/to parquet
25
+ gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
26
26
  gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
27
27
  ```
28
28
 
@@ -30,37 +30,52 @@ gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
30
30
 
31
31
  Install requirements before you install Red Amber.
32
32
 
33
- - Apache Arrow GLib (>= 9.0.0)
34
-
35
- - Apache Parquet GLib (>= 9.0.0) # If you use IO from/to parquet
33
+ - Apache Arrow (~> 10.0.0)
34
+ - Apache Arrow GLib (~> 10.0.0)
35
+ - Apache Parquet GLib (~> 10.0.0) # If you use IO from/to parquet
36
36
 
37
37
  See [Apache Arrow install document](https://arrow.apache.org/install/).
38
38
 
39
- Minimum installation example for the latest Ubuntu is in the ['Prepare the Apache Arrow' section in ci test](https://github.com/heronshoes/red_amber/blob/master/.github/workflows/test.yml) of Red Amber.
40
-
41
- Add this line to your Gemfile:
39
+ - Minimum installation example for the latest Ubuntu:
40
+ ```
41
+ sudo apt update
42
+ sudo apt install -y -V ca-certificates lsb-release wget
43
+ wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
44
+ sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
45
+ sudo apt update
46
+ sudo apt install -y -V libarrow-dev
47
+ sudo apt install -y -V libarrow-glib-dev
48
+ ```
49
+ - On macOS, you can install Apache Arrow C++ library using Homebrew:
50
+
51
+ ```
52
+ brew install apache-arrow
53
+ ```
54
+
55
+ and GLib (C) package with:
56
+
57
+ ```
58
+ brew install apache-arrow-glib
59
+ ```
60
+
61
+ Once you have prepared Apache Arrow, add these lines to your Gemfile:
42
62
 
43
63
  ```ruby
64
+ gem 'red-arrow', '~> 10.0.0'
44
65
  gem 'red_amber'
66
+ gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
67
+ gem 'rover-df', '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
68
+ gem 'red-datasets-arrow' # Optional, recommended if you use Red Datasets
69
+ gem 'red-arrow-numo-narray' # Optional, recommended if you use inputs from Numo::NArray
45
70
  ```
46
71
 
47
- And then execute:
48
-
49
- ```shell
50
- bundle install
51
- ```
52
-
53
- Or install it yourself as:
54
-
55
- ```shell
56
- gem install red_amber
57
- ```
72
+ And then execute `bundle install` or install it yourself as `gem install red_amber`.
58
73
 
59
74
  ## Docker image and Jupyter Notebook
60
75
 
61
76
  [RubyData Docker Stacks](https://github.com/RubyData/docker-stacks) is available as a ready-to-run Docker image containing Jupyter and useful data tools as well as RedAmber (Thanks to @mrkn).
62
77
 
63
- Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=README.ipynb).
78
+ Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb).
64
79
  [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb)
65
80
 
66
81
 
@@ -71,7 +86,7 @@ The entity is a Red Arrow's Table object.
71
86
 
72
87
  ![dataframe model of RedAmber](doc/image/dataframe_model.png)
73
88
 
74
- Load the library.
89
+ Let's load the library and try some examples.
75
90
 
76
91
  ```ruby
77
92
  require 'red_amber' # require 'red-amber' is also OK.
@@ -101,7 +116,7 @@ diamonds = DataFrame.new(dataset) # from v0.2.2, should be `dataset.to_arrow` if
101
116
  53939 0.75 Ideal D SI2 62.2 55.0 2757 5.83 ... 3.64
102
117
  ```
103
118
 
104
- For example, we can compute mean prices per 'cut' for the data larger than 1 carat.
119
+ For example, we can compute mean prices per cut for the data larger than 1 carat.
105
120
 
106
121
  ```ruby
107
122
  df = diamonds
@@ -125,7 +140,7 @@ Arrow data is immutable, so these methods always return new objects.
125
140
  The next example will rename a column and create a new column by a simple calculation.
126
141
 
127
142
  ```ruby
128
- usdjpy = 110.0
143
+ usdjpy = 110.0 # when the yen was stronger
129
144
 
130
145
  df.rename('mean(price)': :mean_price_USD)
131
146
  .assign(:mean_price_JPY) { mean_price_USD * usdjpy }
@@ -181,7 +196,8 @@ See [Vector.md](doc/Vector.md) for details.
181
196
 
182
197
  ## Jupyter notebook
183
198
 
184
- [73 Examples of Red Amber](binder/examples_of_red_amber.ipynb) shows more examples in jupyter notebook.
199
+ [83 Examples of Red Amber](https://github.com/heronshoes/docker-stacks/blob/RedAmber-binder/binder/examples_of_red_amber.ipynb)
200
+ ([raw file](https://raw.githubusercontent.com/heronshoes/docker-stacks/RedAmber-binder/binder/examples_of_red_amber.ipynb)) shows more examples in jupyter notebook.
185
201
 
186
202
  You can try this notebook on [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb).
187
203
  [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb)
@@ -0,0 +1,79 @@
1
+ contexts:
2
+ - name: HEAD
3
+ prelude: |
4
+ $LOAD_PATH.unshift(File.expand_path('lib'))
5
+ - gems:
6
+ red_amber: 0.2.0
7
+ - gems:
8
+ red_amber: 0.1.5
9
+
10
+ prelude: |
11
+ require 'red_amber'
12
+ require 'datasets-arrow'
13
+
14
+ ds = Datasets::Rdatasets.new('nycflights13', 'flights')
15
+ df = RedAmber::DataFrame.new(ds.to_arrow)
16
+
17
+ slicer = df[:distance] > 1000
18
+ distance_km = df[:distance] * 1.852
19
+
20
+ benchmark:
21
+ 'B01: Pick([]) by a key name': |
22
+ df[:flight]
23
+
24
+ 'B02: Pick by index': |
25
+ df[df.keys[9]]
26
+
27
+ 'B03: Pick by key names': |
28
+ df.pick(:carrier, :flight)
29
+
30
+ 'B04: Drop by key names': |
31
+ df.drop(:year, :month, :day)
32
+
33
+ 'B05: Pick by booleans': |
34
+ df.pick(df.vectors.map(&:string?))
35
+
36
+ 'B06: Pick by a block': |
37
+ df.pick { keys.map { |key| key.end_with?('time') } }
38
+
39
+ 'B07: Slice([]) by an index': |
40
+ df[877]
41
+
42
+ 'B08: Slice by indices': |
43
+ df.slice(0...5, -5..-1)
44
+
45
+ 'B09: Slice([]) by booleans': |
46
+ df[slicer]
47
+
48
+ 'B10: Slice by booleans': |
49
+ df.slice(slicer)
50
+
51
+ 'B11: Remove by booleans': |
52
+ df.remove(slicer)
53
+
54
+ 'B12: Slice by a block': |
55
+ df.slice { slicer }
56
+
57
+ 'B13: Rename by Hash': |
58
+ df.rename(distance: :distance_mile)
59
+
60
+ 'B14: Assign an existing variable': |
61
+ df.assign(distance: distance_km)
62
+
63
+ 'B15: Assign a new variable': |
64
+ df.assign(distance_km: distance_km)
65
+
66
+ 'B16: Sort by a key': |
67
+ df.sort(:distance)
68
+
69
+ 'B17: Sort by keys': |
70
+ df.sort(:origin, '-distance')
71
+
72
+ 'B18: Convert to a Hash': |
73
+ df.to_h
74
+
75
+ 'B19: Output in TDR style': |
76
+ df.tdr
77
+
78
+ 'B20: Inspect': |
79
+ df.inspect
@@ -0,0 +1,63 @@
1
+ # --repeat-count 3
2
+
3
+ loop_count: 3
4
+
5
+ contexts:
6
+ - name: HEAD
7
+ prelude: |
8
+ $LOAD_PATH.unshift(File.expand_path('lib'))
9
+ # - gems:
10
+ # red_amber: 0.2.3
11
+
12
+ prelude: |
13
+ require 'red_amber'
14
+ include RedAmber
15
+ require 'datasets-arrow'
16
+
17
+ package = 'nycflights13'
18
+ airlines = DataFrame.new(Datasets::Rdatasets.new(package, 'airlines'))
19
+ airports = DataFrame.new(Datasets::Rdatasets.new(package, 'airports'))
20
+ flights = DataFrame.new(Datasets::Rdatasets.new(package, 'flights'))
21
+ .pick(%i[month day carrier flight tailnum origin dest air_time distance])
22
+ planes = DataFrame.new(Datasets::Rdatasets.new(package, 'planes'))
23
+ weather = DataFrame.new(Datasets::Rdatasets.new(package, 'weather'))
24
+
25
+ flights_Q1 = flights.slice { month <= 3 }
26
+ flights_Q2 = flights.slice { month > 3 }
27
+
28
+ flights_1_2 = flights_Q1.slice { month.is_in(1, 2) }
29
+ flights_1_3 = flights_Q1.slice { month.is_in(1, 3) }
30
+
31
+ flights_left = flights_Q1.pick(...5)
32
+ flights_right = flights_Q1.pick(5..)
33
+
34
+ benchmark:
35
+ 'C01: Inner join on flights_Q1 by carrier': |
36
+ flights_Q1.inner_join(airlines, :carrier)
37
+
38
+ 'C02: Full join on flights_Q1 by planes': |
39
+ flights_Q1.full_join(planes, :tailnum)
40
+
41
+ 'C03: Left join on flights_Q1 by planes': |
42
+ flights_Q1.left_join(planes, :tailnum)
43
+
44
+ 'C04: Semi join on flights_Q1 by planes': |
45
+ flights_Q1.semi_join(planes, :tailnum)
46
+
47
+ 'C05: Anti join on flights_Q1 by planes': |
48
+ flights_Q1.anti_join(planes, :tailnum)
49
+
50
+ 'C06: Intersection of flights_1_2 and flights_1_3': |
51
+ flights_1_2.intersect(flights_1_3)
52
+
53
+ 'C07: Union of flights_1_2 and flights_1_3': |
54
+ flights_1_2.union(flights_1_3)
55
+
56
+ 'C08: Difference between flights_1_2 and flights_1_3': |
57
+ flights_1_2.difference(flights_1_3)
58
+
59
+ 'C09: Concatenate flights_Q1 on flights_Q2': |
60
+ flights_Q1.concatenate(flights_Q2)
61
+
62
+ 'C10: Merge flights_Q1_right on flights_Q1_left': |
63
+ flights_left.merge(flights_right)
@@ -1,11 +1,23 @@
1
+ contexts:
2
+ - gems:
3
+ red_amber: 0.1.8
4
+ - gems:
5
+ red_amber: 0.2.2
6
+ - name: HEAD
7
+ prelude: |
8
+ $LOAD_PATH.unshift(File.expand_path('lib'))
9
+ require 'red_amber'
10
+
1
11
  prelude: |
2
12
  require 'datasets-arrow'
3
13
  require 'red_amber'
4
14
 
5
15
  penguins = RedAmber::DataFrame.new(Datasets::Penguins.new.to_arrow)
6
16
 
7
- def drop_nil(penguins)
8
- penguins.remove { vectors.map { |v| v.is_nil} }
17
+ def remove_nil(penguins)
18
+ penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
9
19
  end
10
20
 
11
- benchmark: drop_nil(penguins)
21
+ benchmark:
22
+ 'Remove and reduce': remove_nil(penguins)
23
+ 'remove_nil method': penguins.remove_nil
@@ -0,0 +1,33 @@
1
+ contexts:
2
+ - name: HEAD
3
+ prelude: |
4
+ $LOAD_PATH.unshift(File.expand_path('lib'))
5
+ - gems:
6
+ red_amber: 0.2.2
7
+
8
+ prelude: |
9
+ require 'red_amber'
10
+ require 'datasets-arrow'
11
+
12
+ ds = Datasets::Rdatasets.new('nycflights13', 'flights')
13
+ df = RedAmber::DataFrame.new(ds.to_arrow)
14
+ .assign(:flight) { flight.map(&:to_s) }
15
+
16
+ slicer = df[:distance] > 1000
17
+ distance_km = df[:distance] * 1.852
18
+
19
+ benchmark:
20
+ 'G01: sum distance by destination': |
21
+ df.group(:dest).sum(:distance)
22
+
23
+ 'G02: sum arr_delay by month and day': |
24
+ df.group(:month, :day).sum(:arr_delay)
25
+
26
+ 'G03: sum arr_delay, mean distance by flight': |
27
+ df.group(:flight) { [sum(:arr_delay), mean(:distance)] }
28
+
29
+ 'G04: mean air_time, distance by flight': |
30
+ df.group(:flight).mean(:air_time, :distance)
31
+
32
+ 'G05: sum dep_delay, arr_delay by carrier': |
33
+ df.group(:carrier).sum(:dep_delay, :arr_delay)
@@ -0,0 +1,27 @@
1
+ # --repeat-count 3
2
+
3
+ contexts:
4
+ - name: HEAD
5
+ prelude: |
6
+ $LOAD_PATH.unshift(File.expand_path('lib'))
7
+ - gems:
8
+ red_amber: 0.2.2
9
+
10
+ prelude: |
11
+ require 'red_amber'
12
+ require 'datasets-arrow'
13
+
14
+ ds = Datasets::Rdatasets.new('tidyr', 'billboard')
15
+ df = RedAmber::DataFrame.new(ds.to_arrow)
16
+ sub_df = df.pick(:track, df.keys.select{ |k| k.start_with? 'wk' })
17
+ long_df = df.to_long(:artist, :track, :'date.entered', name: :week, value: :rank)
18
+
19
+ benchmark:
20
+ 'R01: Transpose a DataFrame': |
21
+ sub_df.transpose(name: :week)
22
+
23
+ 'R02: Reshape to longer DataFrame': |
24
+ df.to_long(:artist, :track, :'date.entered', name: :week, value: :rank)
25
+
26
+ 'R03: Reshape to wider DataFrame': |
27
+ long_df.to_wide(name: :week, value: :rank)
@@ -2,12 +2,12 @@ prelude: |
2
2
  require 'rover'
3
3
  require 'red_amber'
4
4
 
5
- penguins_csv = 'benchmark/cache/penguins.csv'
5
+ penguins_csv = 'tmp/penguins.csv'
6
6
 
7
7
  unless File.exist?(penguins_csv)
8
8
  require 'datasets-arrow'
9
- arrow = Datasets::Penguins.new.to_arrow
10
- RedAmber::DataFrame.new(arrow).save(penguins_csv)
9
+ ds = Datasets::Penguins.new
10
+ RedAmber::DataFrame.new(ds).save(penguins_csv)
11
11
  end
12
12
 
13
13
  benchmark: