RubyGems - red_amber - Versions diffs - 0.2.2 → 0.2.3 - Mend

red_amber 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +4 -4
data/.rubocop.yml +12 -0
data/CHANGELOG.md +114 -31
data/Gemfile +4 -2
data/README.md +41 -25
data/benchmark/basic.yml +79 -0
data/benchmark/combine.yml +63 -0
data/benchmark/drop_nil.yml +15 -3
data/benchmark/group.yml +33 -0
data/benchmark/reshape.yml +27 -0
data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
data/benchmark/rover/flights.yml +23 -0
data/benchmark/rover/penguins.yml +23 -0
data/benchmark/rover/planes.yml +23 -0
data/benchmark/rover/weather.yml +23 -0
data/doc/DataFrame.md +332 -53
data/doc/Vector.md +3 -0
data/doc/image/dataframe/join.png +0 -0
data/doc/image/dataframe/set_and_bind.png +0 -0
data/doc/image/dataframe_model.png +0 -0
data/lib/red_amber/data_frame.rb +6 -5
data/lib/red_amber/data_frame_combinable.rb +283 -0
data/lib/red_amber/data_frame_displayable.rb +2 -0
data/lib/red_amber/data_frame_selectable.rb +9 -9
data/lib/red_amber/data_frame_variable_operation.rb +4 -4
data/lib/red_amber/group.rb +99 -18
data/lib/red_amber/helper.rb +1 -13
data/lib/red_amber/vector.rb +7 -0
data/lib/red_amber/vector_functions.rb +0 -8
data/lib/red_amber/vector_updatable.rb +60 -65
data/lib/red_amber/version.rb +1 -1
data/lib/red_amber.rb +1 -0
data/red_amber.gemspec +1 -1
metadata +21 -10

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a16699a945f41bf98790f698998126cc6b4a5e916eccb805e78448ec029f9310
-  data.tar.gz: 5e7fa732f64567fd85e5a74b046e80861824f13d15dc910278b6c62359db9a22
+  metadata.gz: 6f70451aad21c0750fb2a0bfe165baf5862ac3837541135cf9d58df4ecd732ac
+  data.tar.gz: c31f143278bf3792bc03e0e727e12df72fca5a001b3d6c098c3f028df456e2f0
 SHA512:
-  metadata.gz: 6ae7a6e3a8015b6b9736fb934526d9dc96b43830f0890ccbc16e175e539a8df1053432a63dde84a31dbd3a170aa6256b681127c510117723427bce815568c981
-  data.tar.gz: a0e7d86a7bdc6be7ec493ef5331ced5ecf4e6b89458f4252f208435905a7e4e80a088a718098073fb0c65c86d76297c70c978cd4dec28b1eb1a0d915bb7e3608
+  metadata.gz: c5026422e8f0c0b1b1b25f6baa97b540287937a3c0d224a16ce13c17c16a11ccd54682682f17fdf5e176190f7db40bfda7bbe5659e89a212eed8f1bf9b3567e9
+  data.tar.gz: 82b500a1570b8fc8925a7c988bd7d3f3db677588537ee9f8d75f965ca98f5d730a0a55be5fa6637e3beeb4aed11b55a9afd3e4dfd52aee687e123f390f0d8d2b

data/.rubocop.yml CHANGED Viewed

@@ -60,6 +60,7 @@ Layout/LineLength:
 Metrics/AbcSize:
   Max: 30
   Exclude:
+    - 'lib/red_amber/data_frame_combinable.rb' # Max: 43
     - 'lib/red_amber/data_frame_displayable.rb' # Max: 55
     - 'lib/red_amber/data_frame_reshaping.rb' # Max 40.91
     - 'lib/red_amber/data_frame_selectable.rb' # Max: 51
@@ -80,11 +81,13 @@ Metrics/ClassLength:
     - 'test/**/*'
     - 'lib/red_amber/data_frame.rb' #Max: 131
     - 'lib/red_amber/vector.rb' #Max: 102
+    - 'lib/red_amber/group.rb' #Max: 103
 # Max: 7
 Metrics/CyclomaticComplexity:
   Max: 12
   Exclude:
+    - 'lib/red_amber/data_frame_combinable.rb' # Max: 15
     - 'lib/red_amber/data_frame_displayable.rb' # Max: 18
     - 'lib/red_amber/data_frame_selectable.rb' # Max: 14
     - 'lib/red_amber/helper.rb' # Max: 15
@@ -95,6 +98,7 @@ Metrics/CyclomaticComplexity:
 Metrics/MethodLength:
   Max: 30
   Exclude:
+    - 'lib/red_amber/data_frame_combinable.rb' # Max: 38
     - 'lib/red_amber/data_frame_displayable.rb' # Max: 33
     - 'lib/red_amber/data_frame_selectable.rb' # Max: 38
     - 'lib/red_amber/data_frame_variable_operation.rb' # Max: 35
@@ -103,15 +107,23 @@ Metrics/MethodLength:
 Metrics/ModuleLength:
   Max: 100
   Exclude:
+    - 'lib/red_amber/data_frame_combinable.rb' # Max: 108
     - 'lib/red_amber/data_frame_displayable.rb' # Max: 132
     - 'lib/red_amber/data_frame_selectable.rb' # Max: 141
     - 'lib/red_amber/data_frame_variable_operation.rb' # Max: 110
     - 'lib/red_amber/vector_functions.rb' # Max: 114
+# Max: 5
+Metrics/ParameterLists:
+  Max: 6
+  # Exclude:
+  #   - 'lib/red_amber/data_frame_combinable.rb' # Max: 6
 # Max: 8
 Metrics/PerceivedComplexity:
   Max: 13
   Exclude:
+    - 'lib/red_amber/data_frame_combinable.rb' # Max: 14
     - 'lib/red_amber/data_frame_selectable.rb' # Max: 14
     - 'lib/red_amber/helper.rb' # Max: 15
     - 'lib/red_amber/vector_updatable.rb' # Max: 15

data/CHANGELOG.md CHANGED Viewed

@@ -1,35 +1,137 @@
+## [0.2.4] - 2022-12-25 (unreleased)
+## [0.2.3] - 2022-11-16
+- Bug fixes
+  - Fix DataFrame#to_s when DataFrame.size == 0 (#125)
+  - Remove unused lines in funcs (#128)
+  - Remove unused methods in helper (#128)
+  - Add test for invalid arg in DataFrame.new (#128)
+  - Add test for Vector#shift(0) (#128)
+  - Fix bugs for DataFrame#[], #pick and #drop with Range of Symbols and Symbol (#135)
+- New features and improvements
+  - Upgrade dependency to Arrow 10.0.0 (#132)
+    Since 0.2.3, it is possible to initialize from objects that respond to `to_arrow`.
+    Numo::NArray arrays respond to `to_arrow` with Red Arrow Numo::NArray 0.0.6.
+    This feature was proposed by the Red Data Tools member @kojix2 and implemented by @kou.
+    I also made Vector respond to `to_arrow` and `to_arrow_array`.
+    It becomes a member of the ducks ('quack quack'). Thanks!
+    - Change dev dependency to red-dataset-arrow (#117)
+    - Add dev dependency for red-arrow-numo-narray (#132)
+    - Support Numo::NArray in Vector.new (#132)
+    - Support Vector#to_arrow_array (#132)
+  - Update group (#118)
+    - Introduce new DataFrame group support (experimental)
+      This additional API will treat a grouped DataFrame as a list of DataFrames.
+      I think this API has pros such as:
+      - API is easy to understand and flexible.
+      - It has good compatibility with Ruby's primitive Enumerables.
+      - We can only use non hash-ed aggregation functions.
+      - Do not need grouped DataFrame state, nor `#ungroup` method.
+      - May be useful for concurrent operations.
+      This feature is implemented by Ruby, so it is pretty slow and experimental.
+      Use original Group API for practical purpose.
+    - `include Enumerable` to Group  (experimental)
+    - Add Group#each, #inspect
+    - Refactor Group to align with Arrow
+  - Introduce DataFrame combining methods (#125)
+    - Introduce DataFrame#concatenate method
+    - Add DataFrame#merge method
+    - Add DataFrame#inner_join method
+    - Add DataFrame#full_join method
+    - Add DataFrame#left_join method
+    - Add DataFrame#right_join method
+    - Add DataFrame#semi_join method
+    - Add DataFrame#anti_join method
+    - Add DataFrame#intersect method
+    - Add DataFrame#union method
+    - Add DataFrame#setdiff method
+      - Rename #setdiff to #difference
+    - Support natural join in DataFrame#join
+    - Support partial join_key and renaming
+    - Fix DataFrame#join to merge key columns
+    - Add DataFrame#set_operable? method
+    - Add join/set/bind image to DataFrame.md
+    - Fix DataFrame#join, #right_semi, #right_anti (#128)
+  - Miscellaneous
+    - Return Vector in DataFrame#indices (#118)
+- Improve tests/ci
+  - Improve CI
+    - Add CI test on macOS (#133)
+    - Enable bundler-cache on macOS (#128)
+    - Add install gobject introspection prior to glib in CI (#133)
+      This will stabilize CI system installation especially with cache.
+    - Rename workflows/test.yml to ci.yml (#133)
+      - Fix link in CI badge of README.md (#118)
+    - Add github action for coverage (#128)
+  - Add benchmark
+    - Add benchmarks with Rover (#118)
+    - Introduce benchmark suite (#134)
+    - Add benchmark for combining operations (#134)
+  - Measuring test coverage
+    - Add test coverage measurement (#128)
+- Refactoring
+  - Remove redundant string escape in `test_vector_function` (#132)
+  - Refine tests to use `assert_equal_array` (#128)
+  - Rewrite Vector#replace (#128)
+- Documentation
+  - Update README.md for installation (#126)
+  - Add clause that keys must be unique in doc. (#126)
+  - Rows should be called as 'records' (#126)
+  - Update Jupyter Notebook `83 examples of RedAmber` (#135)
+- GitHub site
+    - Update Jupyter notebooks in Binder
+    - Change default branch name from 'master' to 'main' (#127)
+- Thanks
+  Ruby Association Grant committee
+    It is a great honor that RedAmber was selected as a project of the Ruby Association Grant 2022.
 ## [0.2.2] - 2022-10-04
 - Bug fixes
   - Return self when no replacement happen in Vector#replace. (#92)
   - Limit n-digits in to_iruby. (#111)
   - Fix displaying space in to_iruby. (#111)
   - Raise error if key is duplicated. (#113)
   - Fix DataFrame#pick/#drop with endless Range. (#113)
   - Change type from dictionary to string in DataFrame reshaping methods. (#113)
   - Fix arguments parser to accept Enumerator. (#114)
 - New features and improvements
   - Support to make a data frame from a to_arrow-responsible object. (#106) [Patch by Kenta Murata]
   - Introduce DataFrame#auto_cast (experimental feature) (#105)
   - Change default name in DataFrame#transpose, #to_long, #to_wide. (#110)
   - Add Vector#dictionary? method. (#113)
   - Add display mode 'Plain' and 'Minimum'. (#113)
   - Refactor code
     - Refine test_vector_selectable. (#92)
     - Refine test_vector_updatable. (#92)
     - Refine Vector.new. (#113)
@@ -38,7 +140,6 @@
   - Documents
     - Update images. (#90, #105, #113)
     - Update README to use simpler examples. (#112)
       - Update README with a new screenshot example. (#113)
@@ -61,39 +162,27 @@
   - Fix `Vector#each` with block (#66)
     `Vector#each` will return value of each element with block.
   - Fix table format at size == 9 (#67)
   - Fix to support Vector in `DataFrame#assign` (#77)
   - Add `assert_delta` functionality for `assert_with_NaN` (#78)
   - Fix Vector#is_in when self is chunked (#79)
   - Fix Array type error (uint/int) (#79)
 - New features and improvements
   - Refine `DataFrame#indices` method (#67)
   - Update DataFrame reshaping methods (#73)
     - Change default option value of DataFrame reshaping
     - Change the order of import_cars example
   - Add `DataFrame#method_missing` to get column vector by method (#75)
     - Add `DataFrame#method_missing` to get column (#75)
   - Accept both args and block in `DataFrame#assign` (#75)
   - Accept indices in `DataFrame#pick` and `DataFrame#drop` (#76)
   - Add `DataFrame#slice_by` method (#77)
   - Add new Vector functions (#78)
     - Add inverse trigonometric function for Vector
       - `acos`
       - `asin`
@@ -123,25 +212,19 @@
 - Bug fixes
   - Fix order of multiple group keys (#55)
     Only 1 group key comes to left. Other keys remain in right.
   - Remove optional `require` for rover (#55)
     Fix DataFrame.new for argument with Rover::DataFrame.
   - Fix occasional failure in CI (#59)
     Sometimes the CI test fails. I added -dev dependency
     in Arrow install by apt, not doing in bundler.
   - Fix calling :take in V#[] (#56)
     Fixed to call Arrow function :take instead of :array_take in Vector#take_by_vector. This will prevent the error below
     when called with Arrow::ChunkedArray.
   - Raise error renaming non existing key (#61)
     Add error when the specified key does not exist.
   - Fix DataFrame#rename #assign by array (#65)

data/Gemfile CHANGED Viewed

@@ -7,7 +7,7 @@ gemspec
 group :test do
   gem 'rake'
-  gem 'red-parquet', '>= 9.0.0'
+  gem 'red-parquet', '~> 10.0.0'
   gem 'rover-df', '~> 0.3.0'
   gem 'rubocop'
@@ -21,5 +21,7 @@ group :test do
   gem 'yard'
   gem 'benchmark_driver'
-  gem 'red-datasets'
+  gem 'red-arrow-numo-narray'
+  gem 'red-datasets-arrow'
+  gem 'simplecov'
 end

data/README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 # RedAmber
 [![Gem Version](https://badge.fury.io/rb/red_amber.svg)](https://badge.fury.io/rb/red_amber)
-[![Ruby](https://github.com/heronshoes/red_amber/actions/workflows/test.yml/badge.svg)](https://github.com/heronshoes/red_amber/actions/workflows/test.yml)
+[![Ruby](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml/badge.svg)](https://github.com/heronshoes/red_amber/actions/workflows/ci.yml)
 [![Discussions](https://img.shields.io/github/discussions/heronshoes/red_amber)](https://github.com/heronshoes/red_amber/discussions)
 A simple dataframe library for Ruby.
@@ -20,9 +20,9 @@ I recommend Ruby 3 for performance.
 ```ruby
 # Libraries required
-gem 'red-arrow',   '>= 9.0.0'
+gem 'red-arrow',   '~> 10.0.0' # Requires Apache Arrow (see installation below)
-gem 'red-parquet', '>= 9.0.0' # Optional, if you use IO from/to parquet
+gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
 gem 'rover-df',    '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
 ```
@@ -30,37 +30,52 @@ gem 'rover-df',    '~> 0.3.0' # Optional, if you use IO from/to Rover::DataFrame
 Install requirements before you install Red Amber.
-- Apache Arrow GLib (>= 9.0.0)
-- Apache Parquet GLib (>= 9.0.0)  # If you use IO from/to parquet
+- Apache Arrow (~> 10.0.0)
+- Apache Arrow GLib (~> 10.0.0)
+- Apache Parquet GLib (~> 10.0.0)  # If you use IO from/to parquet
   See [Apache Arrow install document](https://arrow.apache.org/install/).
-  Minimum installation example for the latest Ubuntu is in the ['Prepare the Apache Arrow' section in ci test](https://github.com/heronshoes/red_amber/blob/master/.github/workflows/test.yml) of Red Amber.
-Add this line to your Gemfile:
+  - Minimum installation example for the latest Ubuntu:
+    ```
+    sudo apt update
+    sudo apt install -y -V ca-certificates lsb-release wget
+    wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
+    sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
+    sudo apt update
+    sudo apt install -y -V libarrow-dev
+    sudo apt install -y -V libarrow-glib-dev
+    ```
+  - On macOS, you can install Apache Arrow C++ library using Homebrew:
+    ```
+    brew install apache-arrow
+    ```
+    and GLib (C) package with:
+    ```
+    brew install apache-arrow-glib
+    ```
+If you prepared Apache Arrow, add these lines to your Gemfile:
 ```ruby
+gem 'red-arrow',   '~> 10.0.0'
 gem 'red_amber'
+gem 'red-parquet', '~> 10.0.0' # Optional, if you use IO from/to parquet
+gem 'rover-df',    '~> 0.3.0'  # Optional, if you use IO from/to Rover::DataFrame
+gem 'red-datasets-arrow'       # Optional, recommended if you use Red Datasets
+gem 'red-arrow-numo-narray'    # Optional, recommended if you use inputs from Numo::NArray
 ```
-And then execute:
-```shell
-bundle install
-```
-Or install it yourself as:
-```shell
-gem install red_amber
-```
+And then execute `bundle install` or install it yourself as `gem install red_amber`.
 ## Docker image and Jupyter Notebook
 [RubyData Docker Stacks](https://github.com/RubyData/docker-stacks) is available as a ready-to-run Docker image containing Jupyter and useful data tools as well as RedAmber (Thanks to @mrkn).
-Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=README.ipynb).
+Also you can try the contents of this README interactively by [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb).
 [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=red-amber.ipynb)
@@ -71,7 +86,7 @@ The entity is a Red Arrow's Table object.
 ![dataframe model of RedAmber](doc/image/dataframe_model.png)
-Load the library.
+Let's load the library and try some examples.
 ```ruby
 require 'red_amber' # require 'red-amber' is also OK.
@@ -101,7 +116,7 @@ diamonds = DataFrame.new(dataset) # from v0.2.2, should be `dataset.to_arrow` if
 53939     0.75 Ideal     D        SI2          62.2     55.0     2757     5.83 ...     3.64
 ```
-For example, we can compute mean prices per 'cut' for the data larger than 1 carat.
+For example, we can compute mean prices per cut for the data larger than 1 carat.
 ```ruby
 df = diamonds
@@ -125,7 +140,7 @@ Arrow data is immutable, so these methods always return new objects.
 The next example will rename a column and create a new column by a simple calculation.
 ```ruby
-usdjpy = 110.0
+usdjpy = 110.0 # when the yen was stronger
 df.rename('mean(price)': :mean_price_USD)
   .assign(:mean_price_JPY) { mean_price_USD * usdjpy }
@@ -181,7 +196,8 @@ See [Vector.md](doc/Vector.md) for details.
 ## Jupyter notebook
-[73 Examples of Red Amber](binder/examples_of_red_amber.ipynb) shows more examples in jupyter notebook.
+[83 Examples of Red Amber](https://github.com/heronshoes/docker-stacks/blob/RedAmber-binder/binder/examples_of_red_amber.ipynb)
+([raw file](https://raw.githubusercontent.com/heronshoes/docker-stacks/RedAmber-binder/binder/examples_of_red_amber.ipynb)) shows more examples in jupyter notebook.
 You can try this notebook on [Binder](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb).
 [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/heronshoes/docker-stacks/RedAmber-binder?filepath=examples_of_red_amber.ipynb)

data/benchmark/basic.yml ADDED Viewed

@@ -0,0 +1,79 @@
+contexts:
+  - name: HEAD
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path('lib'))
+  - gems:
+      red_amber: 0.2.0
+  - gems:
+      red_amber: 0.1.5
+prelude: |
+  require 'red_amber'
+  require 'datasets-arrow'
+  ds = Datasets::Rdatasets.new('nycflights13', 'flights')
+  df = RedAmber::DataFrame.new(ds.to_arrow)
+  slicer = df[:distance] > 1000
+  distance_km = df[:distance] * 1.852
+benchmark:
+  'B01: Pick([]) by a key name': |
+    df[:flight]
+  'B02: Pick by index': |
+    df[df.keys[9]]
+  'B03: Pick by key names': |
+    df.pick(:carrier, :flight)
+  'B04: Drop by key names': |
+    df.drop(:year, :month, :day)
+  'B05: Pick by booleans': |
+    df.pick(df.vectors.map(&:string?))
+  'B06: Pick by a block': |
+    df.pick { keys.map { |key| key.end_with?('time') } }
+  'B07: Slice([]) by an index': |
+    df[877]
+  'B08: Slice by indices': |
+    df.slice(0...5, -5..-1)
+  'B09: Slice([]) by booleans': |
+    df[slicer]
+  'B10: Slice by booleans': |
+    df.slice(slicer)
+  'B11: Remove by booleans': |
+    df.remove(slicer)
+  'B12: Slice by a block': |
+    df.slice { slicer }
+  'B13: Rename by Hash': |
+    df.rename(distance: :distance_mile)
+  'B14: Assign an existing variable': |
+    df.assign(distance: distance_km)
+  'B15: Assign a new variable': |
+    df.assign(distance_km: distance_km)
+  'B16: Sort by a key': |
+    df.sort(:distance)
+  'B17: Sort by keys': |
+    df.sort(:origin, '-distance')
+  'B18: Convert to a Hash': |
+    df.to_h
+  'B19: Output in TDR style': |
+    df.tdr
+  'B20: Inspect': |
+    df.inspect

data/benchmark/combine.yml ADDED Viewed

@@ -0,0 +1,63 @@
+# --repeat-count 3
+loop_count: 3
+contexts:
+  - name: HEAD
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path('lib'))
+  # - gems:
+  #     red_amber: 0.2.3
+prelude: |
+  require 'red_amber'
+  include RedAmber
+  require 'datasets-arrow'
+  package = 'nycflights13'
+  airlines = DataFrame.new(Datasets::Rdatasets.new(package, 'airlines'))
+  airports = DataFrame.new(Datasets::Rdatasets.new(package, 'airports'))
+  flights  = DataFrame.new(Datasets::Rdatasets.new(package, 'flights'))
+    .pick(%i[month day carrier flight tailnum origin dest air_time distance])
+  planes   = DataFrame.new(Datasets::Rdatasets.new(package, 'planes'))
+  weather  = DataFrame.new(Datasets::Rdatasets.new(package, 'weather'))
+  flights_Q1 = flights.slice { month <= 3 }
+  flights_Q2 = flights.slice { month > 3 }
+  flights_1_2 = flights_Q1.slice { month.is_in(1, 2) }
+  flights_1_3 = flights_Q1.slice { month.is_in(1, 3) }
+  flights_left = flights_Q1.pick(...5)
+  flights_right = flights_Q1.pick(5..)
+benchmark:
+  'C01: Inner join on flights_Q1 by carrier': |
+    flights_Q1.inner_join(airlines, :carrier)
+  'C02: Full join on flights_Q1 by planes': |
+    flights_Q1.full_join(planes, :tailnum)
+  'C03: Left join on flights_Q1 by planes': |
+    flights_Q1.left_join(planes, :tailnum)
+  'C04: Semi join on flights_Q1 by planes': |
+    flights_Q1.semi_join(planes, :tailnum)
+  'C05: Anti join on flights_Q1 by planes': |
+    flights_Q1.anti_join(planes, :tailnum)
+  'C06: Intersection of flights_1_2 and flights_1_3': |
+    flights_1_2.intersect(flights_1_3)
+  'C07: Union of flights_1_2 and flights_1_3': |
+    flights_1_2.union(flights_1_3)
+  'C08: Difference between flights_1_2 and flights_1_3': |
+    flights_1_2.difference(flights_1_3)
+  'C09: Concatenate flights_Q1 on flights_Q2': |
+    flights_Q1.concatenate(flights_Q2)
+  'C10: Merge flights_Q1_right on flights_Q1_left': |
+    flights_left.merge(flights_right)

data/benchmark/drop_nil.yml CHANGED Viewed

@@ -1,11 +1,23 @@
+contexts:
+  - gems:
+      red_amber: 0.1.8
+  - gems:
+      red_amber: 0.2.2
+  - name: HEAD
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path('lib'))
+      require 'red_amber'
 prelude: |
   require 'datasets-arrow'
   require 'red_amber'
   penguins = RedAmber::DataFrame.new(Datasets::Penguins.new.to_arrow)
-  def drop_nil(penguins)
-    penguins.remove { vectors.map { |v| v.is_nil} }
+  def remove_nil(penguins)
+    penguins.remove { vectors.map(&:is_nil).reduce(&:|) }
   end
-benchmark: drop_nil(penguins)
+benchmark:
+  'Remove and reduce': remove_nil(penguins)
+  'remove_nil method': penguins.remove_nil

data/benchmark/group.yml ADDED Viewed

@@ -0,0 +1,33 @@
+contexts:
+  - name: HEAD
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path('lib'))
+  - gems:
+      red_amber: 0.2.2
+prelude: |
+  require 'red_amber'
+  require 'datasets-arrow'
+  ds = Datasets::Rdatasets.new('nycflights13', 'flights')
+  df = RedAmber::DataFrame.new(ds.to_arrow)
+    .assign(:flight) { flight.map(&:to_s) }
+  slicer = df[:distance] > 1000
+  distance_km = df[:distance] * 1.852
+benchmark:
+  'G01: sum distance by destination': |
+    df.group(:dest).sum(:distance)
+  'G02: sum arr_delay by month and day': |
+    df.group(:month, :day).sum(:arr_delay)
+  'G03: sum arr_delay, mean distance by flight': |
+    df.group(:flight) { [sum(:arr_delay), mean(:distance)] }
+  'G04: mean air_time, distance by flight': |
+    df.group(:flight).mean(:air_time, :distance)
+  'G05: sum dep_delay, arr_delay by carrier': |
+    df.group(:carrier).sum(:dep_delay, :arr_delay)

data/benchmark/reshape.yml ADDED Viewed

@@ -0,0 +1,27 @@
+# --repeat-count 3
+contexts:
+  - name: HEAD
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path('lib'))
+  - gems:
+      red_amber: 0.2.2
+prelude: |
+  require 'red_amber'
+  require 'datasets-arrow'
+  ds = Datasets::Rdatasets.new('tidyr', 'billboard')
+  df = RedAmber::DataFrame.new(ds.to_arrow)
+  sub_df = df.pick(:track, df.keys.select{ |k| k.start_with? 'wk' })
+  long_df = df.to_long(:artist, :track, :'date.entered', name: :week, value: :rank)
+benchmark:
+  'R01: Transpose a DataFrame': |
+    sub_df.transpose(name: :week)
+  'R02: Reshape to longer DataFrame': |
+    df.to_long(:artist, :track, :'date.entered', name: :week, value: :rank)
+  'R03: Reshape to wider DataFrame': |
+    long_df.to_wide(name: :week, value: :rank)

data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} RENAMED Viewed

@@ -2,12 +2,12 @@ prelude: |
   require 'rover'
   require 'red_amber'
-  penguins_csv = 'benchmark/cache/penguins.csv'
+  penguins_csv = 'tmp/penguins.csv'
   unless File.exist?(penguins_csv)
     require 'datasets-arrow'
-    arrow = Datasets::Penguins.new.to_arrow
-    RedAmber::DataFrame.new(arrow).save(penguins_csv)
+    ds = Datasets::Penguins.new
+    RedAmber::DataFrame.new(ds).save(penguins_csv)
   end
 benchmark: