rumale 0.21.0 → 0.22.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fd5ca16629a5258be9e577771dc8c6b42dbfdf3a60a4c43d4ee170cc17b72bea
4
- data.tar.gz: 70b26bcf0b39bb8e716b9bbb4aba100c496152b1c4e71879105046440c7d8758
3
+ metadata.gz: 4936b7c7b0ed920383f88743f8eba2e827d586dae471e40a6974dd1fe19342fe
4
+ data.tar.gz: 5a33c242b3cd881b0003db5e5f2d77905d0571442eb7494a64dff08262ce0c14
5
5
  SHA512:
6
- metadata.gz: 645a6bda6e3601534c69f5ecfbd840c1d6c1ed7a5a3b8bd57995621a03d970cd02e9749a8a70be5af2678c029a26a5e6e1c32376a4514a64e96d6a9b4b12aa3e
7
- data.tar.gz: 5904c64da9cc30cf0c288dfbeb3051bca3333e588e153cf19619c169d713e93edcb95e6902134e58c823d621ebad9e6a56310123c110b6d810033f5f96a40fbb
6
+ metadata.gz: b45a243c247610d918eeb6cfbb31c461e5773b5404c989fe7e0b8758e0482d165ea1e0cf1d61642d71233458821e1b92e45eb6ff0d0fcb11080c6c1e9692ef91
7
+ data.tar.gz: feddfc807995b08e753b1ad635901f2db8e806e300478a1f6bdb24a5bf1123cb7fbd0ee402da92ddcdd079a8ad653eec4224e22be9d2c6609ea73ea84bc47ca1
data/.coveralls.yml CHANGED
@@ -1 +1 @@
1
- service_name: travis-ci
1
+ service_name: github-ci
@@ -0,0 +1,26 @@
1
+ name: build
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+ strategy:
9
+ fail-fast: false
10
+ matrix:
11
+ ruby: [ '2.5', '2.6', '2.7', '3.0' ]
12
+ steps:
13
+ - uses: actions/checkout@v2
14
+ - name: Install BLAS and LAPACK
15
+ run: sudo apt-get install -y libopenblas-dev liblapacke-dev
16
+ - name: Set up Ruby ${{ matrix.ruby }}
17
+ uses: actions/setup-ruby@v1
18
+ with:
19
+ ruby-version: ${{ matrix.ruby }}
20
+ - name: Build and test with Rake
21
+ env:
22
+ LD_LIBRARY_PATH: '/usr/lib/x86_64-linux-gnu/'
23
+ run: |
24
+ gem install --no-document bundler
25
+ bundle install --jobs 4 --retry 3
26
+ bundle exec rake
@@ -0,0 +1,28 @@
1
+ name: coverage
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ jobs:
10
+ coverage:
11
+ runs-on: ubuntu-20.04
12
+ steps:
13
+ - uses: actions/checkout@v2
14
+ - name: Install BLAS and LAPACK
15
+ run: sudo apt-get install -y libopenblas-dev liblapacke-dev
16
+ - name: Set up Ruby 2.7
17
+ uses: actions/setup-ruby@v1
18
+ with:
19
+ ruby-version: '2.7'
20
+ - name: Build and test with Rake
21
+ run: |
22
+ gem install bundler
23
+ bundle install
24
+ bundle exec rake
25
+ - name: Coveralls GitHub Action
26
+ uses: coverallsapp/github-action@v1.1.2
27
+ with:
28
+ github-token: ${{ secrets.GITHUB_TOKEN }}
data/.gitignore CHANGED
@@ -16,6 +16,7 @@
16
16
  tags
17
17
  .DS_Store
18
18
  .ruby-version
19
+ iterate.dat
19
20
  /spec/dump_dbl.t
20
21
  /spec/dump_int.t
21
22
  /spec/dump_mult_dbl.t
data/.rubocop.yml CHANGED
@@ -1,5 +1,6 @@
1
1
  require:
2
2
  - rubocop-performance
3
+ - rubocop-rake
3
4
  - rubocop-rspec
4
5
 
5
6
  AllCops:
data/CHANGELOG.md CHANGED
@@ -1,3 +1,36 @@
1
+ # 0.22.4
2
+ - Add classifier and regressor classes for voting ensemble method.
3
+ - [VotingClassifier](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble/VotingClassifier.html)
4
+ - [VotingRegressor](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble/VotingRegressor.html)
5
+ - Refactor some code.
6
+ - Fix some typos on API documentation.
7
+
8
+ # 0.22.3
9
+ - Add regressor class for non-negative least square method.
10
+ - [NNLS](https://yoshoku.github.io/rumale/doc/Rumale/LinearModel/NNLS.html)
11
+ - Add lbfgs solver to [Ridge](https://yoshoku.github.io/rumale/doc/Rumale/LinearModel/Ridge.html) and [LinearRegression](https://yoshoku.github.io/rumale/doc/Rumale/LinearModel/LinearRegression.html).
12
+ - In version 0.23.0, these classes will be changed to attempt to optimize with 'svd' or 'lbfgs' solver if 'auto' is given to
13
+ the solver parameter. If you use 'sgd' solver, you need to specify it explicitly.
14
+ - Add GC guard to native extension codes.
15
+ - Update API documentation.
16
+
17
+ # 0.22.2
18
+ - Add classifier and regressor classes for stacking method.
19
+ - [StackingClassifier](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble/StackingClassifier.html)
20
+ - [StackingRegressor](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble/StackingRegressor.html)
21
+ - Refactor some code with Rubocop.
22
+
23
+ # 0.22.1
24
+ Add transformer class for [MLKR](https://yoshoku.github.io/rumale/doc/Rumale/MetricLearning/MLKR.html), that implements Metric Learning for Kernel Regression.
25
+ - Refactor NeighbourhoodComponentAnalysis.
26
+ - Update API documentation.
27
+
28
+ # 0.22.0
29
+ ## Breaking change
30
+ - Add lbfgsb.rb gem to runtime dependencies. Rumale uses lbfgsb gem for optimization.
31
+ This eliminates the need to require the mopti gem when using [NeighbourhoodComponentAnalysis](https://yoshoku.github.io/rumale/doc/Rumale/MetricLearning/NeighbourhoodComponentAnalysis.html).
32
+ - Add lbfgs solver to [LogisticRegression](https://yoshoku.github.io/rumale/doc/Rumale/LinearModel/LogisticRegression.html) and make it the default solver.
33
+
1
34
  # 0.21.0
2
35
  ## Breaking change
3
36
  - Change the default value of max_iter argument on LinearModel estimators to 1000.
data/Gemfile CHANGED
@@ -3,14 +3,15 @@ source 'https://rubygems.org'
3
3
  # Specify your gem's dependencies in rumale.gemspec
4
4
  gemspec
5
5
 
6
- gem 'coveralls', '~> 0.8'
7
6
  gem 'mmh3', '>= 1.0'
8
- gem 'mopti', '>= 0.1.0'
9
7
  gem 'numo-linalg', '>= 0.1.4'
10
8
  gem 'parallel', '>= 1.17.0'
11
- gem 'rake', '~> 12.0'
9
+ gem 'rake', '~> 13.0'
12
10
  gem 'rake-compiler', '~> 1.0'
13
11
  gem 'rspec', '~> 3.0'
14
- gem 'rubocop', '~> 0.91'
12
+ gem 'rubocop', '~> 1.0'
15
13
  gem 'rubocop-performance', '~> 1.8'
16
- gem 'rubocop-rspec', '~> 1.43'
14
+ gem 'rubocop-rake', '~> 0.5'
15
+ gem 'rubocop-rspec', '~> 2.0'
16
+ gem 'simplecov', '~> 0.21'
17
+ gem 'simplecov-lcov', '~> 0.8'
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2017-2020 Atsushi Tatsuma
1
+ Copyright (c) 2017-2021 Atsushi Tatsuma
2
2
  All rights reserved.
3
3
 
4
4
  Redistribution and use in source and binary forms, with or without
data/README.md CHANGED
@@ -2,10 +2,10 @@
2
2
 
3
3
  ![Rumale](https://dl.dropboxusercontent.com/s/joxruk2720ur66o/rumale_header_400.png)
4
4
 
5
- [![Build Status](https://travis-ci.org/yoshoku/rumale.svg?branch=master)](https://travis-ci.org/yoshoku/rumale)
6
- [![Coverage Status](https://coveralls.io/repos/github/yoshoku/rumale/badge.svg?branch=master)](https://coveralls.io/github/yoshoku/rumale?branch=master)
5
+ [![Build Status](https://github.com/yoshoku/rumale/workflows/build/badge.svg)](https://github.com/yoshoku/rumale/actions?query=workflow%3Abuild)
6
+ [![Coverage Status](https://coveralls.io/repos/github/yoshoku/rumale/badge.svg?branch=main)](https://coveralls.io/github/yoshoku/rumale?branch=main)
7
7
  [![Gem Version](https://badge.fury.io/rb/rumale.svg)](https://badge.fury.io/rb/rumale)
8
- [![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
8
+ [![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/LICENSE.txt)
9
9
  [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/)
10
10
 
11
11
  Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
@@ -114,10 +114,10 @@ require 'rumale'
114
114
  samples, labels = Rumale::Dataset.load_libsvm_file('pendigits')
115
115
 
116
116
  # Define the estimator to be evaluated.
117
- lr = Rumale::LinearModel::LogisticRegression.new(learning_rate: 0.00001, reg_param: 0.0001, random_seed: 1)
117
+ lr = Rumale::LinearModel::LogisticRegression.new
118
118
 
119
119
  # Define the evaluation measure, splitting strategy, and cross validation.
120
- ev = Rumale::EvaluationMeasure::LogLoss.new
120
+ ev = Rumale::EvaluationMeasure::Accuracy.new
121
121
  kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5, shuffle: true, random_seed: 1)
122
122
  cv = Rumale::ModelSelection::CrossValidation.new(estimator: lr, splitter: kf, evaluator: ev)
123
123
 
@@ -125,15 +125,15 @@ cv = Rumale::ModelSelection::CrossValidation.new(estimator: lr, splitter: kf, ev
125
125
  report = cv.perform(samples, labels)
126
126
 
127
127
  # Output result.
128
- mean_logloss = report[:test_score].inject(:+) / kf.n_splits
129
- puts("5-CV mean log-loss: %.3f" % mean_logloss)
128
+ mean_accuracy = report[:test_score].sum / kf.n_splits
129
+ puts "5-CV mean accuracy: %.1f%%" % (100.0 * mean_accuracy)
130
130
  ```
131
131
 
132
132
  Execution of the above scripts result in the following.
133
133
 
134
134
  ```bash
135
135
  $ ruby cross_validation.rb
136
- 5-CV mean log-loss: 0.355
136
+ 5-CV mean accuracy: 95.4%
137
137
  ```
138
138
 
139
139
  ### Example 3. Pipeline
@@ -144,10 +144,10 @@ require 'rumale'
144
144
  # Load dataset.
145
145
  samples, labels = Rumale::Dataset.load_libsvm_file('pendigits')
146
146
 
147
- # Construct pipeline with kernel approximation and SVC.
148
- rbf = Rumale::KernelApproximation::RBF.new(gamma: 0.0001, n_components: 800, random_seed: 1)
149
- svc = Rumale::LinearModel::SVC.new(reg_param: 0.0001, random_seed: 1)
150
- pipeline = Rumale::Pipeline::Pipeline.new(steps: { trns: rbf, clsf: svc })
147
+ # Construct pipeline with kernel approximation and LogisticRegression.
148
+ rbf = Rumale::KernelApproximation::RBF.new(gamma: 1e-4, n_components: 800, random_seed: 1)
149
+ lr = Rumale::LinearModel::LogisticRegression.new(reg_param: 1e-3)
150
+ pipeline = Rumale::Pipeline::Pipeline.new(steps: { trns: rbf, clsf: lr })
151
151
 
152
152
  # Define the splitting strategy and cross validation.
153
153
  kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5, shuffle: true, random_seed: 1)
@@ -157,7 +157,7 @@ cv = Rumale::ModelSelection::CrossValidation.new(estimator: pipeline, splitter:
157
157
  report = cv.perform(samples, labels)
158
158
 
159
159
  # Output result.
160
- mean_accuracy = report[:test_score].inject(:+) / kf.n_splits
160
+ mean_accuracy = report[:test_score].sum / kf.n_splits
161
161
  puts("5-CV mean accuracy: %.1f %%" % (mean_accuracy * 100.0))
162
162
  ```
163
163
 
@@ -177,7 +177,7 @@ For example, using the [OpenBLAS](https://github.com/xianyi/OpenBLAS) speeds up
177
177
 
178
178
  Install OpenBLAS library.
179
179
 
180
- Mac:
180
+ macOS:
181
181
 
182
182
  ```bash
183
183
  $ brew install openblas
@@ -186,12 +186,13 @@ $ brew install openblas
186
186
  Ubuntu:
187
187
 
188
188
  ```bash
189
- $ sudo apt-get install gcc gfortran
190
- $ wget https://github.com/xianyi/OpenBLAS/archive/v0.3.5.tar.gz
191
- $ tar xzf v0.3.5.tar.gz
192
- $ cd OpenBLAS-0.3.5
193
- $ make USE_OPENMP=1
194
- $ sudo make PREFIX=/usr/local install
189
+ $ sudo apt-get install libopenblas-dev liblapacke-dev
190
+ ```
191
+
192
+ Windows (MSYS2):
193
+
194
+ ```bash
195
+ $ pacman -S mingw-w64-x86_64-ruby mingw-w64-x86_64-openblas mingw-w64-x86_64-lapack
195
196
  ```
196
197
 
197
198
  Install Numo::Linalg gem.
@@ -207,6 +208,37 @@ require 'numo/linalg/autoloader'
207
208
  require 'rumale'
208
209
  ```
209
210
 
211
+ ### Numo::OpenBLAS
212
+ [Numo::OpenBLAS](https://github.com/yoshoku/numo-openblas) downloads and builds OpenBLAS during installation
213
+ and uses that as a background library for Numo::Linalg.
214
+
215
+ Install compilers for building OpenBLAS.
216
+
217
+ macOS:
218
+
219
+ ```bash
220
+ $ brew install gcc gfortran make
221
+ ```
222
+
223
+ Ubuntu:
224
+
225
+ ```bash
226
+ $ sudo apt-get install gcc gfortran make
227
+ ```
228
+
229
+ Install Numo::OpenBLAS gem.
230
+
231
+ ```bash
232
+ $ gem install numo-openblas
233
+ ```
234
+
235
+ Load Numo::OpenBLAS gem instead of Numo::Linalg.
236
+
237
+ ```ruby
238
+ require 'numo/openblas'
239
+ require 'rumale'
240
+ ```
241
+
210
242
  ### Parallel
211
243
  Several estimators in Rumale support parallel processing.
212
244
  Parallel processing in Rumale is realized by [Parallel](https://github.com/grosser/parallel) gem,
@@ -228,6 +260,10 @@ When -1 is given to n_jobs parameter, all processors are used.
228
260
  estimator = Rumale::Ensemble::RandomForestClassifier.new(n_jobs: -1, random_seed: 1)
229
261
  ```
230
262
 
263
+ ## Related Projects
264
+ - [Rumale::SVM](https://github.com/yoshoku/rumale-svm) provides support vector machine algorithms in LIBSVM and LIBLINEAR with Rumale interface.
265
+ - [Rumale::Torch](https://github.com/yoshoku/rumale-torch) provides the learning and inference by the neural network defined in torch.rb with Rumale interface.
266
+
231
267
  ## Novelties
232
268
 
233
269
  * [Rumale SHOP](https://suzuri.jp/yoshoku)
@@ -245,4 +281,4 @@ The gem is available as open source under the terms of the [BSD 2-clause License
245
281
  ## Code of Conduct
246
282
 
247
283
  Everyone interacting in the Rumale project’s codebases, issue trackers,
248
- chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/Rumale/blob/master/CODE_OF_CONDUCT.md).
284
+ chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/Rumale/blob/main/CODE_OF_CONDUCT.md).
data/ext/rumale/tree.c CHANGED
@@ -5,9 +5,8 @@ RUBY_EXTERN VALUE mRumale;
5
5
  double*
6
6
  alloc_dbl_array(const long n_dimensions)
7
7
  {
8
- long i;
9
8
  double* arr = ALLOC_N(double, n_dimensions);
10
- for (i = 0; i < n_dimensions; i++) { arr[i] = 0.0; }
9
+ memset(arr, 0, n_dimensions * sizeof(double));
11
10
  return arr;
12
11
  }
13
12
 
@@ -257,10 +256,13 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE impurity, VALUE order,
257
256
  split_opts_cls opts = { StringValuePtr(criterion), NUM2LONG(n_classes), NUM2DBL(impurity) };
258
257
  VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, labels);
259
258
  VALUE results = rb_ary_new2(4);
260
- rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
261
- rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
262
- rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
263
- rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
259
+ double* params_ptr = (double*)na_get_pointer_for_read(params);
260
+ rb_ary_store(results, 0, DBL2NUM(params_ptr[0]));
261
+ rb_ary_store(results, 1, DBL2NUM(params_ptr[1]));
262
+ rb_ary_store(results, 2, DBL2NUM(params_ptr[2]));
263
+ rb_ary_store(results, 3, DBL2NUM(params_ptr[3]));
264
+ RB_GC_GUARD(params);
265
+ RB_GC_GUARD(criterion);
264
266
  return results;
265
267
  }
266
268
 
@@ -375,10 +377,13 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE impurity, VALUE order,
375
377
  split_opts_reg opts = { StringValuePtr(criterion), NUM2DBL(impurity) };
376
378
  VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, targets);
377
379
  VALUE results = rb_ary_new2(4);
378
- rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
379
- rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
380
- rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
381
- rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
380
+ double* params_ptr = (double*)na_get_pointer_for_read(params);
381
+ rb_ary_store(results, 0, DBL2NUM(params_ptr[0]));
382
+ rb_ary_store(results, 1, DBL2NUM(params_ptr[1]));
383
+ rb_ary_store(results, 2, DBL2NUM(params_ptr[2]));
384
+ rb_ary_store(results, 3, DBL2NUM(params_ptr[3]));
385
+ RB_GC_GUARD(params);
386
+ RB_GC_GUARD(criterion);
382
387
  return results;
383
388
  }
384
389
 
@@ -464,8 +469,10 @@ find_split_params_grad_reg
464
469
  double opts[3] = { NUM2DBL(sum_gradient), NUM2DBL(sum_hessian), NUM2DBL(reg_lambda) };
465
470
  VALUE params = na_ndloop3(&ndf, opts, 4, order, features, gradients, hessians);
466
471
  VALUE results = rb_ary_new2(2);
467
- rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
468
- rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
472
+ double* params_ptr = (double*)na_get_pointer_for_read(params);
473
+ rb_ary_store(results, 0, DBL2NUM(params_ptr[0]));
474
+ rb_ary_store(results, 1, DBL2NUM(params_ptr[1]));
475
+ RB_GC_GUARD(params);
469
476
  return results;
470
477
  }
471
478
 
@@ -497,6 +504,9 @@ node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_,
497
504
 
498
505
  xfree(histogram);
499
506
 
507
+ RB_GC_GUARD(y_nary);
508
+ RB_GC_GUARD(criterion);
509
+
500
510
  return ret;
501
511
  }
502
512
 
@@ -531,6 +541,8 @@ node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
531
541
 
532
542
  xfree(sum_vec);
533
543
 
544
+ RB_GC_GUARD(criterion);
545
+
534
546
  return ret;
535
547
  }
536
548
 
data/lib/rumale.rb CHANGED
@@ -30,6 +30,7 @@ require 'rumale/linear_model/linear_regression'
30
30
  require 'rumale/linear_model/ridge'
31
31
  require 'rumale/linear_model/lasso'
32
32
  require 'rumale/linear_model/elastic_net'
33
+ require 'rumale/linear_model/nnls'
33
34
  require 'rumale/kernel_machine/kernel_svc'
34
35
  require 'rumale/kernel_machine/kernel_pca'
35
36
  require 'rumale/kernel_machine/kernel_fda'
@@ -59,6 +60,10 @@ require 'rumale/ensemble/random_forest_classifier'
59
60
  require 'rumale/ensemble/random_forest_regressor'
60
61
  require 'rumale/ensemble/extra_trees_classifier'
61
62
  require 'rumale/ensemble/extra_trees_regressor'
63
+ require 'rumale/ensemble/stacking_classifier'
64
+ require 'rumale/ensemble/stacking_regressor'
65
+ require 'rumale/ensemble/voting_classifier'
66
+ require 'rumale/ensemble/voting_regressor'
62
67
  require 'rumale/clustering/k_means'
63
68
  require 'rumale/clustering/mini_batch_k_means'
64
69
  require 'rumale/clustering/k_medoids'
@@ -77,6 +82,7 @@ require 'rumale/manifold/tsne'
77
82
  require 'rumale/manifold/mds'
78
83
  require 'rumale/metric_learning/fisher_discriminant_analysis'
79
84
  require 'rumale/metric_learning/neighbourhood_component_analysis'
85
+ require 'rumale/metric_learning/mlkr'
80
86
  require 'rumale/neural_network/adam'
81
87
  require 'rumale/neural_network/base_mlp'
82
88
  require 'rumale/neural_network/mlp_regressor'
@@ -11,13 +11,15 @@ module Rumale
11
11
 
12
12
  private
13
13
 
14
- def enable_linalg?
14
+ def enable_linalg?(warning: true)
15
15
  if defined?(Numo::Linalg).nil?
16
- warn('If you want to use features that depend on Numo::Linalg, you should install and load Numo::Linalg in advance.')
16
+ warn('If you want to use features that depend on Numo::Linalg, you should install and load Numo::Linalg in advance.') if warning
17
17
  return false
18
18
  end
19
19
  if Numo::Linalg::VERSION < '0.1.4'
20
- warn('The loaded Numo::Linalg does not implement the methods required by Rumale. Please load Numo::Linalg version 0.1.4 or later.')
20
+ if warning
21
+ warn('The loaded Numo::Linalg does not implement the methods required by Rumale. Please load Numo::Linalg version 0.1.4 or later.')
22
+ end
21
23
  return false
22
24
  end
23
25
  true
@@ -59,7 +59,7 @@ module Rumale
59
59
  @params[:solver] = if solver == 'auto'
60
60
  load_linalg? ? 'evd' : 'fpt'
61
61
  else
62
- solver != 'evd' ? 'fpt' : 'evd'
62
+ solver != 'evd' ? 'fpt' : 'evd' # rubocop:disable Style/NegatedIfElseCondition
63
63
  end
64
64
  @params[:n_components] = n_components
65
65
  @params[:max_iter] = max_iter
@@ -0,0 +1,215 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/classifier'
5
+ require 'rumale/preprocessing/label_encoder'
6
+
7
+ module Rumale
8
+ module Ensemble
9
+ # StackingClassifier is a class that implements classifier with stacking method.
10
+ #
11
+ # @example
12
+ # estimators = {
13
+ # lgr: Rumale::LinearModel::LogisticRegression.new(reg_param: 1e-2, random_seed: 1),
14
+ # mlp: Rumale::NeuralNetwork::MLPClassifier.new(hidden_units: [256], random_seed: 1),
15
+ # rnd: Rumale::Ensemble::RandomForestClassifier.new(random_seed: 1)
16
+ # }
17
+ # meta_estimator = Rumale::LinearModel::LogisticRegression.new(random_seed: 1)
18
+ # classifier = Rumale::Ensemble::StackingClassifier.new(
19
+ # estimators: estimators, meta_estimator: meta_estimator, random_seed: 1
20
+ # )
21
+ # classifier.fit(training_samples, training_labels)
22
+ # results = classifier.predict(testing_samples)
23
+ #
24
+ # *Reference*
25
+ # - Zhou, Z-H., "Ensemble Methods - Foundations and Algorithms," CRC Press Taylor and Francis Group, Chapman and Hall/CRC, 2012.
26
+ class StackingClassifier
27
+ include Base::BaseEstimator
28
+ include Base::Classifier
29
+
30
+ # Return the base classifiers.
31
+ # @return [Hash<Symbol,Classifier>]
32
+ attr_reader :estimators
33
+
34
+ # Return the meta classifier.
35
+ # @return [Classifier]
36
+ attr_reader :meta_estimator
37
+
38
+ # Return the class labels.
39
+ # @return [Numo::Int32] (size: n_classes)
40
+ attr_reader :classes
41
+
42
+ # Return the method used by each base classifier.
43
+ # @return [Hash<Symbol,Symbol>]
44
+ attr_reader :stack_method
45
+
46
+ # Create a new classifier with stacking method.
47
+ #
48
+ # @param estimators [Hash<Symbol,Classifier>] The base classifiers for extracting meta features.
49
+ # @param meta_estimator [Classifier/Nil] The meta classifier that predicts class label.
50
+ # If nil is given, LogisticRegression is used.
51
+ # @param n_splits [Integer] The number of folds for cross validation with stratified k-fold on meta feature extraction in training phase.
52
+ # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset on cross validation.
53
+ # @param stack_method [String] The method name of base classifier for using meta feature extraction.
54
+ # If 'auto' is given, it searches the callable method in the order 'predict_proba', 'decision_function', and 'predict'
55
+ # on each classifier.
56
+ # @param passthrough [Boolean] The flag indicating whether to concatenate the original features and meta features when training the meta classifier.
57
+ # @param random_seed [Integer/Nil] The seed value using to initialize the random generator on cross validation.
58
+ def initialize(estimators:, meta_estimator: nil, n_splits: 5, shuffle: true, stack_method: 'auto', passthrough: false, random_seed: nil)
59
+ check_params_type(Hash, estimators: estimators)
60
+ check_params_numeric(n_splits: n_splits)
61
+ check_params_string(stack_method: stack_method)
62
+ check_params_boolean(shuffle: shuffle, passthrough: passthrough)
63
+ check_params_numeric_or_nil(random_seed: random_seed)
64
+ @estimators = estimators
65
+ @meta_estimator = meta_estimator || Rumale::LinearModel::LogisticRegression.new
66
+ @classes = nil
67
+ @stack_method = nil
68
+ @output_size = nil
69
+ @params = {}
70
+ @params[:n_splits] = n_splits
71
+ @params[:shuffle] = shuffle
72
+ @params[:stack_method] = stack_method
73
+ @params[:passthrough] = passthrough
74
+ @params[:random_seed] = random_seed || srand
75
+ end
76
+
77
+ # Fit the model with given training data.
78
+ #
79
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
80
+ # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
81
+ # @return [StackingClassifier] The learned classifier itself.
82
+ def fit(x, y)
83
+ x = check_convert_sample_array(x)
84
+ y = check_convert_label_array(y)
85
+ check_sample_label_size(x, y)
86
+
87
+ n_samples, n_features = x.shape
88
+
89
+ @encoder = Rumale::Preprocessing::LabelEncoder.new
90
+ y_encoded = @encoder.fit_transform(y)
91
+ @classes = Numo::NArray[*@encoder.classes]
92
+
93
+ # training base classifiers with all training data.
94
+ @estimators.each_key { |name| @estimators[name].fit(x, y_encoded) }
95
+
96
+ # detecting feature extraction method and its size of output for each base classifier.
97
+ @stack_method = detect_stack_method
98
+ @output_size = detect_output_size(n_features)
99
+
100
+ # extracting meta features with base classifiers.
101
+ n_components = @output_size.values.inject(:+)
102
+ z = Numo::DFloat.zeros(n_samples, n_components)
103
+
104
+ kf = Rumale::ModelSelection::StratifiedKFold.new(
105
+ n_splits: @params[:n_splits], shuffle: @params[:shuffle], random_seed: @params[:random_seed]
106
+ )
107
+
108
+ kf.split(x, y_encoded).each do |train_ids, valid_ids|
109
+ x_train = x[train_ids, true]
110
+ y_train = y_encoded[train_ids]
111
+ x_valid = x[valid_ids, true]
112
+ f_start = 0
113
+ @estimators.each_key do |name|
114
+ est_fold = Marshal.load(Marshal.dump(@estimators[name]))
115
+ f_last = f_start + @output_size[name]
116
+ f_position = @output_size[name] == 1 ? f_start : f_start...f_last
117
+ z[valid_ids, f_position] = est_fold.fit(x_train, y_train).public_send(@stack_method[name], x_valid)
118
+ f_start = f_last
119
+ end
120
+ end
121
+
122
+ # concatenating original features.
123
+ z = Numo::NArray.hstack([z, x]) if @params[:passthrough]
124
+
125
+ # training meta classifier.
126
+ @meta_estimator.fit(z, y_encoded)
127
+
128
+ self
129
+ end
130
+
131
+ # Calculate confidence scores for samples.
132
+ #
133
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
134
+ # @return [Numo::DFloat] (shape: [n_samples, n_classes]) The confidence score per sample.
135
+ def decision_function(x)
136
+ x = check_convert_sample_array(x)
137
+ z = transform(x)
138
+ @meta_estimator.decision_function(z)
139
+ end
140
+
141
+ # Predict class labels for samples.
142
+ #
143
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
144
+ # @return [Numo::Int32] (shape: [n_samples]) The predicted class label per sample.
145
+ def predict(x)
146
+ x = check_convert_sample_array(x)
147
+ z = transform(x)
148
+ Numo::Int32.cast(@encoder.inverse_transform(@meta_estimator.predict(z)))
149
+ end
150
+
151
+ # Predict probability for samples.
152
+ #
153
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
154
+ # @return [Numo::DFloat] (shape: [n_samples, n_classes]) The predicted probability of each class per sample.
155
+ def predict_proba(x)
156
+ x = check_convert_sample_array(x)
157
+ z = transform(x)
158
+ @meta_estimator.predict_proba(z)
159
+ end
160
+
161
+ # Transform the given data with the learned model.
162
+ #
163
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed with the learned model.
164
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The meta features for samples.
165
+ def transform(x)
166
+ x = check_convert_sample_array(x)
167
+ n_samples = x.shape[0]
168
+ n_components = @output_size.values.inject(:+)
169
+ z = Numo::DFloat.zeros(n_samples, n_components)
170
+ f_start = 0
171
+ @estimators.each_key do |name|
172
+ f_last = f_start + @output_size[name]
173
+ f_position = @output_size[name] == 1 ? f_start : f_start...f_last
174
+ z[true, f_position] = @estimators[name].public_send(@stack_method[name], x)
175
+ f_start = f_last
176
+ end
177
+ z = Numo::NArray.hstack([z, x]) if @params[:passthrough]
178
+ z
179
+ end
180
+
181
+ # Fit the model with training data, and then transform them with the learned model.
182
+ #
183
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
184
+ # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
185
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The meta features for training data.
186
+ def fit_transform(x, y)
187
+ x = check_convert_sample_array(x)
188
+ y = check_convert_label_array(y)
189
+ fit(x, y).transform(x)
190
+ end
191
+
192
+ private
193
+
194
+ STACK_METHODS = %i[predict_proba decision_function predict].freeze
195
+
196
+ private_constant :STACK_METHODS
197
+
198
+ def detect_stack_method
199
+ if @params[:stack_method] == 'auto'
200
+ @estimators.each_key.with_object({}) { |name, obj| obj[name] = STACK_METHODS.detect { |m| @estimators[name].respond_to?(m) } }
201
+ else
202
+ @estimators.each_key.with_object({}) { |name, obj| obj[name] = @params[:stack_method].to_sym }
203
+ end
204
+ end
205
+
206
+ def detect_output_size(n_features)
207
+ x_dummy = Numo::DFloat.new(2, n_features).rand
208
+ @estimators.each_key.with_object({}) do |name, obj|
209
+ output_dummy = @estimators[name].public_send(@stack_method[name], x_dummy)
210
+ obj[name] = output_dummy.ndim == 1 ? 1 : output_dummy.shape[1]
211
+ end
212
+ end
213
+ end
214
+ end
215
+ end