RubyGems - rumale - Versions diffs - 0.20.2 → 0.22.2 - Mend

rumale 0.20.2 → 0.22.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +4 -4
data/.github/workflows/build.yml +23 -0
data/.rubocop.yml +10 -0
data/CHANGELOG.md +26 -0
data/Gemfile +5 -2
data/README.md +17 -14
data/lib/rumale.rb +3 -0
data/lib/rumale/clustering/snn.rb +1 -1
data/lib/rumale/decomposition/pca.rb +1 -1
data/lib/rumale/ensemble/stacking_classifier.rb +214 -0
data/lib/rumale/ensemble/stacking_regressor.rb +163 -0
data/lib/rumale/evaluation_measure/roc_auc.rb +3 -0
data/lib/rumale/feature_extraction/feature_hasher.rb +1 -1
data/lib/rumale/feature_extraction/hash_vectorizer.rb +1 -1
data/lib/rumale/kernel_machine/kernel_svc.rb +4 -3
data/lib/rumale/linear_model/base_sgd.rb +1 -1
data/lib/rumale/linear_model/elastic_net.rb +2 -2
data/lib/rumale/linear_model/lasso.rb +2 -2
data/lib/rumale/linear_model/linear_regression.rb +3 -3
data/lib/rumale/linear_model/logistic_regression.rb +123 -35
data/lib/rumale/linear_model/ridge.rb +3 -3
data/lib/rumale/linear_model/svc.rb +6 -5
data/lib/rumale/linear_model/svr.rb +6 -5
data/lib/rumale/metric_learning/mlkr.rb +161 -0
data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +19 -48
data/lib/rumale/pairwise_metric.rb +1 -1
data/lib/rumale/pipeline/pipeline.rb +1 -1
data/lib/rumale/tree/base_decision_tree.rb +2 -9
data/lib/rumale/tree/gradient_tree_regressor.rb +3 -10
data/lib/rumale/validation.rb +1 -1
data/lib/rumale/version.rb +1 -1
data/rumale.gemspec +2 -1
metadata +25 -8
data/.coveralls.yml +0 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5d8c93acbf38fbd07e5df224010abbdd4269a6ce3bbf8112a0eba652a606785d
-  data.tar.gz: e7cb00a802420854835c92f011425f3054bfcc1052bf7b3664da1f95834ef435
+  metadata.gz: 703a6895f4218ca45c5d5ae5e86559b077cf1be213d4939eb1e9ab94eac4621d
+  data.tar.gz: 5862466e565d1e6030c35494b5028ae980a47d373e90050c62266055fcecd374
 SHA512:
-  metadata.gz: f95fdd89b84dad02e516ee0479b1cddfb101cb96de897b6e7fa3fba546272a243cff5cfe954cb51942ec1ab23cf3028b183db86b52fab00a35d15be7eee5bf92
-  data.tar.gz: e5f6235e88dd47b9002a2154cabd2c1e64afb6cbb5b0745b411c7e5559351e925c9db8ec332724e301b83215662b3582e79a9e997f0338846514b234dabf1fc3
+  metadata.gz: 988d55c681a102e0c65b9133c6aeafc049e33755955f959d6e6046f5601dd192af881424355a2b373ed2e7a5a16b74236698aef5372e09584b10fe28d1b7bc21
+  data.tar.gz: adc58efa3b46d9fc1a87ddb2a4df32472507d61f21a3a0eb07026068cc5e41af166fb0a0f8ae23f1b23aec649b22835a50edbed79d35255e8cc231b82b31eb8c

data/.github/workflows/build.yml ADDED

@@ -0,0 +1,23 @@
+name: build
+on: [push, pull_request]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        ruby: [ '2.5', '2.6', '2.7' ]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Install BLAS and LAPACK
+        run: sudo apt-get install -y libopenblas-dev liblapacke-dev
+      - name: Set up Ruby ${{ matrix.ruby }}
+        uses: actions/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: Build and test with Rake
+        run: |
+          gem install bundler
+          bundle install --jobs 4 --retry 3
+          bundle exec rake

data/.rubocop.yml CHANGED

@@ -1,5 +1,6 @@
 require:
   - rubocop-performance
+  - rubocop-rake
   - rubocop-rspec
 AllCops:
@@ -20,6 +21,9 @@ Layout/LineLength:
   Max: 145
   IgnoredPatterns: ['(\A|\s)#']
+Lint/ConstantDefinitionInBlock:
+  Enabled: false
 Lint/MissingSuper:
   Enabled: false
@@ -70,6 +74,9 @@ Style/StringConcatenation:
 RSpec/MultipleExpectations:
   Enabled: false
+RSpec/MultipleMemoizedHelpers:
+  Max: 25
 RSpec/NestedGroups:
   Max: 4
@@ -81,3 +88,6 @@ RSpec/InstanceVariable:
 RSpec/LeakyConstantDeclaration:
   Enabled: false
+Performance/Sum:
+  Enabled: false

data/CHANGELOG.md CHANGED

@@ -1,3 +1,29 @@
+# 0.22.2
+- Add classifier and regressor classes for stacking method.
+  - [StackingClassifier](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble/StackingClassifier.html)
+  - [StackingRegressor](https://yoshoku.github.io/rumale/doc/Rumale/Ensemble/StackingRegressor.html)
+- Refactor some codes with Rubocop.
+# 0.22.1
+- Add transformer class for [MLKR](https://yoshoku.github.io/rumale/doc/Rumale/MetricLearning/MLKR.html), that implements Metric Learning for Kernel Regression.
+- Refactor NeighbourhoodComponentAnalysis.
+- Update API documentation.
+# 0.22.0
+## Breaking change
+- Add lbfgsb.rb gem to runtime dependencies. Rumale uses lbfgsb gem for optimization.
+This eliminates the need to require the mopti gem when using [NeighbourhoodComponentAnalysis](https://yoshoku.github.io/rumale/doc/Rumale/MetricLearning/NeighbourhoodComponentAnalysis.html).
+- Add lbfgs solver to [LogisticRegression](https://yoshoku.github.io/rumale/doc/Rumale/LinearModel/LogisticRegression.html) and make it the default solver.
+# 0.21.0
+## Breaking change
+- Change the default value of max_iter argument on LinearModel estimators to 1000.
+# 0.20.3
+- Fix to use automatic solver of PCA in NeighbourhoodComponentAnalysis.
+- Refactor some codes with Rubocop.
+- Update README.
 # 0.20.2
 - Add cross-validator class for time-series data.
   - [TimeSeriesSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/TimeSeriesSplit.html)

data/Gemfile CHANGED

@@ -3,11 +3,14 @@ source 'https://rubygems.org'
 # Specify your gem's dependencies in rumale.gemspec
 gemspec
-gem 'coveralls', '~> 0.8'
 gem 'mmh3', '>= 1.0'
-gem 'mopti', '>= 0.1.0'
 gem 'numo-linalg', '>= 0.1.4'
 gem 'parallel', '>= 1.17.0'
 gem 'rake', '~> 12.0'
 gem 'rake-compiler', '~> 1.0'
 gem 'rspec', '~> 3.0'
+gem 'rubocop', '~> 1.0'
+gem 'rubocop-performance', '~> 1.8'
+gem 'rubocop-rake', '~> 0.5'
+gem 'rubocop-rspec', '~> 2.0'
+gem 'simplecov', '~> 0.19'

data/README.md CHANGED

@@ -2,10 +2,9 @@
 ![Rumale](https://dl.dropboxusercontent.com/s/joxruk2720ur66o/rumale_header_400.png)
-[![Build Status](https://travis-ci.org/yoshoku/rumale.svg?branch=master)](https://travis-ci.org/yoshoku/rumale)
-[![Coverage Status](https://coveralls.io/repos/github/yoshoku/rumale/badge.svg?branch=master)](https://coveralls.io/github/yoshoku/rumale?branch=master)
+[![Build Status](https://github.com/yoshoku/rumale/workflows/build/badge.svg)](https://github.com/yoshoku/rumale/actions?query=workflow%3Abuild)
 [![Gem Version](https://badge.fury.io/rb/rumale.svg)](https://badge.fury.io/rb/rumale)
-[![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
+[![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/LICENSE.txt)
 [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/)
 Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
@@ -114,10 +113,10 @@ require 'rumale'
 samples, labels = Rumale::Dataset.load_libsvm_file('pendigits')
 # Define the estimator to be evaluated.
-lr = Rumale::LinearModel::LogisticRegression.new(learning_rate: 0.00001, reg_param: 0.0001, random_seed: 1)
+lr = Rumale::LinearModel::LogisticRegression.new
 # Define the evaluation measure, splitting strategy, and cross validation.
-ev = Rumale::EvaluationMeasure::LogLoss.new
+ev = Rumale::EvaluationMeasure::Accuracy.new
 kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5, shuffle: true, random_seed: 1)
 cv = Rumale::ModelSelection::CrossValidation.new(estimator: lr, splitter: kf, evaluator: ev)
@@ -125,15 +124,15 @@ cv = Rumale::ModelSelection::CrossValidation.new(estimator: lr, splitter: kf, ev
 report = cv.perform(samples, labels)
 # Output result.
-mean_logloss = report[:test_score].inject(:+) / kf.n_splits
-puts("5-CV mean log-loss: %.3f" % mean_logloss)
+mean_accuracy = report[:test_score].sum / kf.n_splits
+puts "5-CV mean accuracy: %.1f%%" % (100.0 * mean_accuracy)
 ```
 Execution of the above script results in the following.
 ```bash
 $ ruby cross_validation.rb
-5-CV mean log-loss: 0.355
+5-CV mean accuracy: 95.4%
 ```
 ### Example 3. Pipeline
@@ -144,10 +143,10 @@ require 'rumale'
 # Load dataset.
 samples, labels = Rumale::Dataset.load_libsvm_file('pendigits')
-# Construct pipeline with kernel approximation and SVC.
-rbf = Rumale::KernelApproximation::RBF.new(gamma: 0.0001, n_components: 800, random_seed: 1)
-svc = Rumale::LinearModel::SVC.new(reg_param: 0.0001, random_seed: 1)
-pipeline = Rumale::Pipeline::Pipeline.new(steps: { trns: rbf, clsf: svc })
+# Construct pipeline with kernel approximation and LogisticRegression.
+rbf = Rumale::KernelApproximation::RBF.new(gamma: 1e-4, n_components: 800, random_seed: 1)
+lr = Rumale::LinearModel::LogisticRegression.new(reg_param: 1e-3)
+pipeline = Rumale::Pipeline::Pipeline.new(steps: { trns: rbf, clsf: lr })
 # Define the splitting strategy and cross validation.
 kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5, shuffle: true, random_seed: 1)
@@ -157,7 +156,7 @@ cv = Rumale::ModelSelection::CrossValidation.new(estimator: pipeline, splitter:
 report = cv.perform(samples, labels)
 # Output result.
-mean_accuracy = report[:test_score].inject(:+) / kf.n_splits
+mean_accuracy = report[:test_score].sum / kf.n_splits
 puts("5-CV mean accuracy: %.1f %%" % (mean_accuracy * 100.0))
 ```
@@ -228,6 +227,10 @@ When -1 is given to n_jobs parameter, all processors are used.
 estimator = Rumale::Ensemble::RandomForestClassifier.new(n_jobs: -1, random_seed: 1)
 ```
+## Novelties
+* [Rumale SHOP](https://suzuri.jp/yoshoku)
 ## Contributing
 Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/rumale.
@@ -241,4 +244,4 @@ The gem is available as open source under the terms of the [BSD 2-clause License
 ## Code of Conduct
 Everyone interacting in the Rumale project’s codebases, issue trackers,
-chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/Rumale/blob/master/CODE_OF_CONDUCT.md).
+chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/Rumale/blob/main/CODE_OF_CONDUCT.md).

data/lib/rumale.rb CHANGED

@@ -59,6 +59,8 @@ require 'rumale/ensemble/random_forest_classifier'
 require 'rumale/ensemble/random_forest_regressor'
 require 'rumale/ensemble/extra_trees_classifier'
 require 'rumale/ensemble/extra_trees_regressor'
+require 'rumale/ensemble/stacking_classifier'
+require 'rumale/ensemble/stacking_regressor'
 require 'rumale/clustering/k_means'
 require 'rumale/clustering/mini_batch_k_means'
 require 'rumale/clustering/k_medoids'
@@ -77,6 +79,7 @@ require 'rumale/manifold/tsne'
 require 'rumale/manifold/mds'
 require 'rumale/metric_learning/fisher_discriminant_analysis'
 require 'rumale/metric_learning/neighbourhood_component_analysis'
+require 'rumale/metric_learning/mlkr'
 require 'rumale/neural_network/adam'
 require 'rumale/neural_network/base_mlp'
 require 'rumale/neural_network/mlp_regressor'

data/lib/rumale/clustering/snn.rb CHANGED

@@ -51,7 +51,7 @@ module Rumale
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
       #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
       # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
-      def fit_predict(x)
+      def fit_predict(x) # rubocop:disable Lint/UselessMethodDefinition
         super
       end

data/lib/rumale/decomposition/pca.rb CHANGED

@@ -59,7 +59,7 @@ module Rumale
         @params[:solver] = if solver == 'auto'
                              load_linalg? ? 'evd' : 'fpt'
                            else
-                             solver != 'evd' ? 'fpt' : 'evd'
+                             solver != 'evd' ? 'fpt' : 'evd' # rubocop:disable Style/NegatedIfElseCondition
                            end
         @params[:n_components] = n_components
         @params[:max_iter] = max_iter

data/lib/rumale/ensemble/stacking_classifier.rb ADDED

@@ -0,0 +1,214 @@
+# frozen_string_literal: true
+require 'rumale/base/base_estimator'
+require 'rumale/base/classifier'
+module Rumale
+  module Ensemble
+    # StackingClassifier is a class that implements classifier with stacking method.
+    #
+    # @example
+    #   estimators = {
+    #     lgr: Rumale::LinearModel::LogisticRegression.new(reg_param: 1e-2, random_seed: 1),
+    #     mlp: Rumale::NeuralNetwork::MLPClassifier.new(hidden_units: [256], random_seed: 1),
+    #     rnd: Rumale::Ensemble::RandomForestClassifier.new(random_seed: 1)
+    #   }
+    #   meta_estimator = Rumale::LinearModel::LogisticRegression.new(random_seed: 1)
+    #   classifier = Rumale::Ensemble::StackingClassifier.new(
+    #     estimators: estimators, meta_estimator: meta_estimator, random_seed: 1
+    #   )
+    #   classifier.fit(training_samples, training_labels)
+    #   results = classifier.predict(testing_samples)
+    #
+    # *Reference*
+    # - Zhou, Z-H., "Ensemble Methods - Foundations and Algorithms," CRC Press Taylor and Francis Group, Chapman and Hall/CRC, 2012.
+    class StackingClassifier
+      include Base::BaseEstimator
+      include Base::Classifier
+      # Return the base classifiers.
+      # @return [Hash<Symbol,Classifier>]
+      attr_reader :estimators
+      # Return the meta classifier.
+      # @return [Classifier]
+      attr_reader :meta_estimator
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+      # Return the method used by each base classifier.
+      # @return [Hash<Symbol,Symbol>]
+      attr_reader :stack_method
+      # Create a new classifier with stacking method.
+      #
+      # @param estimators [Hash<Symbol,Classifier>] The base classifiers for extracting meta features.
+      # @param meta_estimator [Classifier/Nil] The meta classifier that predicts class label.
+      #   If nil is given, LogisticRegression is used.
+      # @param n_splits [Integer] The number of folds for cross validation with stratified k-fold on meta feature extraction in training phase.
+      # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset on cross validation.
+      # @param stack_method [String] The method name of base classifier for using meta feature extraction.
+      #   If 'auto' is given, it searches the callable method in the order 'predict_proba', 'decision_function', and 'predict'
+      #   on each classifier.
+      # @param passthrough [Boolean] The flag indicating whether to concatenate the original features and meta features when training the meta classifier.
+      # @param random_seed [Integer/Nil] The seed value using to initialize the random generator on cross validation.
+      def initialize(estimators:, meta_estimator: nil, n_splits: 5, shuffle: true, stack_method: 'auto', passthrough: false, random_seed: nil)
+        check_params_type(Hash, estimators: estimators)
+        check_params_numeric(n_splits: n_splits)
+        check_params_string(stack_method: stack_method)
+        check_params_boolean(shuffle: shuffle, passthrough: passthrough)
+        check_params_numeric_or_nil(random_seed: random_seed)
+        @estimators = estimators
+        @meta_estimator = meta_estimator || Rumale::LinearModel::LogisticRegression.new
+        @classes = nil
+        @stack_method = nil
+        @output_size = nil
+        @params = {}
+        @params[:n_splits] = n_splits
+        @params[:shuffle] = shuffle
+        @params[:stack_method] = stack_method
+        @params[:passthrough] = passthrough
+        @params[:random_seed] = random_seed || srand
+      end
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [StackingClassifier] The learned classifier itself.
+      def fit(x, y)
+        x = check_convert_sample_array(x)
+        y = check_convert_label_array(y)
+        check_sample_label_size(x, y)
+        n_samples, n_features = x.shape
+        @encoder = Rumale::Preprocessing::LabelEncoder.new
+        y_encoded = @encoder.fit_transform(y)
+        @classes = Numo::NArray[*@encoder.classes]
+        # training base classifiers with all training data.
+        @estimators.each_key { |name| @estimators[name].fit(x, y_encoded) }
+        # detecting feature extraction method and its size of output for each base classifier.
+        @stack_method = detect_stack_method
+        @output_size = detect_output_size(n_features)
+        # extracting meta features with base classifiers.
+        n_components = @output_size.values.inject(:+)
+        z = Numo::DFloat.zeros(n_samples, n_components)
+        kf = Rumale::ModelSelection::StratifiedKFold.new(
+          n_splits: @params[:n_splits], shuffle: @params[:shuffle], random_seed: @params[:random_seed]
+        )
+        kf.split(x, y_encoded).each do |train_ids, valid_ids|
+          x_train = x[train_ids, true]
+          y_train = y_encoded[train_ids]
+          x_valid = x[valid_ids, true]
+          f_start = 0
+          @estimators.each_key do |name|
+            est_fold = Marshal.load(Marshal.dump(@estimators[name]))
+            f_last = f_start + @output_size[name]
+            f_position = @output_size[name] == 1 ? f_start : f_start...f_last
+            z[valid_ids, f_position] = est_fold.fit(x_train, y_train).public_send(@stack_method[name], x_valid)
+            f_start = f_last
+          end
+        end
+        # concatenating original features.
+        z = Numo::NArray.hstack([z, x]) if @params[:passthrough]
+        # training meta classifier.
+        @meta_estimator.fit(z, y_encoded)
+        self
+      end
+      # Calculate confidence scores for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) The confidence score per sample.
+      def decision_function(x)
+        x = check_convert_sample_array(x)
+        z = transform(x)
+        @meta_estimator.decision_function(z)
+      end
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) The predicted class label per sample.
+      def predict(x)
+        x = check_convert_sample_array(x)
+        z = transform(x)
+        Numo::Int32.cast(@encoder.inverse_transform(@meta_estimator.predict(z)))
+      end
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) The predicted probability of each class per sample.
+      def predict_proba(x)
+        x = check_convert_sample_array(x)
+        z = transform(x)
+        @meta_estimator.predict_proba(z)
+      end
+      # Transform the given data with the learned model.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed with the learned model.
+      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The meta features for samples.
+      def transform(x)
+        x = check_convert_sample_array(x)
+        n_samples = x.shape[0]
+        n_components = @output_size.values.inject(:+)
+        z = Numo::DFloat.zeros(n_samples, n_components)
+        f_start = 0
+        @estimators.each_key do |name|
+          f_last = f_start + @output_size[name]
+          f_position = @output_size[name] == 1 ? f_start : f_start...f_last
+          z[true, f_position] = @estimators[name].public_send(@stack_method[name], x)
+          f_start = f_last
+        end
+        z = Numo::NArray.hstack([z, x]) if @params[:passthrough]
+        z
+      end
+      # Fit the model with training data, and then transform them with the learned model.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The meta features for training data.
+      def fit_transform(x, y)
+        x = check_convert_sample_array(x)
+        y = check_convert_label_array(y)
+        fit(x, y).transform(x)
+      end
+      private
+      STACK_METHODS = %i[predict_proba decision_function predict].freeze
+      private_constant :STACK_METHODS
+      def detect_stack_method
+        if @params[:stack_method] == 'auto'
+          @estimators.each_key.with_object({}) { |name, obj| obj[name] = STACK_METHODS.detect { |m| @estimators[name].respond_to?(m) } }
+        else
+          @estimators.each_key.with_object({}) { |name, obj| obj[name] = @params[:stack_method].to_sym }
+        end
+      end
+      def detect_output_size(n_features)
+        x_dummy = Numo::DFloat.new(2, n_features).rand
+        @estimators.each_key.with_object({}) do |name, obj|
+          output_dummy = @estimators[name].public_send(@stack_method[name], x_dummy)
+          obj[name] = output_dummy.ndim == 1 ? 1 : output_dummy.shape[1]
+        end
+      end
+    end
+  end
+end