svmkit 0.7.2 → 0.7.3

This diff shows the changes between two publicly released versions of the package, as they appear in its public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f20192e678f6f066eb1d40c066f0e9a0efefd3a0
-  data.tar.gz: 1be802cdbbfb2ee7a641fb78d1409c2ee49b8450
+  metadata.gz: ca1916101dd6c77c5be1a157c2bfa8dafe9c543e
+  data.tar.gz: c4751b21fd3d0667bb7d378f8b524fc2f70069d9
 SHA512:
-  metadata.gz: 43471c5a4ef290781d5d2270732313fbcffba60a4351805d6c7bb8abec7537bcd8ac50260600fbfb1ff52c947c45c3f6f19b9ccecd47e6015e6ac45da5c855a6
-  data.tar.gz: 908f675396a2da835b82da8cf117a4a17d6d90d489618cf110e993de6c03d6ec8e6651115df333033314b0f54c1e931f68da8ff541a1b5e22886741f48496259
+  metadata.gz: db878c8b28e88649fed654b292358c11ec91369cd52ec03e01d06d053fbeb90ebff87248be628c8ab081fd820c1460bb3783448242531ef5f30b4b06337af87c
+  data.tar.gz: bfbfc580897a4a3161afa865cd14ac15a0a322cf80ae728f7b70c8d45cb41ff4566b1f1e748e6c1eb9412a4c401b737a569bf6b9af12a113a4ca4a6d75f8b9b8
data/HISTORY.md CHANGED
@@ -1,6 +1,10 @@
+# 0.7.3
+- Add class for grid search performing hyperparameter optimization.
+- Add argument validations to Pipeline.
+
 # 0.7.2
 - Add class for Pipeline that constructs chain of transformers and estimators.
-- Fix some typos on document.
+- Fix some typos on document ([#1](https://github.com/yoshoku/SVMKit/pull/1)).

 # 0.7.1
 - Fix to use CSV class in parsing libsvm format file.
data/lib/svmkit.rb CHANGED
@@ -55,6 +55,7 @@ require 'svmkit/preprocessing/one_hot_encoder'
 require 'svmkit/model_selection/k_fold'
 require 'svmkit/model_selection/stratified_k_fold'
 require 'svmkit/model_selection/cross_validation'
+require 'svmkit/model_selection/grid_search_cv'
 require 'svmkit/evaluation_measure/accuracy'
 require 'svmkit/evaluation_measure/precision'
 require 'svmkit/evaluation_measure/recall'
data/lib/svmkit/ensemble/ada_boost_classifier.rb CHANGED
@@ -109,7 +109,7 @@ module SVMKit
           tree = Tree::DecisionTreeClassifier.new(
             criterion: @params[:criterion], max_depth: @params[:max_depth],
             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
-            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values::int_max)
+            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
           )
           tree.fit(x[ids, true], y[ids])
           # Calculate estimator error.
data/lib/svmkit/ensemble/ada_boost_regressor.rb CHANGED
@@ -111,7 +111,7 @@ module SVMKit
           tree = Tree::DecisionTreeRegressor.new(
             criterion: @params[:criterion], max_depth: @params[:max_depth],
             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
-            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values::int_max)
+            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
           )
           tree.fit(x[ids, true], y[ids])
           p = tree.predict(x)
data/lib/svmkit/ensemble/random_forest_classifier.rb CHANGED
@@ -97,7 +97,7 @@ module SVMKit
           tree = Tree::DecisionTreeClassifier.new(
             criterion: @params[:criterion], max_depth: @params[:max_depth],
             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
-            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values::int_max)
+            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
           )
           bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
           tree.fit(x[bootstrap_ids, true], y[bootstrap_ids])
data/lib/svmkit/ensemble/random_forest_regressor.rb CHANGED
@@ -91,7 +91,7 @@ module SVMKit
           tree = Tree::DecisionTreeRegressor.new(
             criterion: @params[:criterion], max_depth: @params[:max_depth],
             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
-            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values::int_max)
+            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
           )
           bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
           tree.fit(x[bootstrap_ids, true], single_target ? y[bootstrap_ids] : y[bootstrap_ids, true])
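
The only change in these four ensemble classes is the call style for Values#int_max: in Ruby a module function can be invoked through `::` or `.`, both resolve to the same method, and the dot is the conventional form (`::` is normally reserved for namespace resolution, which is why RuboCop's Style/ColonMethodCall flags the old spelling). A minimal sketch of the distinction; the method body is illustrative, not SVMKit's actual definition:

    # A module_function is callable with either separator; the result is identical.
    module Values
      module_function

      def int_max
        2**31 - 1 # illustrative value only
      end
    end

    Values::int_max # => 2147483647, works but reads like a constant lookup
    Values.int_max  # => 2147483647, idiomatic method call
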
data/lib/svmkit/model_selection/grid_search_cv.rb ADDED
@@ -0,0 +1,247 @@
+# frozen_string_literal: true
+
+require 'svmkit/validation'
+require 'svmkit/base/base_estimator'
+require 'svmkit/base/evaluator'
+require 'svmkit/base/splitter'
+require 'svmkit/pipeline/pipeline'
+
+module SVMKit
+  module ModelSelection
+    # GridSearchCV is a class that performs hyperparameter optimization with grid search method.
+    #
+    # @example
+    #   rfc = SVMKit::Ensemble::RandomForestClassifier.new(random_seed: 1)
+    #   pg = { n_estimators: [5, 10], max_depth: [3, 5], max_leaf_nodes: [15, 31] }
+    #   kf = SVMKit::ModelSelection::StratifiedKFold.new(n_splits: 5)
+    #   gs = SVMKit::ModelSelection::GridSearchCV.new(estimator: rfc, param_grid: pg, splitter: kf)
+    #   gs.fit(samples, labels)
+    #   p gs.cv_results
+    #   p gs.best_params
+    #
+    # @example
+    #   rbf = SVMKit::KernelApproximation::RBF.new(random_seed: 1)
+    #   svc = SVMKit::LinearModel::SVC.new(random_seed: 1)
+    #   pipe = SVMKit::Pipeline::Pipeline.new(steps: { rbf: rbf, svc: svc })
+    #   pg = { rbf__gamma: [32.0, 1.0], rbf__n_components: [4, 128], svc__reg_param: [16.0, 0.1] }
+    #   kf = SVMKit::ModelSelection::StratifiedKFold.new(n_splits: 5)
+    #   gs = SVMKit::ModelSelection::GridSearchCV.new(estimator: pipe, param_grid: pg, splitter: kf)
+    #   gs.fit(samples, labels)
+    #   p gs.cv_results
+    #   p gs.best_params
+    #
+    class GridSearchCV
+      include Base::BaseEstimator
+      include Validation
+
+      # Return the result of cross validation for each parameter.
+      # @return [Hash]
+      attr_reader :cv_results
+
+      # Return the score of the estimator learned with the best parameter.
+      # @return [Float]
+      attr_reader :best_score
+
+      # Return the best parameter set.
+      # @return [Hash]
+      attr_reader :best_params
+
+      # Return the index of the best parameter.
+      # @return [Integer]
+      attr_reader :best_index
+
+      # Return the estimator learned with the best parameter.
+      # @return [Estimator]
+      attr_reader :best_estimator
+
+      # Create a new grid search method.
+      #
+      # @param estimator [Classifier/Regresor] The estimator to be searched for optimal parameters with grid search method.
+      # @param param_grid [Array<Hash>] The parameter sets is represented with array of hash that
+      #   consists of parameter names as keys and array of parameter values as values.
+      # @param splitter [Splitter] The splitter that divides dataset to training and testing dataset on cross validation.
+      # @param evaluator [Evaluator] The evaluator that calculates score of estimator results on cross validation.
+      #   If nil is given, the score method of estimator is used to evaluation.
+      # @param greater_is_better [Boolean] The flag that indicates whether the estimator is better as
+      #   evaluation score is larger.
+      def initialize(estimator: nil, param_grid: nil, splitter: nil, evaluator: nil, greater_is_better: true)
+        check_params_type(SVMKit::Base::BaseEstimator, estimator: estimator)
+        check_params_type(SVMKit::Base::Splitter, splitter: splitter)
+        check_params_type_or_nil(SVMKit::Base::Evaluator, evaluator: evaluator)
+        check_params_boolean(greater_is_better: greater_is_better)
+        @params = {}
+        @params[:param_grid] = valid_param_grid(param_grid)
+        @params[:estimator] = Marshal.load(Marshal.dump(estimator))
+        @params[:splitter] = Marshal.load(Marshal.dump(splitter))
+        @params[:evaluator] = Marshal.load(Marshal.dump(evaluator))
+        @params[:greater_is_better] = greater_is_better
+        @cv_results = nil
+        @best_score = nil
+        @best_params = nil
+        @best_index = nil
+        @best_estimator = nil
+      end
+
+      # Fit the model with given training data and all sets of parameters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) The target values or labels to be used for fitting the model.
+      # @return [GridSearchCV] The learned estimator with grid search.
+      def fit(x, y)
+        check_sample_array(x)
+
+        init_attrs
+
+        param_combinations.each do |prm_set|
+          prm_set.each do |prms|
+            report = perform_cross_validation(x, y, prms)
+            store_cv_result(prms, report)
+          end
+        end
+
+        find_best_params
+
+        @best_estimator = configurated_estimator(@best_params)
+        @best_estimator.fit(x, y)
+        self
+      end
+
+      # Call the decision_function method of learned estimator with the best parameter.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+      # @return [Numo::DFloat] (shape: [n_samples]) Confidence score per sample.
+      def decision_function(x)
+        check_sample_array(x)
+        @best_estimator.decision_function(x)
+      end
+
+      # Call the predict method of learned estimator with the best parameter.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to obtain prediction result.
+      # @return [Numo::NArray] Predicted results.
+      def predict(x)
+        check_sample_array(x)
+        @best_estimator.predict(x)
+      end
+
+      # Call the predict_log_proba method of learned estimator with the best parameter.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the log-probailities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted log-probability of each class per sample.
+      def predict_log_proba(x)
+        check_sample_array(x)
+        @best_estimator.predict_log_proba(x)
+      end
+
+      # Call the predict_proba method of learned estimator with the best parameter.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        check_sample_array(x)
+        @best_estimator.predict_proba(x)
+      end
+
+      # Call the score method of learned estimator with the best parameter.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
+      # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) True target values or labels for testing data.
+      # @return [Float] The score of estimator.
+      def score(x, y)
+        check_sample_array(x)
+        @best_estimator.score(x, y)
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about GridSearchCV.
+      def marshal_dump
+        { params: @params,
+          cv_results: @cv_results,
+          best_score: @best_score,
+          best_params: @best_params,
+          best_index: @best_index,
+          best_estimator: @best_estimator }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @cv_results = obj[:cv_results]
+        @best_score = obj[:best_score]
+        @best_params = obj[:best_params]
+        @best_index = obj[:best_index]
+        @best_estimator = obj[:best_estimator]
+        nil
+      end
+
+      private
+
+      def valid_param_grid(grid)
+        raise TypeError, 'Expect class of param_grid to be Hash or Array' unless grid.is_a?(Hash) || grid.is_a?(Array)
+        grid = [grid] if grid.is_a?(Hash)
+        grid.each do |h|
+          raise TypeError, 'Expect class of elements in param_grid to be Hash' unless h.is_a?(Hash)
+          raise TypeError, 'Expect class of parameter values in param_grid to be Array' unless h.values.all? { |v| v.is_a?(Array) }
+        end
+        grid
+      end
+
+      def param_combinations
+        @param_combinations ||= @params[:param_grid].map do |prm|
+          x = Hash[prm.sort].map { |k, v| [k].product(v) }
+          x[0].product(*x[1...x.size]).map { |v| Hash[v] }
+        end
+      end
+
+      def perform_cross_validation(x, y, prms)
+        est = configurated_estimator(prms)
+        cv = CrossValidation.new(estimator: est, splitter: @params[:splitter],
+                                 evaluator: @params[:evaluator], return_train_score: true)
+        cv.perform(x, y)
+      end
+
+      def configurated_estimator(prms)
+        estimator = Marshal.load(Marshal.dump(@params[:estimator]))
+        if @params[:estimator].is_a?(SVMKit::Pipeline::Pipeline)
+          prms.each do |k, v|
+            est_name, prm_name = k.to_s.split('__')
+            estimator.steps[est_name.to_sym].params[prm_name.to_sym] = v
+          end
+        else
+          prms.each { |k, v| estimator.params[k] = v }
+        end
+        estimator
+      end
+
+      def init_attrs
+        @cv_results = %i[mean_test_score std_test_score
+                         mean_train_score std_train_score
+                         mean_fit_time std_fit_time params].map { |v| [v, []] }.to_h
+        @best_score = nil
+        @best_params = nil
+        @best_index = nil
+        @best_estimator = nil
+      end
+
+      def store_cv_result(prms, report)
+        test_scores = Numo::DFloat[*report[:test_score]]
+        train_scores = Numo::DFloat[*report[:train_score]]
+        fit_times = Numo::DFloat[*report[:fit_time]]
+        @cv_results[:mean_test_score].push(test_scores.mean)
+        @cv_results[:std_test_score].push(test_scores.stddev)
+        @cv_results[:mean_train_score].push(train_scores.mean)
+        @cv_results[:std_train_score].push(train_scores.stddev)
+        @cv_results[:mean_fit_time].push(fit_times.mean)
+        @cv_results[:std_fit_time].push(fit_times.stddev)
+        @cv_results[:params].push(prms)
+      end
+
+      def find_best_params
+        @best_score = @params[:greater_is_better] ? @cv_results[:mean_test_score].max : @cv_results[:mean_test_score].min
+        @best_index = @cv_results[:mean_test_score].index(@best_score)
+        @best_params = @cv_results[:params][@best_index]
+      end
+    end
+  end
+end
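
Two details of the new class are easy to miss in the listing above. First, `param_combinations` expands each grid into the cartesian product of its value arrays; second, when the estimator is a Pipeline, `configurated_estimator` splits keys of the form `step__param` on `'__'` and routes the value to the named step. A standalone sketch of the expansion logic, using illustrative grid values:

    # Replicates GridSearchCV#param_combinations for a single grid.
    grid = { n_estimators: [5, 10], max_depth: [3, 5] }
    pairs = Hash[grid.sort].map { |k, v| [k].product(v) }
    combos = pairs[0].product(*pairs[1...pairs.size]).map { |v| Hash[v] }
    # combos => [{ max_depth: 3, n_estimators: 5 }, { max_depth: 3, n_estimators: 10 },
    #            { max_depth: 5, n_estimators: 5 }, { max_depth: 5, n_estimators: 10 }]

    # For a Pipeline, :rbf__gamma addresses parameter :gamma of step :rbf.
    'rbf__gamma'.split('__') # => ["rbf", "gamma"]
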
data/lib/svmkit/pipeline/pipeline.rb CHANGED
@@ -40,6 +40,7 @@ module SVMKit
      # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) The target values or labels to be used for fitting the model.
      # @return [Pipeline] The learned pipeline itself.
      def fit(x, y)
+       check_sample_array(x)
        trans_x = apply_transforms(x, y, fit: true)
        last_estimator.fit(trans_x, y) unless last_estimator.nil?
        self
@@ -51,6 +52,7 @@ module SVMKit
      # @param y [Numo::NArray] (shape: [n_samples, n_outputs], default: nil) The target values or labels to be used for fitting the model.
      # @return [Numo::NArray] The predicted results by last estimator.
      def fit_predict(x, y = nil)
+       check_sample_array(x)
        trans_x = apply_transforms(x, y, fit: true)
        last_estimator.fit_predict(trans_x)
      end
@@ -61,6 +63,7 @@ module SVMKit
      # @param y [Numo::NArray] (shape: [n_samples, n_outputs], default: nil) The target values or labels to be used for fitting the model.
      # @return [Numo::NArray] The predicted results by last estimator.
      def fit_transform(x, y = nil)
+       check_sample_array(x)
        trans_x = apply_transforms(x, y, fit: true)
        last_estimator.fit_transform(trans_x, y)
      end
@@ -70,6 +73,7 @@ module SVMKit
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
      # @return [Numo::DFloat] (shape: [n_samples]) Confidence score per sample.
      def decision_function(x)
+       check_sample_array(x)
        trans_x = apply_transforms(x)
        last_estimator.decision_function(trans_x)
      end
@@ -79,6 +83,7 @@ module SVMKit
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to obtain prediction result.
      # @return [Numo::NArray] The predicted results by last estimator.
      def predict(x)
+       check_sample_array(x)
        trans_x = apply_transforms(x)
        last_estimator.predict(trans_x)
      end
@@ -88,6 +93,7 @@ module SVMKit
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the log-probailities.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted log-probability of each class per sample.
      def predict_log_proba(x)
+       check_sample_array(x)
        trans_x = apply_transforms(x)
        last_estimator.predict_log_proba(trans_x)
      end
@@ -97,6 +103,7 @@ module SVMKit
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
      def predict_proba(x)
+       check_sample_array(x)
        trans_x = apply_transforms(x)
        last_estimator.predict_proba(trans_x)
      end
@@ -106,6 +113,7 @@ module SVMKit
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed samples.
      def transform(x)
+       check_sample_array(x)
        trans_x = apply_transforms(x)
        last_estimator.nil? ? trans_x : last_estimator.transform(trans_x)
      end
@@ -115,8 +123,9 @@ module SVMKit
      # @param z [Numo::DFloat] (shape: [n_samples, n_components]) The transformed samples to be restored into original space.
      # @return [Numo::DFloat] (shape: [n_samples, n_featuress]) The restored samples.
      def inverse_transform(z)
+       check_sample_array(z)
        itrans_z = z
-       @steps.keys.reverse.each do |name|
+       @steps.keys.reverse_each do |name|
          transformer = @steps[name]
          next if transformer.nil?
          itrans_z = transformer.inverse_transform(itrans_z)
@@ -130,6 +139,7 @@ module SVMKit
      # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) True target values or labels for testing data.
      # @return [Float] The score of last estimator
      def score(x, y)
+       check_sample_array(x)
        trans_x = apply_transforms(x)
        last_estimator.score(trans_x, y)
      end
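
Besides the added `check_sample_array` guards, `inverse_transform` now walks the steps with `reverse_each`, which iterates from the end without allocating the intermediate reversed array that `.reverse.each` creates. A minimal sketch with hypothetical step names:

    %i[scaler rbf svc].reverse_each { |name| puts name }
    # prints svc, rbf, scaler (same order as .reverse.each, one array allocation fewer)
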
@@ -6,7 +6,7 @@ module SVMKit
    module_function

    # @!visibility private
-   def choice_ids(size, probs, rng=nil)
+   def choice_ids(size, probs, rng = nil)
      rng ||= Random.new
      Array.new(size) do
        target = rng.rand
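
This hunk is only a spacing fix (`rng=nil` becomes `rng = nil`), and it truncates the method mid-body, but the visible lines set up a cumulative-probability walk over `probs`. A standalone sketch of that sampling pattern; the loop body below is an assumption for illustration, not copied from the file:

    # Draw `size` indices, choosing index i with probability probs[i]
    # (probs assumed to sum to 1), by walking the cumulative distribution.
    def choice_ids(size, probs, rng = nil)
      rng ||= Random.new
      Array.new(size) do
        target = rng.rand
        chosen = 0
        probs.each_with_index do |p, idx|
          chosen = idx
          break if target <= p
          target -= p
        end
        chosen
      end
    end

    choice_ids(4, [0.7, 0.2, 0.1]) # => e.g. [0, 0, 1, 0]
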
data/lib/svmkit/version.rb CHANGED
@@ -3,5 +3,5 @@
 # SVMKit is a machine learning library in Ruby.
 module SVMKit
   # @!visibility private
-  VERSION = '0.7.2'.freeze
+  VERSION = '0.7.3'.freeze
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: svmkit
 version: !ruby/object:Gem::Version
-  version: 0.7.2
+  version: 0.7.3
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-01-21 00:00:00.000000000 Z
+date: 2019-02-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -146,6 +146,7 @@ files:
 - lib/svmkit/linear_model/svc.rb
 - lib/svmkit/linear_model/svr.rb
 - lib/svmkit/model_selection/cross_validation.rb
+- lib/svmkit/model_selection/grid_search_cv.rb
 - lib/svmkit/model_selection/k_fold.rb
 - lib/svmkit/model_selection/stratified_k_fold.rb
 - lib/svmkit/multiclass/one_vs_rest_classifier.rb