svmkit 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.md +22 -0
- data/lib/svmkit.rb +1 -0
- data/lib/svmkit/linear_model/lasso.rb +14 -32
- data/lib/svmkit/linear_model/logistic_regression.rb +37 -36
- data/lib/svmkit/linear_model/ridge.rb +10 -32
- data/lib/svmkit/linear_model/svc.rb +40 -39
- data/lib/svmkit/linear_model/svr.rb +34 -31
- data/lib/svmkit/optimizer/nadam.rb +64 -0
- data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +53 -61
- data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb +30 -66
- data/lib/svmkit/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cef050a2ac6b55583414cb3ce9c3678dd6d2d1c8b2be04a249222683e10465e1
+  data.tar.gz: 7c67ab0e90246f1d9b7e5d0bfb19ed76061d0edf17a05014f521b8ef41e41aed
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 15341450f3bf3ca49901ae55b507d647468261682c7fdb0b058c21a470c2eec261718b6721ca0e2ad7738cfdabd184128a588d68ad6d079e53c9b1e916efa2b1
+  data.tar.gz: fd562db538be12896c005840e065f867e342691e899b33f0524a4db26da33439bfc174141e022d4de3d805657d09e854a4593b9b05b2d9eb99f6cd41da064a1d
data/HISTORY.md
CHANGED
@@ -1,3 +1,25 @@
+# 0.4.0
+## Breaking changes
+
+SVMKit introduces optimizer algorithm that calculates learning rates adaptively
+on each iteration of stochastic gradient descent (SGD).
+While Pegasos SGD runs fast, it sometimes fails to optimize complicated models
+like Factorization Machine.
+To solve this problem, in version 0.3.3, SVMKit introduced optimization with RMSProp on
+FactorizationMachineRegressor, Ridge and Lasso.
+This attempt realized stable optimization of those estimators.
+Following the success of the attempt, author decided to use modern optimizer algorithms
+with all SGD optimizations in SVMKit.
+Through some preliminary experiments, author implemented Nadam as the default optimizer.
+SVMKit plans to add other optimizer algorithms sequentially, so that users can select them.
+
+- Fix to use Nadam for optimization on SVC, SVR, LogisticRegression, Ridge, Lasso, and Factorization Machine estimators.
+- Combine reg_param_weight and reg_param_bias parameters on Factorization Machine estimators into the unified parameter named reg_param_linear.
+- Remove init_std paramter on Factorization Machine estimators.
+- Remove learning_rate, decay, and momentum parameters on Ridge, Lasso, and FactorizationMachineRegressor.
+- Remove normalize parameter on SVC, SVR, and LogisticRegression.
+
+
 # 0.3.3
 - Add class for Ridge regressor.
 - Add class for Lasso regressor.
data/lib/svmkit.rb
CHANGED
@@ -13,6 +13,7 @@ require 'svmkit/base/regressor'
 require 'svmkit/base/transformer'
 require 'svmkit/base/splitter'
 require 'svmkit/base/evaluator'
+require 'svmkit/optimizer/nadam'
 require 'svmkit/kernel_approximation/rbf'
 require 'svmkit/linear_model/svc'
 require 'svmkit/linear_model/svr'
data/lib/svmkit/linear_model/lasso.rb
CHANGED
@@ -3,6 +3,7 @@
 require 'svmkit/validation'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/regressor'
+require 'svmkit/optimizer/nadam'
 
 module SVMKit
   module LinearModel
@@ -11,15 +12,13 @@ module SVMKit
     #
     # @example
     #   estimator =
-    #     SVMKit::LinearModel::Lasso.new(reg_param: 0.1, max_iter:
+    #     SVMKit::LinearModel::Lasso.new(reg_param: 0.1, max_iter: 1000, batch_size: 20, random_seed: 1)
     #   estimator.fit(training_samples, traininig_values)
     #   results = estimator.predict(testing_samples)
     #
     # *Reference*
     # - S. Shalev-Shwartz and Y. Singer, "Pegasos: Primal Estimated sub-GrAdient SOlver for SVM," Proc. ICML'07, pp. 807--814, 2007.
     # - L. Bottou, "Large-Scale Machine Learning with Stochastic Gradient Descent," Proc. COMPSTAT'10, pp. 177--186, 2010.
-    # - I. Sutskever, J. Martens, G. Dahl, and G. Hinton, "On the importance of initialization and momentum in deep learning," Proc. ICML'13, pp. 1139--1147, 2013.
-    # - G. Hinton, N. Srivastava, and K. Swersky, "Lecture 6e rmsprop," Neural Networks for Machine Learning, 2012.
     class Lasso
       include Base::BaseEstimator
       include Base::Regressor
@@ -41,30 +40,23 @@ module SVMKit
       #
       # @param reg_param [Float] The regularization parameter.
       # @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
-      # @param learning_rate [Float] The learning rate for optimization.
-      # @param decay [Float] The discounting factor for RMS prop optimization.
-      # @param momentum [Float] The momentum for optimization.
       # @param max_iter [Integer] The maximum number of iterations.
       # @param batch_size [Integer] The size of the mini batches.
+      # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
+      #   Nadam is selected automatically on current version.
       # @param random_seed [Integer] The seed value using to initialize the random generator.
-      def initialize(reg_param: 1.0, fit_bias: false,
-        check_params_float(reg_param: reg_param,
-                           learning_rate: learning_rate, decay: decay, momentum: momentum)
+      def initialize(reg_param: 1.0, fit_bias: false, max_iter: 1000, batch_size: 10, optimizer: nil, random_seed: nil)
+        check_params_float(reg_param: reg_param)
         check_params_integer(max_iter: max_iter, batch_size: batch_size)
         check_params_boolean(fit_bias: fit_bias)
         check_params_type_or_nil(Integer, random_seed: random_seed)
-        check_params_positive(reg_param: reg_param,
-                              learning_rate: learning_rate, decay: decay, momentum: momentum,
-                              max_iter: max_iter, batch_size: batch_size)
+        check_params_positive(reg_param: reg_param, max_iter: max_iter, batch_size: batch_size)
         @params = {}
         @params[:reg_param] = reg_param
         @params[:fit_bias] = fit_bias
-        @params[:learning_rate] = learning_rate
-        @params[:decay] = decay
-        @params[:momentum] = momentum
         @params[:max_iter] = max_iter
         @params[:batch_size] = batch_size
+        @params[:optimizer] = optimizer
         @params[:random_seed] = random_seed
         @params[:random_seed] ||= srand
         @weight_vec = nil
@@ -138,11 +130,9 @@ module SVMKit
         rand_ids = [*0...n_samples].shuffle(random: @rng)
         weight_vec = Numo::DFloat.zeros(n_features)
         left_weight_vec = Numo::DFloat.zeros(n_features)
-        left_weight_sqrsum = Numo::DFloat.zeros(n_features)
-        left_weight_update = Numo::DFloat.zeros(n_features)
         right_weight_vec = Numo::DFloat.zeros(n_features)
+        left_optimizer = Optimizer::Nadam.new
+        right_optimizer = Optimizer::Nadam.new
         # Start optimization.
         @params[:max_iter].times do |_t|
           # Random sampling.
@@ -154,12 +144,8 @@ module SVMKit
           loss_grad = loss_gradient(data, values, weight_vec)
           next if loss_grad.ne(0.0).count.zero?
           # Update weight.
-          left_weight_vec,
-                          left_weight_gradient(loss_grad, data))
-          right_weight_vec, right_weight_sqrsum, right_weight_update =
-            update_weight(right_weight_vec, right_weight_sqrsum, right_weight_update,
-                          right_weight_gradient(loss_grad, data))
+          left_weight_vec = round_weight(left_optimizer.call(left_weight_vec, left_weight_gradient(loss_grad, data)))
+          right_weight_vec = round_weight(right_optimizer.call(right_weight_vec, right_weight_gradient(loss_grad, data)))
           weight_vec = left_weight_vec - right_weight_vec
         end
         split_weight_vec_bias(weight_vec)
@@ -177,12 +163,8 @@ module SVMKit
         ((@params[:reg_param] - loss_grad).expand_dims(1) * data).mean(0)
       end
 
-      def
-        new_update = (@params[:learning_rate] / ((new_sqrsum + 1.0e-8)**0.5)) * gr
-        new_weight = weight - (new_update + @params[:momentum] * update)
-        new_weight = 0.5 * (new_weight + new_weight.abs)
-        [new_weight, new_sqrsum, new_update]
+      def round_weight(weight)
+        0.5 * (weight + weight.abs)
       end
 
       def expand_feature(x)
data/lib/svmkit/linear_model/logistic_regression.rb
CHANGED
@@ -3,25 +3,26 @@
 require 'svmkit/validation'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/classifier'
+require 'svmkit/optimizer/nadam'
 
 module SVMKit
-  # This module consists of the classes that implement generalized linear models.
   module LinearModel
     # LogisticRegression is a class that implements Logistic Regression
-    # with stochastic gradient descent
+    # with mini-batch stochastic gradient descent optimization.
     # For multiclass classification problem, it uses one-vs-the-rest strategy.
     #
     # @example
     #   estimator =
-    #     SVMKit::LinearModel::LogisticRegression.new(reg_param: 1.0, max_iter:
+    #     SVMKit::LinearModel::LogisticRegression.new(reg_param: 1.0, max_iter: 1000, batch_size: 20, random_seed: 1)
     #   estimator.fit(training_samples, traininig_labels)
     #   results = estimator.predict(testing_samples)
     #
     # *Reference*
-    #
+    # - S. Shalev-Shwartz, Y. Singer, N. Srebro, and A. Cotter, "Pegasos: Primal Estimated sub-GrAdient SOlver for SVM," Mathematical Programming, vol. 127 (1), pp. 3--30, 2011.
     class LogisticRegression
       include Base::BaseEstimator
       include Base::Classifier
+      include Validation
 
       # Return the weight vector for Logistic Regression.
       # @return [Numo::DFloat] (shape: [n_classes, n_features])
@@ -47,23 +48,23 @@ module SVMKit
       #   If fit_bias is true, the feature vector v becoms [v; bias_scale].
       # @param max_iter [Integer] The maximum number of iterations.
       # @param batch_size [Integer] The size of the mini batches.
-      # @param
+      # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
+      #   Nadam is selected automatically on current version.
       # @param random_seed [Integer] The seed value using to initialize the random generator.
       def initialize(reg_param: 1.0, fit_bias: false, bias_scale: 1.0,
-                     max_iter:
-                     batch_size: batch_size)
+                     max_iter: 1000, batch_size: 20, optimizer: nil, random_seed: nil)
+        check_params_float(reg_param: reg_param, bias_scale: bias_scale)
+        check_params_integer(max_iter: max_iter, batch_size: batch_size)
+        check_params_boolean(fit_bias: fit_bias)
+        check_params_type_or_nil(Integer, random_seed: random_seed)
+        check_params_positive(reg_param: reg_param, bias_scale: bias_scale, max_iter: max_iter, batch_size: batch_size)
         @params = {}
         @params[:reg_param] = reg_param
         @params[:fit_bias] = fit_bias
         @params[:bias_scale] = bias_scale
         @params[:max_iter] = max_iter
         @params[:batch_size] = batch_size
-        @params[:
+        @params[:optimizer] = optimizer
         @params[:random_seed] = random_seed
         @params[:random_seed] ||= srand
         @weight_vec = nil
@@ -78,9 +79,9 @@ module SVMKit
       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
       # @return [LogisticRegression] The learned classifier itself.
       def fit(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
 
         @classes = Numo::Int32[*y.to_a.uniq.sort]
         n_classes = @classes.size
@@ -109,8 +110,7 @@ module SVMKit
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
       def decision_function(x)
+        check_sample_array(x)
         x.dot(@weight_vec.transpose) + @bias_term
       end
 
@@ -119,7 +119,7 @@ module SVMKit
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
       def predict(x)
+        check_sample_array(x)
 
         return Numo::Int32.cast(predict_proba(x)[true, 1].ge(0.5)) * 2 - 1 if @classes.size <= 2
 
@@ -133,7 +133,7 @@ module SVMKit
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
       def predict_proba(x)
+        check_sample_array(x)
 
         proba = 1.0 / (Numo::NMath.exp(-decision_function(x)) + 1.0)
         return (proba.transpose / proba.sum(axis: 1)).transpose if @classes.size > 2
@@ -168,40 +168,41 @@ module SVMKit
 
       private
 
-      def binary_fit(x,
+      def binary_fit(x, y)
         # Expand feature vectors for bias term.
         samples = @params[:fit_bias] ? expand_feature(x) : x
         # Initialize some variables.
         n_samples, n_features = samples.shape
         rand_ids = [*0...n_samples].shuffle(random: @rng)
         weight_vec = Numo::DFloat.zeros(n_features)
+        optimizer = Optimizer::Nadam.new
         # Start optimization.
-        @params[:max_iter].times do |
+        @params[:max_iter].times do |_t|
           # random sampling
           subset_ids = rand_ids.shift(@params[:batch_size])
           rand_ids.concat(subset_ids)
-          normalize_weight_vec(weight_vec) if @params[:normalize]
+          data = samples[subset_ids, true]
+          labels = y[subset_ids]
+          # calculate gradient for loss function.
+          loss_grad = loss_gradient(data, labels, weight_vec)
+          # update weight.
+          weight_vec = optimizer.call(weight_vec, weight_gradient(loss_grad, data, weight_vec))
         end
         split_weight_vec_bias(weight_vec)
       end
 
-      def
+      def loss_gradient(x, y, weight)
+        z = x.dot(weight)
+        grad = y / (Numo::NMath.exp(-y * z) + 1.0) - y
+        grad
       end
 
-      def
+      def weight_gradient(loss_grad, x, weight)
+        x.transpose.dot(loss_grad) / @params[:batch_size] + @params[:reg_param] * weight
      end
 
-      def
-        weight_vec * [1.0, (1.0 / @params[:reg_param]**0.5) / (norm + 1.0e-12)].min
+      def expand_feature(x)
+        Numo::NArray.hstack([x, Numo::DFloat.ones([x.shape[0], 1]) * @params[:bias_scale]])
      end
 
      def split_weight_vec_bias(weight_vec)
data/lib/svmkit/linear_model/ridge.rb
CHANGED
@@ -3,22 +3,19 @@
 require 'svmkit/validation'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/regressor'
+require 'svmkit/optimizer/nadam'
 
 module SVMKit
   module LinearModel
     # Ridge is a class that implements Ridge Regression
-    # with stochastic gradient descent
+    # with mini-batch stochastic gradient descent optimization.
     #
     # @example
     #   estimator =
-    #     SVMKit::LinearModel::Ridge.new(reg_param: 0.1, max_iter:
+    #     SVMKit::LinearModel::Ridge.new(reg_param: 0.1, max_iter: 1000, batch_size: 20, random_seed: 1)
     #   estimator.fit(training_samples, traininig_values)
     #   results = estimator.predict(testing_samples)
     #
-    # *Reference*
-    # - S. Shalev-Shwartz and Y. Singer, "Pegasos: Primal Estimated sub-GrAdient SOlver for SVM," Proc. ICML'07, pp. 807--814, 2007.
-    # - I. Sutskever, J. Martens, G. Dahl, and G. Hinton, "On the importance of initialization and momentum in deep learning," Proc. ICML'13, pp. 1139--1147, 2013.
-    # - G. Hinton, N. Srivastava, and K. Swersky, "Lecture 6e rmsprop," Neural Networks for Machine Learning, 2012.
     class Ridge
       include Base::BaseEstimator
       include Base::Regressor
@@ -40,30 +37,21 @@ module SVMKit
       #
       # @param reg_param [Float] The regularization parameter.
       # @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
-      # @param learning_rate [Float] The learning rate for optimization.
-      # @param decay [Float] The discounting factor for RMS prop optimization.
-      # @param momentum [Float] The Nesterov momentum for optimization.
       # @param max_iter [Integer] The maximum number of iterations.
       # @param batch_size [Integer] The size of the mini batches.
       # @param random_seed [Integer] The seed value using to initialize the random generator.
-      def initialize(reg_param: 1.0, fit_bias: false,
-        check_params_float(reg_param: reg_param,
-                           learning_rate: learning_rate, decay: decay, momentum: momentum)
+      def initialize(reg_param: 1.0, fit_bias: false, max_iter: 1000, batch_size: 10, optimizer: nil, random_seed: nil)
+        check_params_float(reg_param: reg_param)
         check_params_integer(max_iter: max_iter, batch_size: batch_size)
         check_params_boolean(fit_bias: fit_bias)
         check_params_type_or_nil(Integer, random_seed: random_seed)
-        check_params_positive(reg_param: reg_param,
-                              learning_rate: learning_rate, decay: decay, momentum: momentum,
-                              max_iter: max_iter, batch_size: batch_size)
+        check_params_positive(reg_param: reg_param, max_iter: max_iter, batch_size: batch_size)
         @params = {}
         @params[:reg_param] = reg_param
         @params[:fit_bias] = fit_bias
-        @params[:learning_rate] = learning_rate
-        @params[:decay] = decay
-        @params[:momentum] = momentum
         @params[:max_iter] = max_iter
         @params[:batch_size] = batch_size
+        @params[:optimizer] = optimizer
         @params[:random_seed] = random_seed
         @params[:random_seed] ||= srand
         @weight_vec = nil
@@ -136,8 +124,7 @@ module SVMKit
         n_samples, n_features = samples.shape
         rand_ids = [*0...n_samples].shuffle(random: @rng)
         weight_vec = Numo::DFloat.zeros(n_features)
-        weight_update = Numo::DFloat.zeros(n_features)
+        optimizer = Optimizer::Nadam.new
         # Start optimization.
         @params[:max_iter].times do |_t|
           # Random sampling.
@@ -146,12 +133,10 @@ module SVMKit
           data = samples[subset_ids, true]
           values = y[subset_ids]
           # Calculate gradients for loss function.
-          loss_grad = loss_gradient(data, values, weight_vec
+          loss_grad = loss_gradient(data, values, weight_vec)
           next if loss_grad.ne(0.0).count.zero?
           # Update weight.
-          weight_vec,
-            update_weight(weight_vec, weight_sqrsum, weight_update,
-                          weight_gradient(loss_grad, data, weight_vec - @params[:momentum] * weight_update))
+          weight_vec = optimizer.call(weight_vec, weight_gradient(loss_grad, data, weight_vec))
         end
         split_weight_vec_bias(weight_vec)
       end
@@ -164,13 +149,6 @@ module SVMKit
         (loss_grad.expand_dims(1) * data).mean(0) + @params[:reg_param] * weight
       end
 
-      def update_weight(weight, sqrsum, update, gr)
-        new_sqrsum = @params[:decay] * sqrsum + (1.0 - @params[:decay]) * gr**2
-        new_update = (@params[:learning_rate] / ((new_sqrsum + 1.0e-8)**0.5)) * gr
-        new_weight = weight - (new_update + @params[:momentum] * update)
-        [new_weight, new_sqrsum, new_update]
-      end
-
       def expand_feature(x)
         Numo::NArray.hstack([x, Numo::DFloat.ones([x.shape[0], 1])])
       end
data/lib/svmkit/linear_model/svc.rb
CHANGED
@@ -3,26 +3,28 @@
 require 'svmkit/validation'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/classifier'
+require 'svmkit/optimizer/nadam'
 require 'svmkit/probabilistic_output'
 
 module SVMKit
   # This module consists of the classes that implement generalized linear models.
   module LinearModel
     # SVC is a class that implements Support Vector Classifier
-    # with stochastic gradient descent
+    # with mini-batch stochastic gradient descent optimization.
     # For multiclass classification problem, it uses one-vs-the-rest strategy.
     #
     # @example
     #   estimator =
-    #     SVMKit::LinearModel::SVC.new(reg_param: 1.0, max_iter:
+    #     SVMKit::LinearModel::SVC.new(reg_param: 1.0, max_iter: 1000, batch_size: 20, random_seed: 1)
     #   estimator.fit(training_samples, traininig_labels)
     #   results = estimator.predict(testing_samples)
     #
     # *Reference*
-    #
+    # - S. Shalev-Shwartz and Y. Singer, "Pegasos: Primal Estimated sub-GrAdient SOlver for SVM," Proc. ICML'07, pp. 807--814, 2007.
     class SVC
       include Base::BaseEstimator
       include Base::Classifier
+      include Validation
 
       # Return the weight vector for SVC.
       # @return [Numo::DFloat] (shape: [n_classes, n_features])
@@ -48,16 +50,16 @@ module SVMKit
       # @param max_iter [Integer] The maximum number of iterations.
       # @param batch_size [Integer] The size of the mini batches.
       # @param probability [Boolean] The flag indicating whether to perform probability estimation.
-      # @param
+      # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
+      #   Nadam is selected automatically on current version.
       # @param random_seed [Integer] The seed value using to initialize the random generator.
       def initialize(reg_param: 1.0, fit_bias: false, bias_scale: 1.0,
-                     max_iter:
-                     batch_size: batch_size)
+                     max_iter: 1000, batch_size: 20, probability: false, optimizer: nil, random_seed: nil)
+        check_params_float(reg_param: reg_param, bias_scale: bias_scale)
+        check_params_integer(max_iter: max_iter, batch_size: batch_size)
+        check_params_boolean(fit_bias: fit_bias, probability: probability)
+        check_params_type_or_nil(Integer, random_seed: random_seed)
+        check_params_positive(reg_param: reg_param, bias_scale: bias_scale, max_iter: max_iter, batch_size: batch_size)
         @params = {}
         @params[:reg_param] = reg_param
         @params[:fit_bias] = fit_bias
@@ -65,7 +67,7 @@ module SVMKit
         @params[:max_iter] = max_iter
         @params[:batch_size] = batch_size
         @params[:probability] = probability
-        @params[:
+        @params[:optimizer] = optimizer
         @params[:random_seed] = random_seed
         @params[:random_seed] ||= srand
         @weight_vec = nil
@@ -81,9 +83,9 @@ module SVMKit
       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
       # @return [SVC] The learned classifier itself.
       def fit(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
 
         @classes = Numo::Int32[*y.to_a.uniq.sort]
         n_classes = @classes.size
@@ -123,8 +125,7 @@ module SVMKit
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
       def decision_function(x)
+        check_sample_array(x)
         x.dot(@weight_vec.transpose) + @bias_term
       end
 
@@ -133,7 +134,7 @@ module SVMKit
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
       def predict(x)
+        check_sample_array(x)
 
         return Numo::Int32.cast(decision_function(x).ge(0.0)) * 2 - 1 if @classes.size <= 2
 
@@ -147,7 +148,7 @@ module SVMKit
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
       def predict_proba(x)
+        check_sample_array(x)
 
         if @classes.size > 2
           probs = 1.0 / (Numo::NMath.exp(@prob_param[true, 0] * decision_function(x) + @prob_param[true, 1]) + 1.0)
@@ -186,43 +187,43 @@ module SVMKit
 
       private
 
-      def binary_fit(x,
+      def binary_fit(x, y)
         # Expand feature vectors for bias term.
         samples = @params[:fit_bias] ? expand_feature(x) : x
         # Initialize some variables.
         n_samples, n_features = samples.shape
         rand_ids = [*0...n_samples].shuffle(random: @rng)
         weight_vec = Numo::DFloat.zeros(n_features)
+        optimizer = Optimizer::Nadam.new
         # Start optimization.
-        @params[:max_iter].times do |
-          # random sampling
+        @params[:max_iter].times do |_t|
+          # random sampling.
           subset_ids = rand_ids.shift(@params[:batch_size])
          rand_ids.concat(subset_ids)
-          next if
-          # update
-          weight_vec -= learning_rate(t) * (@params[:reg_param] * weight_vec - mean_vec)
-          # scale the weight vector.
-          normalize_weight_vec(weight_vec) if @params[:normalize]
+          data = samples[subset_ids, true]
+          labels = y[subset_ids]
+          # calculate gradient for loss function.
+          loss_grad = loss_gradient(data, labels, weight_vec)
+          next if loss_grad.ne(0.0).count.zero?
+          # update weight.
+          weight_vec = optimizer.call(weight_vec, weight_gradient(loss_grad, data, weight_vec))
        end
        split_weight_vec_bias(weight_vec)
      end
 
-      def
+      def loss_gradient(x, y, weight)
+        target_ids = (x.dot(weight) * y).lt(1.0).where
+        grad = Numo::DFloat.zeros(@params[:batch_size])
+        grad[target_ids] = -y[target_ids]
+        grad
      end
 
-      def
+      def weight_gradient(loss_grad, x, weight)
+        x.transpose.dot(loss_grad) / @params[:batch_size] + @params[:reg_param] * weight
      end
 
-      def
-        weight_vec * [1.0, (1.0 / @params[:reg_param]**0.5) / (norm + 1.0e-12)].min
+      def expand_feature(x)
+        Numo::NArray.hstack([x, Numo::DFloat.ones([x.shape[0], 1]) * @params[:bias_scale]])
      end
 
      def split_weight_vec_bias(weight_vec)
data/lib/svmkit/linear_model/svr.rb
CHANGED
@@ -3,15 +3,16 @@
 require 'svmkit/validation'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/regressor'
+require 'svmkit/optimizer/nadam'
 
 module SVMKit
   module LinearModel
     # SVR is a class that implements Support Vector Regressor
-    # with stochastic gradient descent
+    # with mini-batch stochastic gradient descent optimization.
     #
     # @example
     #   estimator =
-    #     SVMKit::LinearModel::SVR.new(reg_param: 1.0, epsilon: 0.1, max_iter:
+    #     SVMKit::LinearModel::SVR.new(reg_param: 1.0, epsilon: 0.1, max_iter: 1000, batch_size: 20, random_seed: 1)
     #   estimator.fit(training_samples, traininig_target_values)
     #   results = estimator.predict(testing_samples)
     #
@@ -20,6 +21,7 @@ module SVMKit
     class SVR
       include Base::BaseEstimator
       include Base::Regressor
+      include Validation
 
       # Return the weight vector for SVR.
       # @return [Numo::DFloat] (shape: [n_outputs, n_features])
@@ -41,16 +43,17 @@ module SVMKit
       # @param epsilon [Float] The margin of tolerance.
       # @param max_iter [Integer] The maximum number of iterations.
       # @param batch_size [Integer] The size of the mini batches.
-      # @param
+      # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
+      #   Nadam is selected automatically on current version.
       # @param random_seed [Integer] The seed value using to initialize the random generator.
       def initialize(reg_param: 1.0, fit_bias: false, bias_scale: 1.0, epsilon: 0.1,
-                     max_iter:
+                     max_iter: 1000, batch_size: 20, optimizer: nil, random_seed: nil)
+        check_params_float(reg_param: reg_param, bias_scale: bias_scale, epsilon: epsilon)
+        check_params_integer(max_iter: max_iter, batch_size: batch_size)
+        check_params_boolean(fit_bias: fit_bias)
+        check_params_type_or_nil(Integer, random_seed: random_seed)
+        check_params_positive(reg_param: reg_param, bias_scale: bias_scale, epsilon: epsilon,
+                              max_iter: max_iter, batch_size: batch_size)
         @params = {}
         @params[:reg_param] = reg_param
         @params[:fit_bias] = fit_bias
@@ -58,7 +61,7 @@ module SVMKit
         @params[:epsilon] = epsilon
         @params[:max_iter] = max_iter
         @params[:batch_size] = batch_size
-        @params[:
+        @params[:optimizer] = optimizer
         @params[:random_seed] = random_seed
         @params[:random_seed] ||= srand
         @weight_vec = nil
@@ -72,9 +75,9 @@ module SVMKit
       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
       # @return [SVR] The learned regressor itself.
       def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
 
         n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
         _n_samples, n_features = x.shape
@@ -99,7 +102,7 @@ module SVMKit
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
       def predict(x)
+        check_sample_array(x)
         x.dot(@weight_vec.transpose) + @bias_term
       end
 
@@ -131,35 +134,35 @@ module SVMKit
         n_samples, n_features = samples.shape
         rand_ids = [*0...n_samples].shuffle(random: @rng)
         weight_vec = Numo::DFloat.zeros(n_features)
+        optimizer = Optimizer::Nadam.new
         # Start optimization.
-        @params[:max_iter].times do |
+        @params[:max_iter].times do |_t|
           # random sampling
           subset_ids = rand_ids.shift(@params[:batch_size])
           rand_ids.concat(subset_ids)
+          data = samples[subset_ids, true]
+          values = y[subset_ids]
           # update the weight vector.
-          coef[(z - y[subset_ids]).gt(@params[:epsilon]).where] = 1
-          coef[(y[subset_ids] - z).gt(@params[:epsilon]).where] = -1
-          mean_vec = samples[subset_ids, true].transpose.dot(coef) / @params[:batch_size]
-          weight_vec -= learning_rate(t) * (@params[:reg_param] * weight_vec + mean_vec)
-          # scale the weight vector.
-          normalize_weight_vec(weight_vec) if @params[:normalize]
+          loss_grad = loss_gradient(data, values, weight_vec)
+          weight_vec = optimizer.call(weight_vec, weight_gradient(loss_grad, data, weight_vec))
        end
        split_weight_vec_bias(weight_vec)
      end
 
-      def
+      def loss_gradient(x, y, weight)
+        z = x.dot(weight)
+        grad = Numo::DFloat.zeros(@params[:batch_size])
+        grad[(z - y).gt(@params[:epsilon]).where] = 1
+        grad[(y - z).gt(@params[:epsilon]).where] = -1
+        grad
      end
 
-      def
+      def weight_gradient(loss_grad, x, weight)
+        x.transpose.dot(loss_grad) / @params[:batch_size] + @params[:reg_param] * weight
      end
 
-      def
-        weight_vec * [1.0, (1.0 / @params[:reg_param]**0.5) / (norm + 1.0e-12)].min
+      def expand_feature(x)
+        Numo::NArray.hstack([x, Numo::DFloat.ones([x.shape[0], 1]) * @params[:bias_scale]])
      end
 
      def split_weight_vec_bias(weight_vec)
data/lib/svmkit/optimizer/nadam.rb
ADDED
@@ -0,0 +1,64 @@
+# frozen_string_literal: true
+
+require 'svmkit/validation'
+
+module SVMKit
+  # This module consists of the classes that implement optimizers adaptively tuning hyperparameters.
+  module Optimizer
+    # Nadam is a class that implements Nadam optimizer.
+    # This class is used for internal processes.
+    #
+    # *Reference*
+    # - T. Dozat, "Incorporating Nesterov Momentum into Adam," Tech. Repo. Stanford University, 2015.
+    class Nadam
+      include Validation
+
+      # Create a new optimizer with Nadam
+      #
+      # @param learning_rate [Float] The initial value of learning rate.
+      # @param momentum [Float] The initial value of momentum.
+      # @param decay1 [Float] The smoothing parameter for the first moment.
+      # @param decay2 [Float] The smoothing parameter for the second moment.
+      # @param schedule_decay [Float] The smooting parameter.
+      def initialize(learning_rate: 0.01, momentum: 0.9, decay1: 0.9, decay2: 0.999)
+        check_params_float(learning_rate: learning_rate, momentum: momentum, decay1: decay1, decay2: decay2)
+        check_params_positive(learning_rate: learning_rate, momentum: momentum, decay1: decay1, decay2: decay2)
+        @params = {}
+        @params[:learning_rate] = learning_rate
+        @params[:momentum] = momentum
+        @params[:decay1] = decay1
+        @params[:decay2] = decay2
+        @fst_moment = nil
+        @sec_moment = nil
+        @decay1_prod = 1.0
+        @iter = 0
+      end
+
+      # Calculate the updated weight with Nadam adaptive learning rate.
+      #
+      # @param weight [Numo::DFloat] (shape: [n_features]) The weight to be updated.
+      # @param gradient [Numo::DFloat] (shape: [n_features]) The gradient for updating the weight.
+      # @return [Numo::DFloat] (shape: [n_feautres]) The updated weight.
+      def call(weight, gradient)
+        @fst_moment ||= Numo::DFloat.zeros(weight.shape[0])
+        @sec_moment ||= Numo::DFloat.zeros(weight.shape[0])
+
+        @iter += 1
+
+        decay1_curr = @params[:decay1] * (1.0 - 0.5 * 0.96**(@iter * 0.004))
+        decay1_next = @params[:decay1] * (1.0 - 0.5 * 0.96**((@iter + 1) * 0.004))
+        decay1_prod_curr = @decay1_prod * decay1_curr
+        decay1_prod_next = @decay1_prod * decay1_curr * decay1_next
+        @decay1_prod = decay1_prod_curr
+
+        @fst_moment = @params[:decay1] * @fst_moment + (1.0 - @params[:decay1]) * gradient
+        @sec_moment = @params[:decay2] * @sec_moment + (1.0 - @params[:decay2]) * gradient**2
+        nm_gradient = gradient / (1.0 - decay1_prod_curr)
+        nm_fst_moment = @fst_moment / (1.0 - decay1_prod_next)
+        nm_sec_moment = @sec_moment / (1.0 - @params[:decay2]**@iter)
+
+        weight - (@params[:learning_rate] / (nm_sec_moment**0.5 + 1e-8)) * ((1 - decay1_curr) * nm_gradient + decay1_next * nm_fst_moment)
+      end
+    end
+  end
+end
data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb
CHANGED
@@ -3,6 +3,7 @@
 require 'svmkit/validation'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/classifier'
+require 'svmkit/optimizer/nadam'
 
 module SVMKit
   # This module consists of the classes that implement polynomial models.
@@ -14,7 +15,7 @@ module SVMKit
     # @example
     #   estimator =
     #     SVMKit::PolynomialModel::FactorizationMachineClassifier.new(
-    #      n_factors: 10, loss: 'hinge',
+    #      n_factors: 10, loss: 'hinge', reg_param_linear: 0.001, reg_param_factor: 0.001,
     #      max_iter: 5000, batch_size: 50, random_seed: 1)
     #   estimator.fit(training_samples, traininig_labels)
     #   results = estimator.predict(testing_samples)
@@ -25,6 +26,7 @@ module SVMKit
     class FactorizationMachineClassifier
       include Base::BaseEstimator
      include Base::Classifier
+      include Validation
 
      # Return the factor matrix for Factorization Machine.
      # @return [Numo::DFloat] (shape: [n_classes, n_factors, n_features])
@@ -50,32 +52,30 @@ module SVMKit
      #
      # @param n_factors [Integer] The maximum number of iterations.
      # @param loss [String] The loss function ('hinge' or 'logistic').
-      # @param
-      # @param reg_param_weight [Float] The regularization parameter for weight vector.
+      # @param reg_param_linear [Float] The regularization parameter for linear model.
      # @param reg_param_factor [Float] The regularization parameter for factor matrix.
-      # @param init_std [Float] The standard deviation of normal random number for initialization of factor matrix.
      # @param max_iter [Integer] The maximum number of iterations.
      # @param batch_size [Integer] The size of the mini batches.
+      # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
+      #   Nadam is selected automatically on current version.
      # @param random_seed [Integer] The seed value using to initialize the random generator.
-      def initialize(n_factors: 2, loss: 'hinge',
-                     max_iter: max_iter, batch_size: batch_size)
+      def initialize(n_factors: 2, loss: 'hinge', reg_param_linear: 1.0, reg_param_factor: 1.0,
+                     max_iter: 1000, batch_size: 10, optimizer: nil, random_seed: nil)
+        check_params_float(reg_param_linear: reg_param_linear, reg_param_factor: reg_param_factor)
+        check_params_integer(n_factors: n_factors, max_iter: max_iter, batch_size: batch_size)
+        check_params_string(loss: loss)
+        check_params_type_or_nil(Integer, random_seed: random_seed)
+        check_params_positive(n_factors: n_factors,
+                              reg_param_linear: reg_param_linear, reg_param_factor: reg_param_factor,
+                              max_iter: max_iter, batch_size: batch_size)
        @params = {}
        @params[:n_factors] = n_factors
        @params[:loss] = loss
-        @params[:
-        @params[:reg_param_weight] = reg_param_weight
+        @params[:reg_param_linear] = reg_param_linear
        @params[:reg_param_factor] = reg_param_factor
-        @params[:init_std] = init_std
        @params[:max_iter] = max_iter
        @params[:batch_size] = batch_size
+        @params[:optimizer] = optimizer
        @params[:random_seed] = random_seed
        @params[:random_seed] ||= srand
        @factor_mat = nil
@@ -91,9 +91,9 @@ module SVMKit
      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
      # @return [FactorizationMachineClassifier] The learned classifier itself.
      def fit(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
 
        @classes = Numo::Int32[*y.to_a.uniq.sort]
        n_classes = @classes.size
@@ -124,7 +124,7 @@ module SVMKit
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
      # @return [Numo::DFloat] (shape: [n_samples]) Confidence score per sample.
      def decision_function(x)
+        check_sample_array(x)
        linear_term = @bias_term + x.dot(@weight_vec.transpose)
        factor_term = if @classes.size <= 2
                        0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(0)
@@ -139,7 +139,7 @@ module SVMKit
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
      def predict(x)
+        check_sample_array(x)
        return Numo::Int32.cast(decision_function(x).ge(0.0)) * 2 - 1 if @classes.size <= 2
 
        n_samples, = x.shape
@@ -152,7 +152,7 @@ module SVMKit
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
      def predict_proba(x)
+        check_sample_array(x)
        proba = 1.0 / (Numo::NMath.exp(-decision_function(x)) + 1.0)
        return (proba.transpose / proba.sum(axis: 1)).transpose if @classes.size > 2
 
@@ -188,84 +188,76 @@ module SVMKit
 
      private
 
-      def binary_fit(x,
+      def binary_fit(x, y)
        # Initialize some variables.
        n_samples, n_features = x.shape
        rand_ids = [*0...n_samples].shuffle(random: @rng)
+        weight_vec = Numo::DFloat.zeros(n_features + 1)
+        factor_mat = Numo::DFloat.zeros(@params[:n_factors], n_features)
+        weight_optimizer = Optimizer::Nadam.new
+        factor_optimizers = Array.new(@params[:n_factors]) { Optimizer::Nadam.new }
        # Start optimization.
-        @params[:max_iter].times do |
+        @params[:max_iter].times do |_t|
          # Random sampling.
          subset_ids = rand_ids.shift(@params[:batch_size])
          rand_ids.concat(subset_ids)
          data = x[subset_ids, true]
+          ex_data = expand_feature(data)
+          label = y[subset_ids]
          # Calculate gradients for loss function.
-          loss_grad = loss_gradient(data, label, factor_mat, weight_vec
+          loss_grad = loss_gradient(data, ex_data, label, factor_mat, weight_vec)
          next if loss_grad.ne(0.0).count.zero?
          # Update each parameter.
-          weight_vec -= learning_rate(@params[:reg_param_weight], t) * weight_gradient(loss_grad, data, weight_vec)
+          weight_vec = weight_optimizer.call(weight_vec, weight_gradient(loss_grad, ex_data, weight_vec))
          @params[:n_factors].times do |n|
-            factor_mat[n, true]
+            factor_mat[n, true] = factor_optimizers[n].call(factor_mat[n, true],
+                                                            factor_gradient(loss_grad, data, factor_mat[n, true]))
          end
        end
-        [factor_mat, weight_vec
+        [factor_mat, *split_weight_vec_bias(weight_vec)]
      end
 
-      def bin_decision_function(x, factor, weight
+      def bin_decision_function(x, ex_x, factor, weight)
+        ex_x.dot(weight) + 0.5 * (factor.dot(x.transpose)**2 - (factor**2).dot(x.transpose**2)).sum(0)
      end
 
-      def hinge_loss_gradient(x, y, factor, weight
-        evaluated = y * bin_decision_function(x, factor, weight
+      def hinge_loss_gradient(x, ex_x, y, factor, weight)
+        evaluated = y * bin_decision_function(x, ex_x, factor, weight)
        gradient = Numo::DFloat.zeros(evaluated.size)
        gradient[evaluated < 1.0] = -y[evaluated < 1.0]
        gradient
      end
 
-      def logistic_loss_gradient(x, y, factor, weight
-        evaluated = y * bin_decision_function(x, factor, weight
+      def logistic_loss_gradient(x, ex_x, y, factor, weight)
+        evaluated = y * bin_decision_function(x, ex_x, factor, weight)
        sigmoid_func = 1.0 / (Numo::NMath.exp(-evaluated) + 1.0)
        (sigmoid_func - 1.0) * y
      end
 
-      def loss_gradient(x, y, factor, weight
+      def loss_gradient(x, ex_x, y, factor, weight)
        if @params[:loss] == 'hinge'
-          hinge_loss_gradient(x, y, factor, weight
+          hinge_loss_gradient(x, ex_x, y, factor, weight)
        else
-          logistic_loss_gradient(x, y, factor, weight
+          logistic_loss_gradient(x, ex_x, y, factor, weight)
        end
      end
 
-      def learning_rate(reg_param, iter)
-        1.0 / (reg_param * (iter + 1))
-      end
-
-      def bias_gradient(loss_grad, bias)
-        loss_grad.mean + @params[:reg_param_bias] * bias
-      end
-
      def weight_gradient(loss_grad, data, weight)
-        (loss_grad.expand_dims(1) * data).mean(0) + @params[:
+        (loss_grad.expand_dims(1) * data).mean(0) + @params[:reg_param_linear] * weight
      end
 
      def factor_gradient(loss_grad, data, factor)
-        (loss_grad.expand_dims(1) * (data * data.dot(factor).expand_dims(1) - factor * (data**2))).mean(0) + reg_term
+        (loss_grad.expand_dims(1) * (data * data.dot(factor).expand_dims(1) - factor * (data**2))).mean(0) + @params[:reg_param_factor] * factor
      end
 
-      def
-        Numo::
+      def expand_feature(x)
+        Numo::NArray.hstack([x, Numo::DFloat.ones([x.shape[0], 1])])
      end
 
-      def
+      def split_weight_vec_bias(weight_vec)
+        weights = weight_vec[0...-1]
+        bias = weight_vec[-1]
+        [weights, bias]
      end
    end
  end
data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb
CHANGED
@@ -3,6 +3,7 @@
 require 'svmkit/validation'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/regressor'
+require 'svmkit/optimizer/nadam'
 
 module SVMKit
   module PolynomialModel
@@ -12,7 +13,7 @@ module SVMKit
     # @example
     #   estimator =
     #     SVMKit::PolynomialModel::FactorizationMachineRegressor.new(
-    #      n_factors: 10,
+    #      n_factors: 10, reg_param_linear: 0.1, reg_param_factor: 0.1,
     #      max_iter: 5000, batch_size: 50, random_seed: 1)
     #   estimator.fit(training_samples, traininig_values)
     #   results = estimator.predict(testing_samples)
@@ -20,8 +21,6 @@ module SVMKit
     # *Reference*
     # - S. Rendle, "Factorization Machines with libFM," ACM Transactions on Intelligent Systems and Technology, vol. 3 (3), pp. 57:1--57:22, 2012.
     # - S. Rendle, "Factorization Machines," Proc. the 10th IEEE International Conference on Data Mining (ICDM'10), pp. 995--1000, 2010.
-    # - I. Sutskever, J. Martens, G. Dahl, and G. Hinton, "On the importance of initialization and momentum in deep learning," Proc. the 30th International Conference on Machine Learning (ICML' 13), pp. 1139--1147, 2013.
-    # - G. Hinton, N. Srivastava, and K. Swersky, "Lecture 6e rmsprop," Neural Networks for Machine Learning, 2012.
     class FactorizationMachineRegressor
       include Base::BaseEstimator
      include Base::Regressor
@@ -46,40 +45,27 @@ module SVMKit
      # Create a new regressor with Factorization Machine.
      #
      # @param n_factors [Integer] The maximum number of iterations.
-      # @param
-      # @param reg_param_weight [Float] The regularization parameter for weight vector.
+      # @param reg_param_linear [Float] The regularization parameter for linear model.
      # @param reg_param_factor [Float] The regularization parameter for factor matrix.
-      # @param init_std [Float] The standard deviation of normal random number for initialization of factor matrix.
-      # @param learning_rate [Float] The learning rate for optimization.
-      # @param decay [Float] The discounting factor for RMS prop optimization.
-      # @param momentum [Float] The Nesterov momentum for optimization.
      # @param max_iter [Integer] The maximum number of iterations.
      # @param batch_size [Integer] The size of the mini batches.
+      # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
+      #   Nadam is selected automatically on current version.
      # @param random_seed [Integer] The seed value using to initialize the random generator.
-      def initialize(n_factors: 2,
-                     max_iter: 1000, batch_size: 10, random_seed: nil)
-        check_params_float(reg_param_bias: reg_param_bias, reg_param_weight: reg_param_weight,
-                           reg_param_factor: reg_param_factor, init_std: init_std,
-                           learning_rate: learning_rate, decay: decay, momentum: momentum)
+      def initialize(n_factors: 2, reg_param_linear: 1.0, reg_param_factor: 1.0,
+                     max_iter: 1000, batch_size: 10, optimizer: nil, random_seed: nil)
+        check_params_float(reg_param_linear: reg_param_linear, reg_param_factor: reg_param_factor)
        check_params_integer(n_factors: n_factors, max_iter: max_iter, batch_size: batch_size)
        check_params_type_or_nil(Integer, random_seed: random_seed)
-        check_params_positive(n_factors: n_factors,
-                              reg_param_weight: reg_param_weight, reg_param_factor: reg_param_factor,
-                              learning_rate: learning_rate, decay: decay, momentum: momentum,
+        check_params_positive(n_factors: n_factors, reg_param_linear: reg_param_linear, reg_param_factor: reg_param_factor,
                              max_iter: max_iter, batch_size: batch_size)
        @params = {}
        @params[:n_factors] = n_factors
-        @params[:
-        @params[:reg_param_weight] = reg_param_weight
+        @params[:reg_param_linear] = reg_param_linear
        @params[:reg_param_factor] = reg_param_factor
-        @params[:init_std] = init_std
-        @params[:learning_rate] = learning_rate
-        @params[:decay] = decay
-        @params[:momentum] = momentum
        @params[:max_iter] = max_iter
        @params[:batch_size] = batch_size
+        @params[:optimizer] = optimizer
        @params[:random_seed] = random_seed
        @params[:random_seed] ||= srand
        @factor_mat = nil
@@ -160,74 +146,52 @@ module SVMKit
        # Initialize some variables.
        n_samples, n_features = x.shape
        rand_ids = [*0...n_samples].shuffle(random: @rng)
-        weight_sqrsum = Numo::DFloat.zeros(n_features)
-        weight_update = Numo::DFloat.zeros(n_features)
-        bias_term = 0.0
-        bias_sqrsum = 0.0
-        bias_update = 0.0
+        weight_vec = Numo::DFloat.zeros(n_features + 1)
+        factor_mat = Numo::DFloat.zeros(@params[:n_factors], n_features)
+        weight_optimizer = Optimizer::Nadam.new
+        factor_optimizers = Array.new(@params[:n_factors]) { Optimizer::Nadam.new }
        # Start optimization.
        @params[:max_iter].times do |_t|
          # Random sampling.
          subset_ids = rand_ids.shift(@params[:batch_size])
          rand_ids.concat(subset_ids)
          data = x[subset_ids, true]
+          ex_data = expand_feature(data)
          values = y[subset_ids]
          # Calculate gradients for loss function.
-          loss_grad = loss_gradient(data, values,
-                                    factor_mat - @params[:momentum] * factor_update,
-                                    weight_vec - @params[:momentum] * weight_update,
-                                    bias_term - @params[:momentum] * bias_update)
+          loss_grad = loss_gradient(data, ex_data, values, factor_mat, weight_vec)
          next if loss_grad.ne(0.0).count.zero?
          # Update each parameter.
-            update_param(bias_term, bias_sqrsum, bias_update,
-                         bias_gradient(loss_grad, bias_term - @params[:momentum] * bias_update))
-          weight_vec, weight_sqrsum, weight_update =
-            update_param(weight_vec, weight_sqrsum, weight_update,
-                         weight_gradient(loss_grad, data, weight_vec - @params[:momentum] * weight_update))
+          weight_vec = weight_optimizer.call(weight_vec, weight_gradient(loss_grad, ex_data, weight_vec))
          @params[:n_factors].times do |n|
-              factor_gradient(loss_grad, data, factor_mat[n, true] - @params[:momentum] * factor_update[n, true]))
+            factor_mat[n, true] = factor_optimizers[n].call(factor_mat[n, true],
+                                                            factor_gradient(loss_grad, data, factor_mat[n, true]))
          end
        end
-        [factor_mat, weight_vec
+        [factor_mat, *split_weight_vec_bias(weight_vec)]
      end
 
-      def loss_gradient(x, y, factor, weight
-        z =
+      def loss_gradient(x, ex_x, y, factor, weight)
+        z = ex_x.dot(weight) + 0.5 * (factor.dot(x.transpose)**2 - (factor**2).dot(x.transpose**2)).sum(0)
        2.0 * (z - y)
      end
 
-      def bias_gradient(loss_grad, bias)
-        loss_grad.mean + @params[:reg_param_bias] * bias
-      end
-
      def weight_gradient(loss_grad, data, weight)
-        (loss_grad.expand_dims(1) * data).mean(0) + @params[:
+        (loss_grad.expand_dims(1) * data).mean(0) + @params[:reg_param_linear] * weight
      end
 
      def factor_gradient(loss_grad, data, factor)
        (loss_grad.expand_dims(1) * (data * data.dot(factor).expand_dims(1) - factor * (data**2))).mean(0) + @params[:reg_param_factor] * factor
      end
 
-      def
-        new_update = (@params[:learning_rate] / ((new_sqrsum + 1.0e-8)**0.5)) * gr
-        new_param = param - (new_update + @params[:momentum] * update)
-        [new_param, new_sqrsum, new_update]
-      end
-
-      def rand_uniform(shape)
-        Numo::DFloat[*Array.new(shape.inject(&:*)) { @rng.rand }].reshape(*shape)
+      def expand_feature(x)
+        Numo::NArray.hstack([x, Numo::DFloat.ones([x.shape[0], 1])])
      end
 
-      def
+      def split_weight_vec_bias(weight_vec)
+        weights = weight_vec[0...-1]
+        bias = weight_vec[-1]
+        [weights, bias]
      end
    end
  end
data/lib/svmkit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: svmkit
 version: !ruby/object:Gem::Version
-  version: 0.3.3
+  version: 0.4.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-
+date: 2018-06-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -139,6 +139,7 @@ files:
 - lib/svmkit/naive_bayes/naive_bayes.rb
 - lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb
 - lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb
+- lib/svmkit/optimizer/nadam.rb
 - lib/svmkit/pairwise_metric.rb
 - lib/svmkit/polynomial_model/factorization_machine_classifier.rb
 - lib/svmkit/polynomial_model/factorization_machine_regressor.rb