rumale 0.20.3 → 0.22.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,163 @@
+ # frozen_string_literal: true
+
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/regressor'
+
+ module Rumale
+ module Ensemble
+ # StackingRegressor is a class that implements a regressor with the stacking method.
+ #
+ # @example
+ # estimators = {
+ # las: Rumale::LinearModel::Lasso.new(reg_param: 1e-2, random_seed: 1),
+ # mlp: Rumale::NeuralNetwork::MLPRegressor.new(hidden_units: [256], random_seed: 1),
+ # rnd: Rumale::Ensemble::RandomForestRegressor.new(random_seed: 1)
+ # }
+ # meta_estimator = Rumale::LinearModel::Ridge.new(random_seed: 1)
+ # regressor = Rumale::Ensemble::StackingRegressor.new(
+ # estimators: estimators, meta_estimator: meta_estimator, random_seed: 1
+ # )
+ # regressor.fit(training_samples, training_values)
+ # results = regressor.predict(testing_samples)
+ #
+ # *Reference*
+ # - Zhou, Z-H., "Ensemble Methods - Foundations and Algorithms," CRC Press Taylor and Francis Group, Chapman and Hall/CRC, 2012.
+ class StackingRegressor
+ include Base::BaseEstimator
+ include Base::Regressor
+
+ # Return the base regressors.
+ # @return [Hash<Symbol,Regressor>]
+ attr_reader :estimators
+
+ # Return the meta regressor.
+ # @return [Regressor]
+ attr_reader :meta_estimator
+
+ # Create a new regressor with the stacking method.
+ #
+ # @param estimators [Hash<Symbol,Regressor>] The base regressors for extracting meta features.
+ # @param meta_estimator [Regressor/Nil] The meta regressor that predicts values.
+ # If nil is given, Ridge is used.
+ # @param n_splits [Integer] The number of folds for k-fold cross validation on meta feature extraction in the training phase.
+ # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset on cross validation.
+ # @param passthrough [Boolean] The flag indicating whether to concatenate the original features and meta features when training the meta regressor.
+ # @param random_seed [Integer/Nil] The seed value used to initialize the random generator on cross validation.
+ def initialize(estimators:, meta_estimator: nil, n_splits: 5, shuffle: true, passthrough: false, random_seed: nil)
+ check_params_type(Hash, estimators: estimators)
+ check_params_numeric(n_splits: n_splits)
+ check_params_boolean(shuffle: shuffle, passthrough: passthrough)
+ check_params_numeric_or_nil(random_seed: random_seed)
+ @estimators = estimators
+ @meta_estimator = meta_estimator || Rumale::LinearModel::Ridge.new
+ @output_size = nil
+ @params = {}
+ @params[:n_splits] = n_splits
+ @params[:shuffle] = shuffle
+ @params[:passthrough] = passthrough
+ @params[:random_seed] = random_seed || srand
+ end
+
+ # Fit the model with given training data.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target variables to be used for fitting the model.
+ # @return [StackingRegressor] The learned regressor itself.
+ def fit(x, y)
+ x = check_convert_sample_array(x)
+ y = check_convert_tvalue_array(y)
+ check_sample_tvalue_size(x, y)
+
+ n_samples, n_features = x.shape
+ n_outputs = y.ndim == 1 ? 1 : y.shape[1]
+
+ # training base regressors with all training data.
+ @estimators.each_key { |name| @estimators[name].fit(x, y) }
+
+ # detecting size of output for each base regressor.
+ @output_size = detect_output_size(n_features)
+
+ # extracting meta features with base regressors.
+ n_components = @output_size.values.inject(:+)
+ z = Numo::DFloat.zeros(n_samples, n_components)
+
+ kf = Rumale::ModelSelection::KFold.new(
+ n_splits: @params[:n_splits], shuffle: @params[:shuffle], random_seed: @params[:random_seed]
+ )
+
+ kf.split(x, y).each do |train_ids, valid_ids|
+ x_train = x[train_ids, true]
+ y_train = n_outputs == 1 ? y[train_ids] : y[train_ids, true]
+ x_valid = x[valid_ids, true]
+ f_start = 0
+ @estimators.each_key do |name|
+ est_fold = Marshal.load(Marshal.dump(@estimators[name]))
+ f_last = f_start + @output_size[name]
+ f_position = @output_size[name] == 1 ? f_start : f_start...f_last
+ z[valid_ids, f_position] = est_fold.fit(x_train, y_train).predict(x_valid)
+ f_start = f_last
+ end
+ end
+
+ # concatenating original features.
+ z = Numo::NArray.hstack([z, x]) if @params[:passthrough]
+
+ # training meta regressor.
+ @meta_estimator.fit(z, y)
+
+ self
+ end
+
+ # Predict values for samples.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+ # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) The predicted values per sample.
+ def predict(x)
+ x = check_convert_sample_array(x)
+ z = transform(x)
+ @meta_estimator.predict(z)
+ end
+
+ # Transform the given data with the learned model.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed with the learned model.
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The meta features for samples.
+ def transform(x)
+ x = check_convert_sample_array(x)
+ n_samples = x.shape[0]
+ n_components = @output_size.values.inject(:+)
+ z = Numo::DFloat.zeros(n_samples, n_components)
+ f_start = 0
+ @estimators.each_key do |name|
+ f_last = f_start + @output_size[name]
+ f_position = @output_size[name] == 1 ? f_start : f_start...f_last
+ z[true, f_position] = @estimators[name].predict(x)
+ f_start = f_last
+ end
+ z = Numo::NArray.hstack([z, x]) if @params[:passthrough]
+ z
+ end
+
+ # Fit the model with training data, and then transform them with the learned model.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target variables to be used for fitting the model.
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The meta features for training data.
+ def fit_transform(x, y)
+ x = check_convert_sample_array(x)
+ y = check_convert_tvalue_array(y)
+ fit(x, y).transform(x)
+ end
+
+ private
+
+ def detect_output_size(n_features)
+ x_dummy = Numo::DFloat.new(2, n_features).rand
+ @estimators.each_key.with_object({}) do |name, obj|
+ output_dummy = @estimators[name].predict(x_dummy)
+ obj[name] = output_dummy.ndim == 1 ? 1 : output_dummy.shape[1]
+ end
+ end
+ end
+ end
+ end
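
The new StackingRegressor above is driven as in its docstring example; a minimal end-to-end sketch (the toy data and the particular base estimators are placeholders, not part of the diff):

  require 'rumale'

  # toy regression data: 100 samples, 4 features
  x = Numo::DFloat.new(100, 4).rand
  y = x[true, 0] * 2.0 - x[true, 1] + Numo::DFloat.new(100).rand * 0.01

  estimators = {
    las: Rumale::LinearModel::Lasso.new(reg_param: 1e-2, random_seed: 1),
    rnd: Rumale::Ensemble::RandomForestRegressor.new(random_seed: 1)
  }
  meta_estimator = Rumale::LinearModel::Ridge.new(random_seed: 1)

  regressor = Rumale::Ensemble::StackingRegressor.new(
    estimators: estimators, meta_estimator: meta_estimator, random_seed: 1
  )
  regressor.fit(x, y)
  predicted = regressor.predict(x)         # meta regressor applied to the stacked meta features
  meta_features = regressor.transform(x)   # [100, sum of base estimator output sizes]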
@@ -67,7 +67,7 @@ module Rumale
  def transform(x)
  raise 'FeatureHasher#transform requires Mmh3 but that is not loaded.' unless enable_mmh3?

- x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
+ x = [x] unless x.is_a?(Array)

  n_samples = x.size

  z = Numo::DFloat.zeros(n_samples, n_features)
@@ -99,7 +99,7 @@ module Rumale
  # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  def transform(x)
- x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
+ x = [x] unless x.is_a?(Array)
  n_samples = x.size
  n_features = @vocabulary.size
  z = Numo::DFloat.zeros(n_samples, n_features)
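
Both transform hunks above keep the same convenience: a single Hash is wrapped into a one-element Array before encoding. A small sketch using FeatureHasher, the class named in the raise message (the mmh3 requirement comes from that message; the n_features argument here is an assumption):

  require 'mmh3'
  require 'rumale'

  hasher = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 64)
  batch  = hasher.transform([{ foo: 1, bar: 2 }, { foo: 3, baz: 1 }])  # shape [2, 64]
  single = hasher.transform({ foo: 1, bar: 2 })                        # a lone Hash is also accepted, shape [1, 64]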
@@ -11,9 +11,10 @@ module Rumale
  # with stochastic gradient descent (SGD) optimization.
  # For multiclass classification problem, it uses one-vs-the-rest strategy.
  #
- # Rumale::SVM provides kernel support vector classifier based on LIBSVM.
- # If you prefer execution speed, you should use Rumale::SVM::SVC.
- # https://github.com/yoshoku/rumale-svm
+ # @note
+ # Rumale::SVM provides kernel support vector classifier based on LIBSVM.
+ # If you prefer execution speed, you should use Rumale::SVM::SVC.
+ # https://github.com/yoshoku/rumale-svm
  #
  # @example
  # training_kernel_matrix = Rumale::PairwiseMetric::rbf_kernel(training_samples)
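
The @example continued in this docstring builds a precomputed RBF kernel; a hedged end-to-end sketch of that workflow (the KernelSVC class name and constructor arguments are assumptions about Rumale's kernel machine API; training_samples, training_labels, and testing_samples are placeholders as in the docstring):

  require 'rumale'

  training_kernel_matrix = Rumale::PairwiseMetric::rbf_kernel(training_samples)
  estimator = Rumale::KernelMachine::KernelSVC.new(reg_param: 1.0, random_seed: 1)
  estimator.fit(training_kernel_matrix, training_labels)

  # prediction uses the kernel between testing and training samples
  testing_kernel_matrix = Rumale::PairwiseMetric::rbf_kernel(testing_samples, training_samples)
  results = estimator.predict(testing_kernel_matrix)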
@@ -171,7 +171,7 @@ module Rumale
  @params[:fit_bias] = true
  @params[:reg_param] = 0.0
  @params[:l1_ratio] = 0.0
- @params[:max_iter] = 200
+ @params[:max_iter] = 1000
  @params[:batch_size] = 50
  @params[:tol] = 0.0001
  @params[:verbose] = false
@@ -10,7 +10,7 @@ module Rumale
  #
  # @example
  # estimator =
- # Rumale::LinearModel::ElasticNet.new(reg_param: 0.1, l1_ratio: 0.5, max_iter: 200, batch_size: 50, random_seed: 1)
+ # Rumale::LinearModel::ElasticNet.new(reg_param: 0.1, l1_ratio: 0.5, max_iter: 1000, batch_size: 50, random_seed: 1)
  # estimator.fit(training_samples, traininig_values)
  # results = estimator.predict(testing_samples)
  #
@@ -59,7 +59,7 @@ module Rumale
  # @param random_seed [Integer] The seed value using to initialize the random generator.
  def initialize(learning_rate: 0.01, decay: nil, momentum: 0.9,
  reg_param: 1.0, l1_ratio: 0.5, fit_bias: true, bias_scale: 1.0,
- max_iter: 200, batch_size: 50, tol: 1e-4,
+ max_iter: 1000, batch_size: 50, tol: 1e-4,
  n_jobs: nil, verbose: false, random_seed: nil)
  check_params_numeric(learning_rate: learning_rate, momentum: momentum,
  reg_param: reg_param, l1_ratio: l1_ratio, bias_scale: bias_scale,
@@ -81,7 +81,7 @@ module Rumale
  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
- # @param y [Numo::Int32] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
  # @return [ElasticNet] The learned regressor itself.
  def fit(x, y)
  x = check_convert_sample_array(x)
@@ -10,7 +10,7 @@ module Rumale
  #
  # @example
  # estimator =
- # Rumale::LinearModel::Lasso.new(reg_param: 0.1, max_iter: 500, batch_size: 20, random_seed: 1)
+ # Rumale::LinearModel::Lasso.new(reg_param: 0.1, max_iter: 1000, batch_size: 20, random_seed: 1)
  # estimator.fit(training_samples, traininig_values)
  # results = estimator.predict(testing_samples)
  #
@@ -55,7 +55,7 @@ module Rumale
  # @param random_seed [Integer] The seed value using to initialize the random generator.
  def initialize(learning_rate: 0.01, decay: nil, momentum: 0.9,
  reg_param: 1.0, fit_bias: true, bias_scale: 1.0,
- max_iter: 200, batch_size: 50, tol: 1e-4,
+ max_iter: 1000, batch_size: 50, tol: 1e-4,
  n_jobs: nil, verbose: false, random_seed: nil)
  check_params_numeric(learning_rate: learning_rate, momentum: momentum,
  reg_param: reg_param, bias_scale: bias_scale,
@@ -77,7 +77,7 @@ module Rumale
  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
- # @param y [Numo::Int32] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
  # @return [Lasso] The learned regressor itself.
  def fit(x, y)
  x = check_convert_sample_array(x)
@@ -6,11 +6,12 @@ require 'rumale/base/regressor'
  module Rumale
  module LinearModel
  # LinearRegression is a class that implements ordinary least square linear regression
- # with stochastic gradient descent (SGD) optimization or singular value decomposition (SVD).
+ # with stochastic gradient descent (SGD) optimization,
+ # singular value decomposition (SVD), or L-BFGS optimization.
  #
  # @example
  # estimator =
- # Rumale::LinearModel::LinearRegression.new(max_iter: 500, batch_size: 20, random_seed: 1)
+ # Rumale::LinearModel::LinearRegression.new(max_iter: 1000, batch_size: 20, random_seed: 1)
  # estimator.fit(training_samples, traininig_values)
  # results = estimator.predict(testing_samples)
  #
@@ -41,34 +42,35 @@ module Rumale
  #
  # @param learning_rate [Float] The initial value of learning rate.
  # The learning rate decreases as the iteration proceeds according to the equation: learning_rate / (1 + decay * t).
- # If solver = 'svd', this parameter is ignored.
+ # If solver is not 'sgd', this parameter is ignored.
  # @param decay [Float] The smoothing parameter for decreasing learning rate as the iteration proceeds.
  # If nil is given, the decay sets to 'learning_rate'.
- # If solver = 'svd', this parameter is ignored.
+ # If solver is not 'sgd', this parameter is ignored.
  # @param momentum [Float] The momentum factor.
- # If solver = 'svd', this parameter is ignored.
+ # If solver is not 'sgd', this parameter is ignored.
  # @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
  # @param bias_scale [Float] The scale of the bias term.
  # @param max_iter [Integer] The maximum number of epochs that indicates
  # how many times the whole data is given to the training process.
- # If solver = 'svd', this parameter is ignored.
+ # If solver is 'svd', this parameter is ignored.
  # @param batch_size [Integer] The size of the mini batches.
- # If solver = 'svd', this parameter is ignored.
+ # If solver is not 'sgd', this parameter is ignored.
  # @param tol [Float] The tolerance of loss for terminating optimization.
- # If solver = 'svd', this parameter is ignored.
- # @param solver [String] The algorithm to calculate weights. ('auto', 'sgd' or 'svd').
+ # If solver is 'svd', this parameter is ignored.
+ # @param solver [String] The algorithm to calculate weights. ('auto', 'sgd', 'svd' or 'lbfgs').
  # 'auto' chooses the 'svd' solver if Numo::Linalg is loaded. Otherwise, it chooses the 'sgd' solver.
  # 'sgd' uses the stochastic gradient descent optimization.
  # 'svd' performs singular value decomposition of samples.
+ # 'lbfgs' uses the L-BFGS method for optimization.
  # @param n_jobs [Integer] The number of jobs for running the fit method in parallel.
  # If nil is given, the method does not execute in parallel.
  # If zero or less is given, it becomes equal to the number of processors.
- # This parameter is ignored if the Parallel gem is not loaded.
+ # This parameter is ignored if the Parallel gem is not loaded or solver is not 'sgd'.
  # @param verbose [Boolean] The flag indicating whether to output loss during iteration.
- # If solver = 'svd', this parameter is ignored.
+ # If solver is 'svd', this parameter is ignored.
  # @param random_seed [Integer] The seed value using to initialize the random generator.
  def initialize(learning_rate: 0.01, decay: nil, momentum: 0.9,
- fit_bias: true, bias_scale: 1.0, max_iter: 200, batch_size: 50, tol: 1e-4,
+ fit_bias: true, bias_scale: 1.0, max_iter: 1000, batch_size: 50, tol: 1e-4,
  solver: 'auto',
  n_jobs: nil, verbose: false, random_seed: nil)
  check_params_numeric(learning_rate: learning_rate, momentum: momentum,
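
Putting the solver documentation above together, constructing the regressor with each solver looks roughly like this (a sketch; training_samples and training_values are placeholders):

  require 'rumale'

  # L-BFGS optimization (new in this release); max_iter and tol still apply
  lbfgs_reg = Rumale::LinearModel::LinearRegression.new(solver: 'lbfgs', max_iter: 1000, tol: 1e-4)

  # closed-form solution via SVD; requires Numo::Linalg to be loaded
  svd_reg = Rumale::LinearModel::LinearRegression.new(solver: 'svd')

  # mini-batch SGD; the only solver that honors learning_rate, decay, momentum, batch_size, and n_jobs
  sgd_reg = Rumale::LinearModel::LinearRegression.new(
    solver: 'sgd', learning_rate: 0.01, batch_size: 50, random_seed: 1
  )

  lbfgs_reg.fit(training_samples, training_values)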
@@ -80,9 +82,9 @@ module Rumale
  super()
  @params.merge!(method(:initialize).parameters.map { |_t, arg| [arg, binding.local_variable_get(arg)] }.to_h)
  @params[:solver] = if solver == 'auto'
- load_linalg? ? 'svd' : 'sgd'
+ enable_linalg?(warning: false) ? 'svd' : 'sgd'
  else
- solver != 'svd' ? 'sgd' : 'svd'
+ solver.match?(/^svd$|^sgd$|^lbfgs$/) ? solver : 'sgd'
  end
  @params[:decay] ||= @params[:learning_rate]
  @params[:random_seed] ||= srand
@@ -95,15 +97,17 @@ module Rumale
  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
- # @param y [Numo::Int32] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
  # @return [LinearRegression] The learned regressor itself.
  def fit(x, y)
  x = check_convert_sample_array(x)
  y = check_convert_tvalue_array(y)
  check_sample_tvalue_size(x, y)

- if @params[:solver] == 'svd' && enable_linalg?
+ if @params[:solver] == 'svd' && enable_linalg?(warning: false)
  fit_svd(x, y)
+ elsif @params[:solver] == 'lbfgs'
+ fit_lbfgs(x, y)
  else
  fit_sgd(x, y)
  end
@@ -124,24 +128,46 @@ module Rumale

  def fit_svd(x, y)
  x = expand_feature(x) if fit_bias?
-
  w = Numo::Linalg.pinv(x, driver: 'svd').dot(y)
+ @weight_vec, @bias_term = single_target?(y) ? split_weight(w) : split_weight_mult(w)
+ end

- is_single_target_vals = y.shape[1].nil?
- if @params[:fit_bias]
- @weight_vec = is_single_target_vals ? w[0...-1].dup : w[0...-1, true].dup
- @bias_term = is_single_target_vals ? w[-1] : w[-1, true].dup
- else
- @weight_vec = w.dup
- @bias_term = is_single_target_vals ? 0 : Numo::DFloat.zeros(y.shape[1])
+ def fit_lbfgs(x, y)
+ fnc = proc do |w, x, y| # rubocop:disable Lint/ShadowingOuterLocalVariable
+ n_samples, n_features = x.shape
+ w = w.reshape(y.shape[1], n_features) unless y.shape[1].nil?
+ z = x.dot(w.transpose)
+ d = z - y
+ loss = (d**2).sum.fdiv(n_samples)
+ gradient = 2.fdiv(n_samples) * d.transpose.dot(x)
+ [loss, gradient.flatten.dup]
  end
- end

- def fit_sgd(x, y)
- n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+ x = expand_feature(x) if fit_bias?
+
  n_features = x.shape[1]
+ n_outputs = single_target?(y) ? 1 : y.shape[1]
+
+ res = Lbfgsb.minimize(
+ fnc: fnc, jcb: true, x_init: init_weight(n_features, n_outputs), args: [x, y],
+ maxiter: @params[:max_iter], factr: @params[:tol] / Lbfgsb::DBL_EPSILON,
+ verbose: @params[:verbose] ? 1 : -1
+ )
+
+ @weight_vec, @bias_term =
+ if single_target?(y)
+ split_weight(res[:x])
+ else
+ split_weight_mult(res[:x].reshape(n_outputs, n_features).transpose)
+ end
+ end

- if n_outputs > 1
+ def fit_sgd(x, y)
+ if single_target?(y)
+ @weight_vec, @bias_term = partial_fit(x, y)
+ else
+ n_outputs = y.shape[1]
+ n_features = x.shape[1]
  @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
  @bias_term = Numo::DFloat.zeros(n_outputs)
  if enable_parallel?
@@ -150,20 +176,23 @@ module Rumale
  else
  n_outputs.times { |n| @weight_vec[n, true], @bias_term[n] = partial_fit(x, y[true, n]) }
  end
- else
- @weight_vec, @bias_term = partial_fit(x, y)
  end
  end

- def fit_bias?
- @params[:fit_bias] == true
+ def single_target?(y)
+ y.ndim == 1
  end

- def load_linalg?
- return false if defined?(Numo::Linalg).nil?
- return false if Numo::Linalg::VERSION < '0.1.4'
+ def init_weight(n_features, n_outputs)
+ Rumale::Utils.rand_normal([n_outputs, n_features], @rng.dup).flatten.dup
+ end

- true
+ def split_weight_mult(w)
+ if fit_bias?
+ [w[0...-1, true].dup, w[-1, true].dup]
+ else
+ [w.dup, Numo::DFloat.zeros(w.shape[1])]
+ end
  end
  end
  end
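
The fit_lbfgs method above hands Lbfgsb.minimize a proc that returns [loss, gradient] and reads the solution back from res[:x]. A self-contained toy run of the same pattern for a single-target least-squares problem (no bias term; only keyword arguments that appear in the diff are used):

  require 'lbfgsb'
  require 'numo/narray'

  x = Numo::DFloat.new(100, 3).rand
  true_w = Numo::DFloat[1.5, -2.0, 0.5]
  y = x.dot(true_w)

  fnc = proc do |w, data, target|
    n_samples = data.shape[0]
    d = data.dot(w) - target
    loss = (d**2).sum.fdiv(n_samples)        # mean squared error
    grad = 2.fdiv(n_samples) * d.dot(data)   # gradient of the MSE w.r.t. w
    [loss, grad]
  end

  res = Lbfgsb.minimize(
    fnc: fnc, jcb: true, x_init: Numo::DFloat.zeros(3), args: [x, y],
    maxiter: 1000, factr: 1e-4 / Lbfgsb::DBL_EPSILON, verbose: -1
  )
  p res[:x]  # close to [1.5, -2.0, 0.5]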
@@ -1,21 +1,24 @@
  # frozen_string_literal: true

- require 'rumale/linear_model/base_sgd'
+ require 'lbfgsb'
  require 'rumale/base/classifier'
+ require 'rumale/linear_model/base_sgd'
+ require 'rumale/preprocessing/label_binarizer'

  module Rumale
  module LinearModel
- # LogisticRegression is a class that implements Logistic Regression
- # with stochastic gradient descent optimization.
- # For multiclass classification problem, it uses one-vs-the-rest strategy.
+ # LogisticRegression is a class that implements Logistic Regression.
+ # In multiclass classification problem, it uses one-vs-the-rest strategy for the sgd solver
+ # and multinomial logistic regression for the lbfgs solver.
  #
- # Rumale::SVM provides Logistic Regression based on LIBLINEAR.
- # If you prefer execution speed, you should use Rumale::SVM::LogisticRegression.
- # https://github.com/yoshoku/rumale-svm
+ # @note
+ # Rumale::SVM provides Logistic Regression based on LIBLINEAR.
+ # If you prefer execution speed, you should use Rumale::SVM::LogisticRegression.
+ # https://github.com/yoshoku/rumale-svm
  #
  # @example
  # estimator =
- # Rumale::LinearModel::LogisticRegression.new(reg_param: 1.0, max_iter: 200, batch_size: 50, random_seed: 1)
+ # Rumale::LinearModel::LogisticRegression.new(reg_param: 1.0, random_seed: 1)
  # estimator.fit(training_samples, traininig_labels)
  # results = estimator.predict(testing_samples)
  #
@@ -42,19 +45,24 @@ module Rumale
  # @return [Random]
  attr_reader :rng

- # Create a new classifier with Logisitc Regression by the SGD optimization.
+ # Create a new classifier with Logistic Regression.
  #
  # @param learning_rate [Float] The initial value of learning rate.
  # The learning rate decreases as the iteration proceeds according to the equation: learning_rate / (1 + decay * t).
+ # If solver = 'lbfgs', this parameter is ignored.
  # @param decay [Float] The smoothing parameter for decreasing learning rate as the iteration proceeds.
  # If nil is given, the decay sets to 'reg_param * learning_rate'.
+ # If solver = 'lbfgs', this parameter is ignored.
  # @param momentum [Float] The momentum factor.
+ # If solver = 'lbfgs', this parameter is ignored.
  # @param penalty [String] The regularization type to be used ('l1', 'l2', and 'elasticnet').
+ # If solver = 'lbfgs', only 'l2' can be selected for this parameter.
  # @param l1_ratio [Float] The elastic-net type regularization mixing parameter.
  # If penalty set to 'l2' or 'l1', this parameter is ignored.
  # If l1_ratio = 1, the regularization is similar to Lasso.
  # If l1_ratio = 0, the regularization is similar to Ridge.
  # If 0 < l1_ratio < 1, the regularization is a combination of L1 and L2.
+ # If solver = 'lbfgs', this parameter is ignored.
  # @param reg_param [Float] The regularization parameter.
  # @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
  # @param bias_scale [Float] The scale of the bias term.
@@ -62,28 +70,38 @@ module Rumale
  # @param max_iter [Integer] The maximum number of epochs that indicates
  # how many times the whole data is given to the training process.
  # @param batch_size [Integer] The size of the mini batches.
+ # If solver = 'lbfgs', this parameter is ignored.
  # @param tol [Float] The tolerance of loss for terminating optimization.
+ # If solver = 'lbfgs', this value is given as tol / Lbfgsb::DBL_EPSILON to the factr argument of Lbfgsb.minimize method.
+ # @param solver [String] The algorithm for optimization. ('lbfgs' or 'sgd').
+ # 'lbfgs' uses the L-BFGS method with the lbfgsb.rb gem.
+ # 'sgd' uses the stochastic gradient descent optimization.
  # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
  # If nil is given, the methods do not execute in parallel.
  # If zero or less is given, it becomes equal to the number of processors.
- # This parameter is ignored if the Parallel gem is not loaded.
+ # This parameter is ignored if the Parallel gem is not loaded or the solver is 'lbfgs'.
  # @param verbose [Boolean] The flag indicating whether to output loss during iteration.
+ # If solver = 'lbfgs' and true is given, 'iterate.dat' file is generated by lbfgsb.rb.
  # @param random_seed [Integer] The seed value using to initialize the random generator.
  def initialize(learning_rate: 0.01, decay: nil, momentum: 0.9,
  penalty: 'l2', reg_param: 1.0, l1_ratio: 0.5,
  fit_bias: true, bias_scale: 1.0,
- max_iter: 200, batch_size: 50, tol: 1e-4,
+ max_iter: 1000, batch_size: 50, tol: 1e-4,
+ solver: 'lbfgs',
  n_jobs: nil, verbose: false, random_seed: nil)
  check_params_numeric(learning_rate: learning_rate, momentum: momentum,
  reg_param: reg_param, l1_ratio: l1_ratio, bias_scale: bias_scale,
  max_iter: max_iter, batch_size: batch_size, tol: tol)
  check_params_boolean(fit_bias: fit_bias, verbose: verbose)
- check_params_string(penalty: penalty)
+ check_params_string(solver: solver, penalty: penalty)
  check_params_numeric_or_nil(decay: decay, n_jobs: n_jobs, random_seed: random_seed)
  check_params_positive(learning_rate: learning_rate, reg_param: reg_param,
  bias_scale: bias_scale, max_iter: max_iter, batch_size: batch_size)
+ raise ArgumentError, "The 'lbfgs' solver supports only 'l2' penalties." if solver == 'lbfgs' && penalty != 'l2'
+
  super()
  @params.merge!(method(:initialize).parameters.map { |_t, arg| [arg, binding.local_variable_get(arg)] }.to_h)
+ @params[:solver] = solver == 'sgd' ? 'sgd' : 'lbfgs'
  @params[:decay] ||= @params[:reg_param] * @params[:learning_rate]
  @params[:random_seed] ||= srand
  @rng = Random.new(@params[:random_seed])
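
With the solver parameter introduced above, the two optimization paths can be selected explicitly (a sketch; training_samples, training_labels, and testing_samples are placeholders):

  require 'rumale'

  # default: L-BFGS, multinomial formulation for multiclass problems, 'l2' penalty only
  lbfgs_clf = Rumale::LinearModel::LogisticRegression.new(reg_param: 1.0, random_seed: 1)

  # explicit SGD: one-vs-the-rest for multiclass, supports 'l1'/'l2'/'elasticnet' penalties
  sgd_clf = Rumale::LinearModel::LogisticRegression.new(
    solver: 'sgd', penalty: 'elasticnet', l1_ratio: 0.5, reg_param: 1.0,
    max_iter: 1000, batch_size: 50, random_seed: 1
  )

  lbfgs_clf.fit(training_samples, training_labels)
  results = lbfgs_clf.predict(testing_samples)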
@@ -105,30 +123,10 @@ module Rumale
  check_sample_label_size(x, y)

  @classes = Numo::Int32[*y.to_a.uniq.sort]
-
- if multiclass_problem?
- n_classes = @classes.size
- n_features = x.shape[1]
- @weight_vec = Numo::DFloat.zeros(n_classes, n_features)
- @bias_term = Numo::DFloat.zeros(n_classes)
- if enable_parallel?
- # :nocov:
- models = parallel_map(n_classes) do |n|
- bin_y = Numo::Int32.cast(y.eq(@classes[n])) * 2 - 1
- partial_fit(x, bin_y)
- end
- # :nocov:
- n_classes.times { |n| @weight_vec[n, true], @bias_term[n] = models[n] }
- else
- n_classes.times do |n|
- bin_y = Numo::Int32.cast(y.eq(@classes[n])) * 2 - 1
- @weight_vec[n, true], @bias_term[n] = partial_fit(x, bin_y)
- end
- end
+ if @params[:solver] == 'sgd'
+ fit_sgd(x, y)
  else
- negative_label = @classes[0]
- bin_y = Numo::Int32.cast(y.ne(negative_label)) * 2 - 1
- @weight_vec, @bias_term = partial_fit(x, bin_y)
+ fit_lbfgs(x, y)
  end

  self
@@ -182,6 +180,96 @@ module Rumale
  def multiclass_problem?
  @classes.size > 2
  end
+
+ def fit_lbfgs(base_x, base_y) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+ if multiclass_problem?
+ fnc = proc do |w, x, y, a|
+ n_features = x.shape[1]
+ n_classes = y.shape[1]
+ z = x.dot(w.reshape(n_classes, n_features).transpose)
+ # logsumexp and softmax
+ z_max = z.max(-1).expand_dims(-1).dup
+ z_max[~z_max.isfinite] = 0.0
+ lgsexp = Numo::NMath.log(Numo::NMath.exp(z - z_max).sum(-1)).expand_dims(-1) + z_max
+ t = z - lgsexp
+ sftmax = Numo::NMath.exp(t)
+ # loss and gradient
+ loss = -(y * t).sum + 0.5 * a * w.dot(w)
+ grad = (sftmax - y).transpose.dot(x).flatten.dup + a * w
+ [loss, grad]
+ end
+
+ base_x = expand_feature(base_x) if fit_bias?
+ encoder = Rumale::Preprocessing::LabelBinarizer.new
+ onehot_y = encoder.fit_transform(base_y)
+ n_classes = @classes.size
+ n_features = base_x.shape[1]
+ w_init = Numo::DFloat.zeros(n_classes * n_features)
+
+ verbose = @params[:verbose] ? 1 : -1
+ res = Lbfgsb.minimize(
+ fnc: fnc, jcb: true, x_init: w_init, args: [base_x, onehot_y, @params[:reg_param]],
+ maxiter: @params[:max_iter], factr: @params[:tol] / Lbfgsb::DBL_EPSILON, verbose: verbose
+ )
+
+ if fit_bias?
+ weight = res[:x].reshape(n_classes, n_features)
+ @weight_vec = weight[true, 0...-1].dup
+ @bias_term = weight[true, -1].dup
+ else
+ @weight_vec = res[:x].reshape(n_classes, n_features)
+ @bias_term = Numo::DFloat.zeros(n_classes)
+ end
+ else
+ fnc = proc do |w, x, y, a|
+ z = 1 + Numo::NMath.exp(-y * x.dot(w))
+ loss = Numo::NMath.log(z).sum + 0.5 * a * w.dot(w)
+ grad = (y / z - y).dot(x) + a * w
+ [loss, grad]
+ end
+
+ base_x = expand_feature(base_x) if fit_bias?
+ negative_label = @classes[0]
+ bin_y = Numo::Int32.cast(base_y.ne(negative_label)) * 2 - 1
+ n_features = base_x.shape[1]
+ w_init = Numo::DFloat.zeros(n_features)
+
+ verbose = @params[:verbose] ? 1 : -1
+ res = Lbfgsb.minimize(
+ fnc: fnc, jcb: true, x_init: w_init, args: [base_x, bin_y, @params[:reg_param]],
+ maxiter: @params[:max_iter], factr: @params[:tol] / Lbfgsb::DBL_EPSILON, verbose: verbose
+ )
+
+ @weight_vec, @bias_term = split_weight(res[:x])
+ end
+ end
+
+ def fit_sgd(x, y)
+ if multiclass_problem?
+ n_classes = @classes.size
+ n_features = x.shape[1]
+ @weight_vec = Numo::DFloat.zeros(n_classes, n_features)
+ @bias_term = Numo::DFloat.zeros(n_classes)
+ if enable_parallel?
+ # :nocov:
+ models = parallel_map(n_classes) do |n|
+ bin_y = Numo::Int32.cast(y.eq(@classes[n])) * 2 - 1
+ partial_fit(x, bin_y)
+ end
+ # :nocov:
+ n_classes.times { |n| @weight_vec[n, true], @bias_term[n] = models[n] }
+ else
+ n_classes.times do |n|
+ bin_y = Numo::Int32.cast(y.eq(@classes[n])) * 2 - 1
+ @weight_vec[n, true], @bias_term[n] = partial_fit(x, bin_y)
+ end
+ end
+ else
+ negative_label = @classes[0]
+ bin_y = Numo::Int32.cast(y.ne(negative_label)) * 2 - 1
+ @weight_vec, @bias_term = partial_fit(x, bin_y)
+ end
+ end
  end
  end
  end
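
The multiclass branch of fit_lbfgs above minimizes a softmax cross-entropy with an L2 term, built from one-hot labels (LabelBinarizer) and a max-shifted log-sum-exp. A tiny illustration of those two building blocks with made-up numbers:

  require 'rumale'
  require 'numo/narray'

  labels = Numo::Int32[0, 2, 1, 2]
  onehot = Rumale::Preprocessing::LabelBinarizer.new.fit_transform(labels)
  # => 4x3 matrix with a single 1 per row

  z = Numo::DFloat[[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]]  # arbitrary class scores
  z_max = z.max(-1).expand_dims(-1).dup
  lgsexp = Numo::NMath.log(Numo::NMath.exp(z - z_max).sum(-1)).expand_dims(-1) + z_max
  log_softmax = z - lgsexp
  p Numo::NMath.exp(log_softmax).sum(-1)  # each row sums to ~1.0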