rumale 0.20.3 → 0.22.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,163 @@
+ # frozen_string_literal: true
+
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/regressor'
+
+ module Rumale
+ module Ensemble
+ # StackingRegressor is a class that implements a regressor with the stacking method.
+ #
+ # @example
+ # estimators = {
+ # las: Rumale::LinearModel::Lasso.new(reg_param: 1e-2, random_seed: 1),
+ # mlp: Rumale::NeuralNetwork::MLPRegressor.new(hidden_units: [256], random_seed: 1),
+ # rnd: Rumale::Ensemble::RandomForestRegressor.new(random_seed: 1)
+ # }
+ # meta_estimator = Rumale::LinearModel::Ridge.new(random_seed: 1)
+ # regressor = Rumale::Ensemble::StackingRegressor.new(
+ # estimators: estimators, meta_estimator: meta_estimator, random_seed: 1
+ # )
+ # regressor.fit(training_samples, training_values)
+ # results = regressor.predict(testing_samples)
+ #
+ # *Reference*
+ # - Zhou, Z-H., "Ensemble Methods - Foundations and Algorithms," CRC Press Taylor and Francis Group, Chapman and Hall/CRC, 2012.
+ class StackingRegressor
+ include Base::BaseEstimator
+ include Base::Regressor
+
+ # Return the base regressors.
+ # @return [Hash<Symbol,Regressor>]
+ attr_reader :estimators
+
+ # Return the meta regressor.
+ # @return [Regressor]
+ attr_reader :meta_estimator
+
+ # Create a new regressor with the stacking method.
+ #
+ # @param estimators [Hash<Symbol,Regressor>] The base regressors for extracting meta features.
+ # @param meta_estimator [Regressor/Nil] The meta regressor that predicts values.
+ # If nil is given, Ridge is used.
+ # @param n_splits [Integer] The number of folds for k-fold cross validation on meta feature extraction in the training phase.
+ # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset on cross validation.
+ # @param passthrough [Boolean] The flag indicating whether to concatenate the original features and meta features when training the meta regressor.
+ # @param random_seed [Integer/Nil] The seed value used to initialize the random generator on cross validation.
+ def initialize(estimators:, meta_estimator: nil, n_splits: 5, shuffle: true, passthrough: false, random_seed: nil)
+ check_params_type(Hash, estimators: estimators)
+ check_params_numeric(n_splits: n_splits)
+ check_params_boolean(shuffle: shuffle, passthrough: passthrough)
+ check_params_numeric_or_nil(random_seed: random_seed)
+ @estimators = estimators
+ @meta_estimator = meta_estimator || Rumale::LinearModel::Ridge.new
+ @output_size = nil
+ @params = {}
+ @params[:n_splits] = n_splits
+ @params[:shuffle] = shuffle
+ @params[:passthrough] = passthrough
+ @params[:random_seed] = random_seed || srand
+ end
+
+ # Fit the model with given training data.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target variables to be used for fitting the model.
+ # @return [StackingRegressor] The learned regressor itself.
+ def fit(x, y)
+ x = check_convert_sample_array(x)
+ y = check_convert_tvalue_array(y)
+ check_sample_tvalue_size(x, y)
+
+ n_samples, n_features = x.shape
+ n_outputs = y.ndim == 1 ? 1 : y.shape[1]
+
+ # training base regressors with all training data.
+ @estimators.each_key { |name| @estimators[name].fit(x, y) }
+
+ # detecting size of output for each base regressor.
+ @output_size = detect_output_size(n_features)
+
+ # extracting meta features with base regressors.
+ n_components = @output_size.values.inject(:+)
+ z = Numo::DFloat.zeros(n_samples, n_components)
+
+ kf = Rumale::ModelSelection::KFold.new(
+ n_splits: @params[:n_splits], shuffle: @params[:shuffle], random_seed: @params[:random_seed]
+ )
+
+ kf.split(x, y).each do |train_ids, valid_ids|
+ x_train = x[train_ids, true]
+ y_train = n_outputs == 1 ? y[train_ids] : y[train_ids, true]
+ x_valid = x[valid_ids, true]
+ f_start = 0
+ @estimators.each_key do |name|
+ est_fold = Marshal.load(Marshal.dump(@estimators[name]))
+ f_last = f_start + @output_size[name]
+ f_position = @output_size[name] == 1 ? f_start : f_start...f_last
+ z[valid_ids, f_position] = est_fold.fit(x_train, y_train).predict(x_valid)
+ f_start = f_last
+ end
+ end
+
+ # concatenating original features.
+ z = Numo::NArray.hstack([z, x]) if @params[:passthrough]
+
+ # training meta regressor.
+ @meta_estimator.fit(z, y)
+
+ self
+ end
+
+ # Predict values for samples.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+ # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) The predicted values per sample.
+ def predict(x)
+ x = check_convert_sample_array(x)
+ z = transform(x)
+ @meta_estimator.predict(z)
+ end
+
+ # Transform the given data with the learned model.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed with the learned model.
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The meta features for samples.
+ def transform(x)
+ x = check_convert_sample_array(x)
+ n_samples = x.shape[0]
+ n_components = @output_size.values.inject(:+)
+ z = Numo::DFloat.zeros(n_samples, n_components)
+ f_start = 0
+ @estimators.each_key do |name|
+ f_last = f_start + @output_size[name]
+ f_position = @output_size[name] == 1 ? f_start : f_start...f_last
+ z[true, f_position] = @estimators[name].predict(x)
+ f_start = f_last
+ end
+ z = Numo::NArray.hstack([z, x]) if @params[:passthrough]
+ z
+ end
+
+ # Fit the model with training data, and then transform them with the learned model.
+ #
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target variables to be used for fitting the model.
+ # @return [Numo::DFloat] (shape: [n_samples, n_components]) The meta features for training data.
+ def fit_transform(x, y)
+ x = check_convert_sample_array(x)
+ y = check_convert_tvalue_array(y)
+ fit(x, y).transform(x)
+ end
+
+ private
+
+ def detect_output_size(n_features)
+ x_dummy = Numo::DFloat.new(2, n_features).rand
+ @estimators.each_key.with_object({}) do |name, obj|
+ output_dummy = @estimators[name].predict(x_dummy)
+ obj[name] = output_dummy.ndim == 1 ? 1 : output_dummy.shape[1]
+ end
+ end
+ end
+ end
+ end
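
The new StackingRegressor above is driven as in its docstring example; a minimal end-to-end sketch (the toy data and the particular base estimators are placeholders, not part of the diff):

  require 'rumale'

  # toy regression data: 100 samples, 4 features
  x = Numo::DFloat.new(100, 4).rand
  y = x[true, 0] * 2.0 - x[true, 1] + Numo::DFloat.new(100).rand * 0.01

  estimators = {
    las: Rumale::LinearModel::Lasso.new(reg_param: 1e-2, random_seed: 1),
    rnd: Rumale::Ensemble::RandomForestRegressor.new(random_seed: 1)
  }
  meta_estimator = Rumale::LinearModel::Ridge.new(random_seed: 1)

  regressor = Rumale::Ensemble::StackingRegressor.new(
    estimators: estimators, meta_estimator: meta_estimator, random_seed: 1
  )
  regressor.fit(x, y)
  predicted = regressor.predict(x)         # meta regressor applied to the stacked meta features
  meta_features = regressor.transform(x)   # [100, sum of base estimator output sizes]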
@@ -67,7 +67,7 @@ module Rumale
  def transform(x)
  raise 'FeatureHasher#transform requires Mmh3 but that is not loaded.' unless enable_mmh3?

- x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
+ x = [x] unless x.is_a?(Array)

  n_samples = x.size

  z = Numo::DFloat.zeros(n_samples, n_features)
@@ -99,7 +99,7 @@ module Rumale
  # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
  def transform(x)
- x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
+ x = [x] unless x.is_a?(Array)
  n_samples = x.size
  n_features = @vocabulary.size
  z = Numo::DFloat.zeros(n_samples, n_features)
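
Both transform hunks above keep the same convenience: a single Hash is wrapped into a one-element Array before encoding. A small sketch using FeatureHasher, the class named in the raise message (the mmh3 requirement comes from that message; the n_features argument here is an assumption):

  require 'mmh3'
  require 'rumale'

  hasher = Rumale::FeatureExtraction::FeatureHasher.new(n_features: 64)
  batch  = hasher.transform([{ foo: 1, bar: 2 }, { foo: 3, baz: 1 }])  # shape [2, 64]
  single = hasher.transform({ foo: 1, bar: 2 })                        # a lone Hash is also accepted, shape [1, 64]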
@@ -11,9 +11,10 @@ module Rumale
  # with stochastic gradient descent (SGD) optimization.
  # For multiclass classification problem, it uses one-vs-the-rest strategy.
  #
- # Rumale::SVM provides kernel support vector classifier based on LIBSVM.
- # If you prefer execution speed, you should use Rumale::SVM::SVC.
- # https://github.com/yoshoku/rumale-svm
+ # @note
+ # Rumale::SVM provides kernel support vector classifier based on LIBSVM.
+ # If you prefer execution speed, you should use Rumale::SVM::SVC.
+ # https://github.com/yoshoku/rumale-svm
  #
  # @example
  # training_kernel_matrix = Rumale::PairwiseMetric::rbf_kernel(training_samples)
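
The @example continued in this docstring builds a precomputed RBF kernel; a hedged end-to-end sketch of that workflow (the KernelSVC class name and constructor arguments are assumptions about Rumale's kernel machine API; training_samples, training_labels, and testing_samples are placeholders as in the docstring):

  require 'rumale'

  training_kernel_matrix = Rumale::PairwiseMetric::rbf_kernel(training_samples)
  estimator = Rumale::KernelMachine::KernelSVC.new(reg_param: 1.0, random_seed: 1)
  estimator.fit(training_kernel_matrix, training_labels)

  # prediction uses the kernel between testing and training samples
  testing_kernel_matrix = Rumale::PairwiseMetric::rbf_kernel(testing_samples, training_samples)
  results = estimator.predict(testing_kernel_matrix)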
@@ -171,7 +171,7 @@ module Rumale
  @params[:fit_bias] = true
  @params[:reg_param] = 0.0
  @params[:l1_ratio] = 0.0
- @params[:max_iter] = 200
+ @params[:max_iter] = 1000
  @params[:batch_size] = 50
  @params[:tol] = 0.0001
  @params[:verbose] = false
@@ -10,7 +10,7 @@ module Rumale
  #
  # @example
  # estimator =
- # Rumale::LinearModel::ElasticNet.new(reg_param: 0.1, l1_ratio: 0.5, max_iter: 200, batch_size: 50, random_seed: 1)
+ # Rumale::LinearModel::ElasticNet.new(reg_param: 0.1, l1_ratio: 0.5, max_iter: 1000, batch_size: 50, random_seed: 1)
  # estimator.fit(training_samples, traininig_values)
  # results = estimator.predict(testing_samples)
  #
@@ -59,7 +59,7 @@ module Rumale
  # @param random_seed [Integer] The seed value using to initialize the random generator.
  def initialize(learning_rate: 0.01, decay: nil, momentum: 0.9,
  reg_param: 1.0, l1_ratio: 0.5, fit_bias: true, bias_scale: 1.0,
- max_iter: 200, batch_size: 50, tol: 1e-4,
+ max_iter: 1000, batch_size: 50, tol: 1e-4,
  n_jobs: nil, verbose: false, random_seed: nil)
  check_params_numeric(learning_rate: learning_rate, momentum: momentum,
  reg_param: reg_param, l1_ratio: l1_ratio, bias_scale: bias_scale,
@@ -81,7 +81,7 @@ module Rumale
  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
- # @param y [Numo::Int32] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
  # @return [ElasticNet] The learned regressor itself.
  def fit(x, y)
  x = check_convert_sample_array(x)
@@ -10,7 +10,7 @@ module Rumale
  #
  # @example
  # estimator =
- # Rumale::LinearModel::Lasso.new(reg_param: 0.1, max_iter: 500, batch_size: 20, random_seed: 1)
+ # Rumale::LinearModel::Lasso.new(reg_param: 0.1, max_iter: 1000, batch_size: 20, random_seed: 1)
  # estimator.fit(training_samples, traininig_values)
  # results = estimator.predict(testing_samples)
  #
@@ -55,7 +55,7 @@ module Rumale
  # @param random_seed [Integer] The seed value using to initialize the random generator.
  def initialize(learning_rate: 0.01, decay: nil, momentum: 0.9,
  reg_param: 1.0, fit_bias: true, bias_scale: 1.0,
- max_iter: 200, batch_size: 50, tol: 1e-4,
+ max_iter: 1000, batch_size: 50, tol: 1e-4,
  n_jobs: nil, verbose: false, random_seed: nil)
  check_params_numeric(learning_rate: learning_rate, momentum: momentum,
  reg_param: reg_param, bias_scale: bias_scale,
@@ -77,7 +77,7 @@ module Rumale
  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
- # @param y [Numo::Int32] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
  # @return [Lasso] The learned regressor itself.
  def fit(x, y)
  x = check_convert_sample_array(x)
@@ -6,11 +6,12 @@ require 'rumale/base/regressor'
  module Rumale
  module LinearModel
  # LinearRegression is a class that implements ordinary least square linear regression
- # with stochastic gradient descent (SGD) optimization or singular value decomposition (SVD).
+ # with stochastic gradient descent (SGD) optimization,
+ # singular value decomposition (SVD), or L-BFGS optimization.
  #
  # @example
  # estimator =
- # Rumale::LinearModel::LinearRegression.new(max_iter: 500, batch_size: 20, random_seed: 1)
+ # Rumale::LinearModel::LinearRegression.new(max_iter: 1000, batch_size: 20, random_seed: 1)
  # estimator.fit(training_samples, traininig_values)
  # results = estimator.predict(testing_samples)
  #
@@ -41,34 +42,35 @@ module Rumale
  #
  # @param learning_rate [Float] The initial value of learning rate.
  # The learning rate decreases as the iteration proceeds according to the equation: learning_rate / (1 + decay * t).
- # If solver = 'svd', this parameter is ignored.
+ # If solver is not 'sgd', this parameter is ignored.
  # @param decay [Float] The smoothing parameter for decreasing learning rate as the iteration proceeds.
  # If nil is given, the decay sets to 'learning_rate'.
- # If solver = 'svd', this parameter is ignored.
+ # If solver is not 'sgd', this parameter is ignored.
  # @param momentum [Float] The momentum factor.
- # If solver = 'svd', this parameter is ignored.
+ # If solver is not 'sgd', this parameter is ignored.
  # @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
  # @param bias_scale [Float] The scale of the bias term.
  # @param max_iter [Integer] The maximum number of epochs that indicates
  # how many times the whole data is given to the training process.
- # If solver = 'svd', this parameter is ignored.
+ # If solver is 'svd', this parameter is ignored.
  # @param batch_size [Integer] The size of the mini batches.
- # If solver = 'svd', this parameter is ignored.
+ # If solver is not 'sgd', this parameter is ignored.
  # @param tol [Float] The tolerance of loss for terminating optimization.
- # If solver = 'svd', this parameter is ignored.
- # @param solver [String] The algorithm to calculate weights. ('auto', 'sgd' or 'svd').
+ # If solver is 'svd', this parameter is ignored.
+ # @param solver [String] The algorithm to calculate weights. ('auto', 'sgd', 'svd' or 'lbfgs').
  # 'auto' chooses the 'svd' solver if Numo::Linalg is loaded. Otherwise, it chooses the 'sgd' solver.
  # 'sgd' uses the stochastic gradient descent optimization.
  # 'svd' performs singular value decomposition of samples.
+ # 'lbfgs' uses the L-BFGS method for optimization.
  # @param n_jobs [Integer] The number of jobs for running the fit method in parallel.
  # If nil is given, the method does not execute in parallel.
  # If zero or less is given, it becomes equal to the number of processors.
- # This parameter is ignored if the Parallel gem is not loaded.
+ # This parameter is ignored if the Parallel gem is not loaded or solver is not 'sgd'.
  # @param verbose [Boolean] The flag indicating whether to output loss during iteration.
- # If solver = 'svd', this parameter is ignored.
+ # If solver is 'svd', this parameter is ignored.
  # @param random_seed [Integer] The seed value using to initialize the random generator.
  def initialize(learning_rate: 0.01, decay: nil, momentum: 0.9,
- fit_bias: true, bias_scale: 1.0, max_iter: 200, batch_size: 50, tol: 1e-4,
+ fit_bias: true, bias_scale: 1.0, max_iter: 1000, batch_size: 50, tol: 1e-4,
  solver: 'auto',
  n_jobs: nil, verbose: false, random_seed: nil)
  check_params_numeric(learning_rate: learning_rate, momentum: momentum,
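
Putting the solver documentation above together, constructing the regressor with each solver looks roughly like this (a sketch; training_samples and training_values are placeholders):

  require 'rumale'

  # L-BFGS optimization (new in this release); max_iter and tol still apply
  lbfgs_reg = Rumale::LinearModel::LinearRegression.new(solver: 'lbfgs', max_iter: 1000, tol: 1e-4)

  # closed-form solution via SVD; requires Numo::Linalg to be loaded
  svd_reg = Rumale::LinearModel::LinearRegression.new(solver: 'svd')

  # mini-batch SGD; the only solver that honors learning_rate, decay, momentum, batch_size, and n_jobs
  sgd_reg = Rumale::LinearModel::LinearRegression.new(
    solver: 'sgd', learning_rate: 0.01, batch_size: 50, random_seed: 1
  )

  lbfgs_reg.fit(training_samples, training_values)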
@@ -80,9 +82,9 @@ module Rumale
  super()
  @params.merge!(method(:initialize).parameters.map { |_t, arg| [arg, binding.local_variable_get(arg)] }.to_h)
  @params[:solver] = if solver == 'auto'
- load_linalg? ? 'svd' : 'sgd'
+ enable_linalg?(warning: false) ? 'svd' : 'sgd'
  else
- solver != 'svd' ? 'sgd' : 'svd'
+ solver.match?(/^svd$|^sgd$|^lbfgs$/) ? solver : 'sgd'
  end
  @params[:decay] ||= @params[:learning_rate]
  @params[:random_seed] ||= srand
@@ -95,15 +97,17 @@ module Rumale
  # Fit the model with given training data.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
- # @param y [Numo::Int32] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+ # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
  # @return [LinearRegression] The learned regressor itself.
  def fit(x, y)
  x = check_convert_sample_array(x)
  y = check_convert_tvalue_array(y)
  check_sample_tvalue_size(x, y)

- if @params[:solver] == 'svd' && enable_linalg?
+ if @params[:solver] == 'svd' && enable_linalg?(warning: false)
  fit_svd(x, y)
+ elsif @params[:solver] == 'lbfgs'
+ fit_lbfgs(x, y)
  else
  fit_sgd(x, y)
  end
@@ -124,24 +128,46 @@ module Rumale

  def fit_svd(x, y)
  x = expand_feature(x) if fit_bias?
-
  w = Numo::Linalg.pinv(x, driver: 'svd').dot(y)
+ @weight_vec, @bias_term = single_target?(y) ? split_weight(w) : split_weight_mult(w)
+ end

- is_single_target_vals = y.shape[1].nil?
- if @params[:fit_bias]
- @weight_vec = is_single_target_vals ? w[0...-1].dup : w[0...-1, true].dup
- @bias_term = is_single_target_vals ? w[-1] : w[-1, true].dup
- else
- @weight_vec = w.dup
- @bias_term = is_single_target_vals ? 0 : Numo::DFloat.zeros(y.shape[1])
+ def fit_lbfgs(x, y)
+ fnc = proc do |w, x, y| # rubocop:disable Lint/ShadowingOuterLocalVariable
+ n_samples, n_features = x.shape
+ w = w.reshape(y.shape[1], n_features) unless y.shape[1].nil?
+ z = x.dot(w.transpose)
+ d = z - y
+ loss = (d**2).sum.fdiv(n_samples)
+ gradient = 2.fdiv(n_samples) * d.transpose.dot(x)
+ [loss, gradient.flatten.dup]
  end
- end

- def fit_sgd(x, y)
- n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+ x = expand_feature(x) if fit_bias?
+
  n_features = x.shape[1]
+ n_outputs = single_target?(y) ? 1 : y.shape[1]
+
+ res = Lbfgsb.minimize(
+ fnc: fnc, jcb: true, x_init: init_weight(n_features, n_outputs), args: [x, y],
+ maxiter: @params[:max_iter], factr: @params[:tol] / Lbfgsb::DBL_EPSILON,
+ verbose: @params[:verbose] ? 1 : -1
+ )
+
+ @weight_vec, @bias_term =
+ if single_target?(y)
+ split_weight(res[:x])
+ else
+ split_weight_mult(res[:x].reshape(n_outputs, n_features).transpose)
+ end
+ end

- if n_outputs > 1
+ def fit_sgd(x, y)
+ if single_target?(y)
+ @weight_vec, @bias_term = partial_fit(x, y)
+ else
+ n_outputs = y.shape[1]
+ n_features = x.shape[1]
  @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
  @bias_term = Numo::DFloat.zeros(n_outputs)
  if enable_parallel?
@@ -150,20 +176,23 @@ module Rumale
  else
  n_outputs.times { |n| @weight_vec[n, true], @bias_term[n] = partial_fit(x, y[true, n]) }
  end
- else
- @weight_vec, @bias_term = partial_fit(x, y)
  end
  end

- def fit_bias?
- @params[:fit_bias] == true
+ def single_target?(y)
+ y.ndim == 1
  end

- def load_linalg?
- return false if defined?(Numo::Linalg).nil?
- return false if Numo::Linalg::VERSION < '0.1.4'
+ def init_weight(n_features, n_outputs)
+ Rumale::Utils.rand_normal([n_outputs, n_features], @rng.dup).flatten.dup
+ end

- true
+ def split_weight_mult(w)
+ if fit_bias?
+ [w[0...-1, true].dup, w[-1, true].dup]
+ else
+ [w.dup, Numo::DFloat.zeros(w.shape[1])]
+ end
  end
  end
  end
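
The fit_lbfgs method above hands Lbfgsb.minimize a proc that returns [loss, gradient] and reads the solution back from res[:x]. A self-contained toy run of the same pattern for a single-target least-squares problem (no bias term; only keyword arguments that appear in the diff are used):

  require 'lbfgsb'
  require 'numo/narray'

  x = Numo::DFloat.new(100, 3).rand
  true_w = Numo::DFloat[1.5, -2.0, 0.5]
  y = x.dot(true_w)

  fnc = proc do |w, data, target|
    n_samples = data.shape[0]
    d = data.dot(w) - target
    loss = (d**2).sum.fdiv(n_samples)        # mean squared error
    grad = 2.fdiv(n_samples) * d.dot(data)   # gradient of the MSE w.r.t. w
    [loss, grad]
  end

  res = Lbfgsb.minimize(
    fnc: fnc, jcb: true, x_init: Numo::DFloat.zeros(3), args: [x, y],
    maxiter: 1000, factr: 1e-4 / Lbfgsb::DBL_EPSILON, verbose: -1
  )
  p res[:x]  # close to [1.5, -2.0, 0.5]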
@@ -1,21 +1,24 @@
  # frozen_string_literal: true

- require 'rumale/linear_model/base_sgd'
+ require 'lbfgsb'
  require 'rumale/base/classifier'
+ require 'rumale/linear_model/base_sgd'
+ require 'rumale/preprocessing/label_binarizer'

  module Rumale
  module LinearModel
- # LogisticRegression is a class that implements Logistic Regression
- # with stochastic gradient descent optimization.
- # For multiclass classification problem, it uses one-vs-the-rest strategy.
+ # LogisticRegression is a class that implements Logistic Regression.
+ # In multiclass classification problem, it uses one-vs-the-rest strategy for the sgd solver
+ # and multinomial logistic regression for the lbfgs solver.
  #
- # Rumale::SVM provides Logistic Regression based on LIBLINEAR.
- # If you prefer execution speed, you should use Rumale::SVM::LogisticRegression.
- # https://github.com/yoshoku/rumale-svm
+ # @note
+ # Rumale::SVM provides Logistic Regression based on LIBLINEAR.
+ # If you prefer execution speed, you should use Rumale::SVM::LogisticRegression.
+ # https://github.com/yoshoku/rumale-svm
  #
  # @example
  # estimator =
- # Rumale::LinearModel::LogisticRegression.new(reg_param: 1.0, max_iter: 200, batch_size: 50, random_seed: 1)
+ # Rumale::LinearModel::LogisticRegression.new(reg_param: 1.0, random_seed: 1)
  # estimator.fit(training_samples, traininig_labels)
  # results = estimator.predict(testing_samples)
  #
@@ -42,19 +45,24 @@ module Rumale
  # @return [Random]
  attr_reader :rng

- # Create a new classifier with Logisitc Regression by the SGD optimization.
+ # Create a new classifier with Logistic Regression.
  #
  # @param learning_rate [Float] The initial value of learning rate.
  # The learning rate decreases as the iteration proceeds according to the equation: learning_rate / (1 + decay * t).
+ # If solver = 'lbfgs', this parameter is ignored.
  # @param decay [Float] The smoothing parameter for decreasing learning rate as the iteration proceeds.
  # If nil is given, the decay sets to 'reg_param * learning_rate'.
+ # If solver = 'lbfgs', this parameter is ignored.
  # @param momentum [Float] The momentum factor.
+ # If solver = 'lbfgs', this parameter is ignored.
  # @param penalty [String] The regularization type to be used ('l1', 'l2', and 'elasticnet').
+ # If solver = 'lbfgs', only 'l2' can be selected for this parameter.
  # @param l1_ratio [Float] The elastic-net type regularization mixing parameter.
  # If penalty set to 'l2' or 'l1', this parameter is ignored.
  # If l1_ratio = 1, the regularization is similar to Lasso.
  # If l1_ratio = 0, the regularization is similar to Ridge.
  # If 0 < l1_ratio < 1, the regularization is a combination of L1 and L2.
+ # If solver = 'lbfgs', this parameter is ignored.
  # @param reg_param [Float] The regularization parameter.
  # @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
  # @param bias_scale [Float] The scale of the bias term.
@@ -62,28 +70,38 @@ module Rumale
  # @param max_iter [Integer] The maximum number of epochs that indicates
  # how many times the whole data is given to the training process.
  # @param batch_size [Integer] The size of the mini batches.
+ # If solver = 'lbfgs', this parameter is ignored.
  # @param tol [Float] The tolerance of loss for terminating optimization.
+ # If solver = 'lbfgs', this value is given as tol / Lbfgsb::DBL_EPSILON to the factr argument of Lbfgsb.minimize method.
+ # @param solver [String] The algorithm for optimization. ('lbfgs' or 'sgd').
+ # 'lbfgs' uses the L-BFGS method with the lbfgsb.rb gem.
+ # 'sgd' uses the stochastic gradient descent optimization.
  # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
  # If nil is given, the methods do not execute in parallel.
  # If zero or less is given, it becomes equal to the number of processors.
- # This parameter is ignored if the Parallel gem is not loaded.
+ # This parameter is ignored if the Parallel gem is not loaded or the solver is 'lbfgs'.
  # @param verbose [Boolean] The flag indicating whether to output loss during iteration.
+ # If solver = 'lbfgs' and true is given, 'iterate.dat' file is generated by lbfgsb.rb.
  # @param random_seed [Integer] The seed value using to initialize the random generator.
  def initialize(learning_rate: 0.01, decay: nil, momentum: 0.9,
  penalty: 'l2', reg_param: 1.0, l1_ratio: 0.5,
  fit_bias: true, bias_scale: 1.0,
- max_iter: 200, batch_size: 50, tol: 1e-4,
+ max_iter: 1000, batch_size: 50, tol: 1e-4,
+ solver: 'lbfgs',
  n_jobs: nil, verbose: false, random_seed: nil)
  check_params_numeric(learning_rate: learning_rate, momentum: momentum,
  reg_param: reg_param, l1_ratio: l1_ratio, bias_scale: bias_scale,
  max_iter: max_iter, batch_size: batch_size, tol: tol)
  check_params_boolean(fit_bias: fit_bias, verbose: verbose)
- check_params_string(penalty: penalty)
+ check_params_string(solver: solver, penalty: penalty)
  check_params_numeric_or_nil(decay: decay, n_jobs: n_jobs, random_seed: random_seed)
  check_params_positive(learning_rate: learning_rate, reg_param: reg_param,
  bias_scale: bias_scale, max_iter: max_iter, batch_size: batch_size)
+ raise ArgumentError, "The 'lbfgs' solver supports only 'l2' penalties." if solver == 'lbfgs' && penalty != 'l2'
+
  super()
  @params.merge!(method(:initialize).parameters.map { |_t, arg| [arg, binding.local_variable_get(arg)] }.to_h)
+ @params[:solver] = solver == 'sgd' ? 'sgd' : 'lbfgs'
  @params[:decay] ||= @params[:reg_param] * @params[:learning_rate]
  @params[:random_seed] ||= srand
  @rng = Random.new(@params[:random_seed])
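
With the solver parameter introduced above, the two optimization paths can be selected explicitly (a sketch; training_samples, training_labels, and testing_samples are placeholders):

  require 'rumale'

  # default: L-BFGS, multinomial formulation for multiclass problems, 'l2' penalty only
  lbfgs_clf = Rumale::LinearModel::LogisticRegression.new(reg_param: 1.0, random_seed: 1)

  # explicit SGD: one-vs-the-rest for multiclass, supports 'l1'/'l2'/'elasticnet' penalties
  sgd_clf = Rumale::LinearModel::LogisticRegression.new(
    solver: 'sgd', penalty: 'elasticnet', l1_ratio: 0.5, reg_param: 1.0,
    max_iter: 1000, batch_size: 50, random_seed: 1
  )

  lbfgs_clf.fit(training_samples, training_labels)
  results = lbfgs_clf.predict(testing_samples)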
@@ -105,30 +123,10 @@ module Rumale
  check_sample_label_size(x, y)

  @classes = Numo::Int32[*y.to_a.uniq.sort]
-
- if multiclass_problem?
- n_classes = @classes.size
- n_features = x.shape[1]
- @weight_vec = Numo::DFloat.zeros(n_classes, n_features)
- @bias_term = Numo::DFloat.zeros(n_classes)
- if enable_parallel?
- # :nocov:
- models = parallel_map(n_classes) do |n|
- bin_y = Numo::Int32.cast(y.eq(@classes[n])) * 2 - 1
- partial_fit(x, bin_y)
- end
- # :nocov:
- n_classes.times { |n| @weight_vec[n, true], @bias_term[n] = models[n] }
- else
- n_classes.times do |n|
- bin_y = Numo::Int32.cast(y.eq(@classes[n])) * 2 - 1
- @weight_vec[n, true], @bias_term[n] = partial_fit(x, bin_y)
- end
- end
+ if @params[:solver] == 'sgd'
+ fit_sgd(x, y)
  else
- negative_label = @classes[0]
- bin_y = Numo::Int32.cast(y.ne(negative_label)) * 2 - 1
- @weight_vec, @bias_term = partial_fit(x, bin_y)
+ fit_lbfgs(x, y)
  end

  self
@@ -182,6 +180,96 @@ module Rumale
  def multiclass_problem?
  @classes.size > 2
  end
+
+ def fit_lbfgs(base_x, base_y) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+ if multiclass_problem?
+ fnc = proc do |w, x, y, a|
+ n_features = x.shape[1]
+ n_classes = y.shape[1]
+ z = x.dot(w.reshape(n_classes, n_features).transpose)
+ # logsumexp and softmax
+ z_max = z.max(-1).expand_dims(-1).dup
+ z_max[~z_max.isfinite] = 0.0
+ lgsexp = Numo::NMath.log(Numo::NMath.exp(z - z_max).sum(-1)).expand_dims(-1) + z_max
+ t = z - lgsexp
+ sftmax = Numo::NMath.exp(t)
+ # loss and gradient
+ loss = -(y * t).sum + 0.5 * a * w.dot(w)
+ grad = (sftmax - y).transpose.dot(x).flatten.dup + a * w
+ [loss, grad]
+ end
+
+ base_x = expand_feature(base_x) if fit_bias?
+ encoder = Rumale::Preprocessing::LabelBinarizer.new
+ onehot_y = encoder.fit_transform(base_y)
+ n_classes = @classes.size
+ n_features = base_x.shape[1]
+ w_init = Numo::DFloat.zeros(n_classes * n_features)
+
+ verbose = @params[:verbose] ? 1 : -1
+ res = Lbfgsb.minimize(
+ fnc: fnc, jcb: true, x_init: w_init, args: [base_x, onehot_y, @params[:reg_param]],
+ maxiter: @params[:max_iter], factr: @params[:tol] / Lbfgsb::DBL_EPSILON, verbose: verbose
+ )
+
+ if fit_bias?
+ weight = res[:x].reshape(n_classes, n_features)
+ @weight_vec = weight[true, 0...-1].dup
+ @bias_term = weight[true, -1].dup
+ else
+ @weight_vec = res[:x].reshape(n_classes, n_features)
+ @bias_term = Numo::DFloat.zeros(n_classes)
+ end
+ else
+ fnc = proc do |w, x, y, a|
+ z = 1 + Numo::NMath.exp(-y * x.dot(w))
+ loss = Numo::NMath.log(z).sum + 0.5 * a * w.dot(w)
+ grad = (y / z - y).dot(x) + a * w
+ [loss, grad]
+ end
+
+ base_x = expand_feature(base_x) if fit_bias?
+ negative_label = @classes[0]
+ bin_y = Numo::Int32.cast(base_y.ne(negative_label)) * 2 - 1
+ n_features = base_x.shape[1]
+ w_init = Numo::DFloat.zeros(n_features)
+
+ verbose = @params[:verbose] ? 1 : -1
+ res = Lbfgsb.minimize(
+ fnc: fnc, jcb: true, x_init: w_init, args: [base_x, bin_y, @params[:reg_param]],
+ maxiter: @params[:max_iter], factr: @params[:tol] / Lbfgsb::DBL_EPSILON, verbose: verbose
+ )
+
+ @weight_vec, @bias_term = split_weight(res[:x])
+ end
+ end
+
+ def fit_sgd(x, y)
+ if multiclass_problem?
+ n_classes = @classes.size
+ n_features = x.shape[1]
+ @weight_vec = Numo::DFloat.zeros(n_classes, n_features)
+ @bias_term = Numo::DFloat.zeros(n_classes)
+ if enable_parallel?
+ # :nocov:
+ models = parallel_map(n_classes) do |n|
+ bin_y = Numo::Int32.cast(y.eq(@classes[n])) * 2 - 1
+ partial_fit(x, bin_y)
+ end
+ # :nocov:
+ n_classes.times { |n| @weight_vec[n, true], @bias_term[n] = models[n] }
+ else
+ n_classes.times do |n|
+ bin_y = Numo::Int32.cast(y.eq(@classes[n])) * 2 - 1
+ @weight_vec[n, true], @bias_term[n] = partial_fit(x, bin_y)
+ end
+ end
+ else
+ negative_label = @classes[0]
+ bin_y = Numo::Int32.cast(y.ne(negative_label)) * 2 - 1
+ @weight_vec, @bias_term = partial_fit(x, bin_y)
+ end
+ end
  end
  end
  end
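
The multiclass branch of fit_lbfgs above minimizes a softmax cross-entropy with an L2 term, built from one-hot labels (LabelBinarizer) and a max-shifted log-sum-exp. A tiny illustration of those two building blocks with made-up numbers:

  require 'rumale'
  require 'numo/narray'

  labels = Numo::Int32[0, 2, 1, 2]
  onehot = Rumale::Preprocessing::LabelBinarizer.new.fit_transform(labels)
  # => 4x3 matrix with a single 1 per row

  z = Numo::DFloat[[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]]  # arbitrary class scores
  z_max = z.max(-1).expand_dims(-1).dup
  lgsexp = Numo::NMath.log(Numo::NMath.exp(z - z_max).sum(-1)).expand_dims(-1) + z_max
  log_softmax = z - lgsexp
  p Numo::NMath.exp(log_softmax).sum(-1)  # each row sums to ~1.0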