svmkit 0.7.2 → 0.7.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f20192e678f6f066eb1d40c066f0e9a0efefd3a0
4
- data.tar.gz: 1be802cdbbfb2ee7a641fb78d1409c2ee49b8450
3
+ metadata.gz: ca1916101dd6c77c5be1a157c2bfa8dafe9c543e
4
+ data.tar.gz: c4751b21fd3d0667bb7d378f8b524fc2f70069d9
5
5
  SHA512:
6
- metadata.gz: 43471c5a4ef290781d5d2270732313fbcffba60a4351805d6c7bb8abec7537bcd8ac50260600fbfb1ff52c947c45c3f6f19b9ccecd47e6015e6ac45da5c855a6
7
- data.tar.gz: 908f675396a2da835b82da8cf117a4a17d6d90d489618cf110e993de6c03d6ec8e6651115df333033314b0f54c1e931f68da8ff541a1b5e22886741f48496259
6
+ metadata.gz: db878c8b28e88649fed654b292358c11ec91369cd52ec03e01d06d053fbeb90ebff87248be628c8ab081fd820c1460bb3783448242531ef5f30b4b06337af87c
7
+ data.tar.gz: bfbfc580897a4a3161afa865cd14ac15a0a322cf80ae728f7b70c8d45cb41ff4566b1f1e748e6c1eb9412a4c401b737a569bf6b9af12a113a4ca4a6d75f8b9b8
data/HISTORY.md CHANGED
@@ -1,6 +1,10 @@
1
+ # 0.7.3
2
+ - Add class for grid search performing hyperparameter optimization.
3
+ - Add argument validations to Pipeline.
4
+
1
5
  # 0.7.2
2
6
  - Add class for Pipeline that constructs chain of transformers and estimators.
3
- - Fix some typos on document.
7
+ - Fix some typos on document ([#1](https://github.com/yoshoku/SVMKit/pull/1)).
4
8
 
5
9
  # 0.7.1
6
10
  - Fix to use CSV class in parsing libsvm format file.
@@ -55,6 +55,7 @@ require 'svmkit/preprocessing/one_hot_encoder'
55
55
  require 'svmkit/model_selection/k_fold'
56
56
  require 'svmkit/model_selection/stratified_k_fold'
57
57
  require 'svmkit/model_selection/cross_validation'
58
+ require 'svmkit/model_selection/grid_search_cv'
58
59
  require 'svmkit/evaluation_measure/accuracy'
59
60
  require 'svmkit/evaluation_measure/precision'
60
61
  require 'svmkit/evaluation_measure/recall'
@@ -109,7 +109,7 @@ module SVMKit
109
109
  tree = Tree::DecisionTreeClassifier.new(
110
110
  criterion: @params[:criterion], max_depth: @params[:max_depth],
111
111
  max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
112
- max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values::int_max)
112
+ max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
113
113
  )
114
114
  tree.fit(x[ids, true], y[ids])
115
115
  # Calculate estimator error.
@@ -111,7 +111,7 @@ module SVMKit
111
111
  tree = Tree::DecisionTreeRegressor.new(
112
112
  criterion: @params[:criterion], max_depth: @params[:max_depth],
113
113
  max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
114
- max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values::int_max)
114
+ max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
115
115
  )
116
116
  tree.fit(x[ids, true], y[ids])
117
117
  p = tree.predict(x)
@@ -97,7 +97,7 @@ module SVMKit
97
97
  tree = Tree::DecisionTreeClassifier.new(
98
98
  criterion: @params[:criterion], max_depth: @params[:max_depth],
99
99
  max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
100
- max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values::int_max)
100
+ max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
101
101
  )
102
102
  bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
103
103
  tree.fit(x[bootstrap_ids, true], y[bootstrap_ids])
@@ -91,7 +91,7 @@ module SVMKit
91
91
  tree = Tree::DecisionTreeRegressor.new(
92
92
  criterion: @params[:criterion], max_depth: @params[:max_depth],
93
93
  max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
94
- max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values::int_max)
94
+ max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
95
95
  )
96
96
  bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
97
97
  tree.fit(x[bootstrap_ids, true], single_target ? y[bootstrap_ids] : y[bootstrap_ids, true])
@@ -0,0 +1,247 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'svmkit/validation'
4
+ require 'svmkit/base/base_estimator'
5
+ require 'svmkit/base/evaluator'
6
+ require 'svmkit/base/splitter'
7
+ require 'svmkit/pipeline/pipeline'
8
+
9
+ module SVMKit
10
+ module ModelSelection
11
+ # GridSearchCV is a class that performs hyperparameter optimization with grid search method.
12
+ #
13
+ # @example
14
+ # rfc = SVMKit::Ensemble::RandomForestClassifier.new(random_seed: 1)
15
+ # pg = { n_estimators: [5, 10], max_depth: [3, 5], max_leaf_nodes: [15, 31] }
16
+ # kf = SVMKit::ModelSelection::StratifiedKFold.new(n_splits: 5)
17
+ # gs = SVMKit::ModelSelection::GridSearchCV.new(estimator: rfc, param_grid: pg, splitter: kf)
18
+ # gs.fit(samples, labels)
19
+ # p gs.cv_results
20
+ # p gs.best_params
21
+ #
22
+ # @example
23
+ # rbf = SVMKit::KernelApproximation::RBF.new(random_seed: 1)
24
+ # svc = SVMKit::LinearModel::SVC.new(random_seed: 1)
25
+ # pipe = SVMKit::Pipeline::Pipeline.new(steps: { rbf: rbf, svc: svc })
26
+ # pg = { rbf__gamma: [32.0, 1.0], rbf__n_components: [4, 128], svc__reg_param: [16.0, 0.1] }
27
+ # kf = SVMKit::ModelSelection::StratifiedKFold.new(n_splits: 5)
28
+ # gs = SVMKit::ModelSelection::GridSearchCV.new(estimator: pipe, param_grid: pg, splitter: kf)
29
+ # gs.fit(samples, labels)
30
+ # p gs.cv_results
31
+ # p gs.best_params
32
+ #
33
+ class GridSearchCV
34
+ include Base::BaseEstimator
35
+ include Validation
36
+
37
+ # Return the result of cross validation for each parameter.
38
+ # @return [Hash]
39
+ attr_reader :cv_results
40
+
41
+ # Return the score of the estimator learned with the best parameter.
42
+ # @return [Float]
43
+ attr_reader :best_score
44
+
45
+ # Return the best parameter set.
46
+ # @return [Hash]
47
+ attr_reader :best_params
48
+
49
+ # Return the index of the best parameter.
50
+ # @return [Integer]
51
+ attr_reader :best_index
52
+
53
+ # Return the estimator learned with the best parameter.
54
+ # @return [Estimator]
55
+ attr_reader :best_estimator
56
+
57
+ # Create a new grid search method.
58
+ #
59
+ # @param estimator [Classifier/Regressor] The estimator to be searched for optimal parameters with grid search method.
60
+ # @param param_grid [Array<Hash>] The parameter sets, represented as an array of hashes that
61
+ # consist of parameter names as keys and arrays of parameter values as values.
62
+ # @param splitter [Splitter] The splitter that divides the dataset into training and testing datasets for cross validation.
63
+ # @param evaluator [Evaluator] The evaluator that calculates score of estimator results on cross validation.
64
+ # If nil is given, the score method of the estimator is used for evaluation.
65
+ # @param greater_is_better [Boolean] The flag that indicates whether a larger evaluation score
66
+ # means a better estimator.
67
+ def initialize(estimator: nil, param_grid: nil, splitter: nil, evaluator: nil, greater_is_better: true)
68
+ check_params_type(SVMKit::Base::BaseEstimator, estimator: estimator)
69
+ check_params_type(SVMKit::Base::Splitter, splitter: splitter)
70
+ check_params_type_or_nil(SVMKit::Base::Evaluator, evaluator: evaluator)
71
+ check_params_boolean(greater_is_better: greater_is_better)
72
+ @params = {}
73
+ @params[:param_grid] = valid_param_grid(param_grid)
74
+ @params[:estimator] = Marshal.load(Marshal.dump(estimator))
75
+ @params[:splitter] = Marshal.load(Marshal.dump(splitter))
76
+ @params[:evaluator] = Marshal.load(Marshal.dump(evaluator))
77
+ @params[:greater_is_better] = greater_is_better
78
+ @cv_results = nil
79
+ @best_score = nil
80
+ @best_params = nil
81
+ @best_index = nil
82
+ @best_estimator = nil
83
+ end
84
+
85
+ # Fit the model with given training data and all sets of parameters.
86
+ #
87
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
88
+ # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) The target values or labels to be used for fitting the model.
89
+ # @return [GridSearchCV] The learned estimator with grid search.
90
+ def fit(x, y)
91
+ check_sample_array(x)
92
+
93
+ init_attrs
94
+
95
+ param_combinations.each do |prm_set|
96
+ prm_set.each do |prms|
97
+ report = perform_cross_validation(x, y, prms)
98
+ store_cv_result(prms, report)
99
+ end
100
+ end
101
+
102
+ find_best_params
103
+
104
+ @best_estimator = configurated_estimator(@best_params)
105
+ @best_estimator.fit(x, y)
106
+ self
107
+ end
108
+
109
+ # Call the decision_function method of learned estimator with the best parameter.
110
+ #
111
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
112
+ # @return [Numo::DFloat] (shape: [n_samples]) Confidence score per sample.
113
+ def decision_function(x)
114
+ check_sample_array(x)
115
+ @best_estimator.decision_function(x)
116
+ end
117
+
118
+ # Call the predict method of learned estimator with the best parameter.
119
+ #
120
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to obtain prediction result.
121
+ # @return [Numo::NArray] Predicted results.
122
+ def predict(x)
123
+ check_sample_array(x)
124
+ @best_estimator.predict(x)
125
+ end
126
+
127
+ # Call the predict_log_proba method of learned estimator with the best parameter.
128
+ #
129
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the log-probabilities.
130
+ # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted log-probability of each class per sample.
131
+ def predict_log_proba(x)
132
+ check_sample_array(x)
133
+ @best_estimator.predict_log_proba(x)
134
+ end
135
+
136
+ # Call the predict_proba method of learned estimator with the best parameter.
137
+ #
138
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
139
+ # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
140
+ def predict_proba(x)
141
+ check_sample_array(x)
142
+ @best_estimator.predict_proba(x)
143
+ end
144
+
145
+ # Call the score method of learned estimator with the best parameter.
146
+ #
147
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
148
+ # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) True target values or labels for testing data.
149
+ # @return [Float] The score of estimator.
150
+ def score(x, y)
151
+ check_sample_array(x)
152
+ @best_estimator.score(x, y)
153
+ end
154
+
155
+ # Dump marshal data.
156
+ # @return [Hash] The marshal data about GridSearchCV.
157
+ def marshal_dump
158
+ { params: @params,
159
+ cv_results: @cv_results,
160
+ best_score: @best_score,
161
+ best_params: @best_params,
162
+ best_index: @best_index,
163
+ best_estimator: @best_estimator }
164
+ end
165
+
166
+ # Load marshal data.
167
+ # @return [nil]
168
+ def marshal_load(obj)
169
+ @params = obj[:params]
170
+ @cv_results = obj[:cv_results]
171
+ @best_score = obj[:best_score]
172
+ @best_params = obj[:best_params]
173
+ @best_index = obj[:best_index]
174
+ @best_estimator = obj[:best_estimator]
175
+ nil
176
+ end
177
+
178
+ private
179
+
180
+ def valid_param_grid(grid)
181
+ raise TypeError, 'Expect class of param_grid to be Hash or Array' unless grid.is_a?(Hash) || grid.is_a?(Array)
182
+ grid = [grid] if grid.is_a?(Hash)
183
+ grid.each do |h|
184
+ raise TypeError, 'Expect class of elements in param_grid to be Hash' unless h.is_a?(Hash)
185
+ raise TypeError, 'Expect class of parameter values in param_grid to be Array' unless h.values.all? { |v| v.is_a?(Array) }
186
+ end
187
+ grid
188
+ end
189
+
190
+ def param_combinations
191
+ @param_combinations ||= @params[:param_grid].map do |prm|
192
+ x = Hash[prm.sort].map { |k, v| [k].product(v) }
193
+ x[0].product(*x[1...x.size]).map { |v| Hash[v] }
194
+ end
195
+ end
196
+
197
+ def perform_cross_validation(x, y, prms)
198
+ est = configurated_estimator(prms)
199
+ cv = CrossValidation.new(estimator: est, splitter: @params[:splitter],
200
+ evaluator: @params[:evaluator], return_train_score: true)
201
+ cv.perform(x, y)
202
+ end
203
+
204
+ def configurated_estimator(prms)
205
+ estimator = Marshal.load(Marshal.dump(@params[:estimator]))
206
+ if @params[:estimator].is_a?(SVMKit::Pipeline::Pipeline)
207
+ prms.each do |k, v|
208
+ est_name, prm_name = k.to_s.split('__')
209
+ estimator.steps[est_name.to_sym].params[prm_name.to_sym] = v
210
+ end
211
+ else
212
+ prms.each { |k, v| estimator.params[k] = v }
213
+ end
214
+ estimator
215
+ end
216
+
217
+ def init_attrs
218
+ @cv_results = %i[mean_test_score std_test_score
219
+ mean_train_score std_train_score
220
+ mean_fit_time std_fit_time params].map { |v| [v, []] }.to_h
221
+ @best_score = nil
222
+ @best_params = nil
223
+ @best_index = nil
224
+ @best_estimator = nil
225
+ end
226
+
227
+ def store_cv_result(prms, report)
228
+ test_scores = Numo::DFloat[*report[:test_score]]
229
+ train_scores = Numo::DFloat[*report[:train_score]]
230
+ fit_times = Numo::DFloat[*report[:fit_time]]
231
+ @cv_results[:mean_test_score].push(test_scores.mean)
232
+ @cv_results[:std_test_score].push(test_scores.stddev)
233
+ @cv_results[:mean_train_score].push(train_scores.mean)
234
+ @cv_results[:std_train_score].push(train_scores.stddev)
235
+ @cv_results[:mean_fit_time].push(fit_times.mean)
236
+ @cv_results[:std_fit_time].push(fit_times.stddev)
237
+ @cv_results[:params].push(prms)
238
+ end
239
+
240
+ def find_best_params
241
+ @best_score = @params[:greater_is_better] ? @cv_results[:mean_test_score].max : @cv_results[:mean_test_score].min
242
+ @best_index = @cv_results[:mean_test_score].index(@best_score)
243
+ @best_params = @cv_results[:params][@best_index]
244
+ end
245
+ end
246
+ end
247
+ end
@@ -40,6 +40,7 @@ module SVMKit
40
40
  # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) The target values or labels to be used for fitting the model.
41
41
  # @return [Pipeline] The learned pipeline itself.
42
42
  def fit(x, y)
43
+ check_sample_array(x)
43
44
  trans_x = apply_transforms(x, y, fit: true)
44
45
  last_estimator.fit(trans_x, y) unless last_estimator.nil?
45
46
  self
@@ -51,6 +52,7 @@ module SVMKit
51
52
  # @param y [Numo::NArray] (shape: [n_samples, n_outputs], default: nil) The target values or labels to be used for fitting the model.
52
53
  # @return [Numo::NArray] The predicted results by last estimator.
53
54
  def fit_predict(x, y = nil)
55
+ check_sample_array(x)
54
56
  trans_x = apply_transforms(x, y, fit: true)
55
57
  last_estimator.fit_predict(trans_x)
56
58
  end
@@ -61,6 +63,7 @@ module SVMKit
61
63
  # @param y [Numo::NArray] (shape: [n_samples, n_outputs], default: nil) The target values or labels to be used for fitting the model.
62
64
  # @return [Numo::NArray] The predicted results by last estimator.
63
65
  def fit_transform(x, y = nil)
66
+ check_sample_array(x)
64
67
  trans_x = apply_transforms(x, y, fit: true)
65
68
  last_estimator.fit_transform(trans_x, y)
66
69
  end
@@ -70,6 +73,7 @@ module SVMKit
70
73
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
71
74
  # @return [Numo::DFloat] (shape: [n_samples]) Confidence score per sample.
72
75
  def decision_function(x)
76
+ check_sample_array(x)
73
77
  trans_x = apply_transforms(x)
74
78
  last_estimator.decision_function(trans_x)
75
79
  end
@@ -79,6 +83,7 @@ module SVMKit
79
83
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to obtain prediction result.
80
84
  # @return [Numo::NArray] The predicted results by last estimator.
81
85
  def predict(x)
86
+ check_sample_array(x)
82
87
  trans_x = apply_transforms(x)
83
88
  last_estimator.predict(trans_x)
84
89
  end
@@ -88,6 +93,7 @@ module SVMKit
88
93
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the log-probabilities.
89
94
  # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted log-probability of each class per sample.
90
95
  def predict_log_proba(x)
96
+ check_sample_array(x)
91
97
  trans_x = apply_transforms(x)
92
98
  last_estimator.predict_log_proba(trans_x)
93
99
  end
@@ -97,6 +103,7 @@ module SVMKit
97
103
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
98
104
  # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
99
105
  def predict_proba(x)
106
+ check_sample_array(x)
100
107
  trans_x = apply_transforms(x)
101
108
  last_estimator.predict_proba(trans_x)
102
109
  end
@@ -106,6 +113,7 @@ module SVMKit
106
113
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
107
114
  # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed samples.
108
115
  def transform(x)
116
+ check_sample_array(x)
109
117
  trans_x = apply_transforms(x)
110
118
  last_estimator.nil? ? trans_x : last_estimator.transform(trans_x)
111
119
  end
@@ -115,8 +123,9 @@ module SVMKit
115
123
  # @param z [Numo::DFloat] (shape: [n_samples, n_components]) The transformed samples to be restored into original space.
116
124
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored samples.
117
125
  def inverse_transform(z)
126
+ check_sample_array(z)
118
127
  itrans_z = z
119
- @steps.keys.reverse.each do |name|
128
+ @steps.keys.reverse_each do |name|
120
129
  transformer = @steps[name]
121
130
  next if transformer.nil?
122
131
  itrans_z = transformer.inverse_transform(itrans_z)
@@ -130,6 +139,7 @@ module SVMKit
130
139
  # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) True target values or labels for testing data.
131
140
  # @return [Float] The score of last estimator
132
141
  def score(x, y)
142
+ check_sample_array(x)
133
143
  trans_x = apply_transforms(x)
134
144
  last_estimator.score(trans_x, y)
135
145
  end
@@ -6,7 +6,7 @@ module SVMKit
6
6
  module_function
7
7
 
8
8
  # @!visibility private
9
- def choice_ids(size, probs, rng=nil)
9
+ def choice_ids(size, probs, rng = nil)
10
10
  rng ||= Random.new
11
11
  Array.new(size) do
12
12
  target = rng.rand
@@ -3,5 +3,5 @@
3
3
  # SVMKit is a machine learning library in Ruby.
4
4
  module SVMKit
5
5
  # @!visibility private
6
- VERSION = '0.7.2'.freeze
6
+ VERSION = '0.7.3'.freeze
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: svmkit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.2
4
+ version: 0.7.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-01-21 00:00:00.000000000 Z
11
+ date: 2019-02-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -146,6 +146,7 @@ files:
146
146
  - lib/svmkit/linear_model/svc.rb
147
147
  - lib/svmkit/linear_model/svr.rb
148
148
  - lib/svmkit/model_selection/cross_validation.rb
149
+ - lib/svmkit/model_selection/grid_search_cv.rb
149
150
  - lib/svmkit/model_selection/k_fold.rb
150
151
  - lib/svmkit/model_selection/stratified_k_fold.rb
151
152
  - lib/svmkit/multiclass/one_vs_rest_classifier.rb