rumale 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +1 -0
  3. data/.gitignore +20 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +47 -0
  6. data/.rubocop_todo.yml +58 -0
  7. data/.travis.yml +13 -0
  8. data/CHANGELOG.md +2 -0
  9. data/CODE_OF_CONDUCT.md +74 -0
  10. data/Gemfile +4 -0
  11. data/LICENSE.txt +23 -0
  12. data/README.md +175 -0
  13. data/Rakefile +6 -0
  14. data/bin/console +14 -0
  15. data/bin/setup +8 -0
  16. data/lib/rumale.rb +70 -0
  17. data/lib/rumale/base/base_estimator.rb +13 -0
  18. data/lib/rumale/base/classifier.rb +36 -0
  19. data/lib/rumale/base/cluster_analyzer.rb +31 -0
  20. data/lib/rumale/base/evaluator.rb +17 -0
  21. data/lib/rumale/base/regressor.rb +36 -0
  22. data/lib/rumale/base/splitter.rb +21 -0
  23. data/lib/rumale/base/transformer.rb +22 -0
  24. data/lib/rumale/clustering/dbscan.rb +125 -0
  25. data/lib/rumale/clustering/k_means.rb +138 -0
  26. data/lib/rumale/dataset.rb +110 -0
  27. data/lib/rumale/decomposition/nmf.rb +141 -0
  28. data/lib/rumale/decomposition/pca.rb +148 -0
  29. data/lib/rumale/ensemble/ada_boost_classifier.rb +196 -0
  30. data/lib/rumale/ensemble/ada_boost_regressor.rb +178 -0
  31. data/lib/rumale/ensemble/random_forest_classifier.rb +180 -0
  32. data/lib/rumale/ensemble/random_forest_regressor.rb +141 -0
  33. data/lib/rumale/evaluation_measure/accuracy.rb +29 -0
  34. data/lib/rumale/evaluation_measure/f_score.rb +50 -0
  35. data/lib/rumale/evaluation_measure/log_loss.rb +45 -0
  36. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +29 -0
  37. data/lib/rumale/evaluation_measure/mean_squared_error.rb +29 -0
  38. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +62 -0
  39. data/lib/rumale/evaluation_measure/precision.rb +50 -0
  40. data/lib/rumale/evaluation_measure/precision_recall.rb +91 -0
  41. data/lib/rumale/evaluation_measure/purity.rb +40 -0
  42. data/lib/rumale/evaluation_measure/r2_score.rb +43 -0
  43. data/lib/rumale/evaluation_measure/recall.rb +50 -0
  44. data/lib/rumale/kernel_approximation/rbf.rb +121 -0
  45. data/lib/rumale/kernel_machine/kernel_svc.rb +193 -0
  46. data/lib/rumale/linear_model/base_linear_model.rb +89 -0
  47. data/lib/rumale/linear_model/lasso.rb +136 -0
  48. data/lib/rumale/linear_model/linear_regression.rb +110 -0
  49. data/lib/rumale/linear_model/logistic_regression.rb +159 -0
  50. data/lib/rumale/linear_model/ridge.rb +110 -0
  51. data/lib/rumale/linear_model/svc.rb +183 -0
  52. data/lib/rumale/linear_model/svr.rb +122 -0
  53. data/lib/rumale/model_selection/cross_validation.rb +123 -0
  54. data/lib/rumale/model_selection/grid_search_cv.rb +247 -0
  55. data/lib/rumale/model_selection/k_fold.rb +76 -0
  56. data/lib/rumale/model_selection/stratified_k_fold.rb +94 -0
  57. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +100 -0
  58. data/lib/rumale/naive_bayes/naive_bayes.rb +315 -0
  59. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +111 -0
  60. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +93 -0
  61. data/lib/rumale/optimizer/nadam.rb +90 -0
  62. data/lib/rumale/optimizer/rmsprop.rb +69 -0
  63. data/lib/rumale/optimizer/sgd.rb +65 -0
  64. data/lib/rumale/optimizer/yellow_fin.rb +144 -0
  65. data/lib/rumale/pairwise_metric.rb +91 -0
  66. data/lib/rumale/pipeline/pipeline.rb +197 -0
  67. data/lib/rumale/polynomial_model/base_factorization_machine.rb +99 -0
  68. data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +197 -0
  69. data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +131 -0
  70. data/lib/rumale/preprocessing/l2_normalizer.rb +62 -0
  71. data/lib/rumale/preprocessing/label_encoder.rb +94 -0
  72. data/lib/rumale/preprocessing/min_max_scaler.rb +92 -0
  73. data/lib/rumale/preprocessing/one_hot_encoder.rb +98 -0
  74. data/lib/rumale/preprocessing/standard_scaler.rb +86 -0
  75. data/lib/rumale/probabilistic_output.rb +112 -0
  76. data/lib/rumale/tree/base_decision_tree.rb +153 -0
  77. data/lib/rumale/tree/decision_tree_classifier.rb +163 -0
  78. data/lib/rumale/tree/decision_tree_regressor.rb +135 -0
  79. data/lib/rumale/tree/node.rb +70 -0
  80. data/lib/rumale/utils.rb +37 -0
  81. data/lib/rumale/validation.rb +79 -0
  82. data/lib/rumale/values.rb +13 -0
  83. data/lib/rumale/version.rb +6 -0
  84. data/rumale.gemspec +41 -0
  85. metadata +204 -0
@@ -0,0 +1,183 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/linear_model/base_linear_model'
4
+ require 'rumale/base/classifier'
5
+ require 'rumale/probabilistic_output'
6
+
7
+ module Rumale
8
+ # This module consists of the classes that implement generalized linear models.
9
+ module LinearModel
10
+ # SVC is a class that implements Support Vector Classifier
11
+ # with mini-batch stochastic gradient descent optimization.
12
+ # For multiclass classification problem, it uses one-vs-the-rest strategy.
13
+ #
14
+ # @example
15
+ # estimator =
16
+ # Rumale::LinearModel::SVC.new(reg_param: 1.0, max_iter: 1000, batch_size: 20, random_seed: 1)
17
+ # estimator.fit(training_samples, training_labels)
18
+ # results = estimator.predict(testing_samples)
19
+ #
20
+ # *Reference*
21
+ # - S. Shalev-Shwartz and Y. Singer, "Pegasos: Primal Estimated sub-GrAdient SOlver for SVM," Proc. ICML'07, pp. 807--814, 2007.
22
+ class SVC < BaseLinearModel
23
+ include Base::Classifier
24
+
25
+ # Return the weight vector for SVC.
26
+ # @return [Numo::DFloat] (shape: [n_classes, n_features])
27
+ attr_reader :weight_vec
28
+
29
+ # Return the bias term (a.k.a. intercept) for SVC.
30
+ # @return [Numo::DFloat] (shape: [n_classes])
31
+ attr_reader :bias_term
32
+
33
+ # Return the class labels.
34
+ # @return [Numo::Int32] (shape: [n_classes])
35
+ attr_reader :classes
36
+
37
+ # Return the random generator for performing random sampling.
38
+ # @return [Random]
39
+ attr_reader :rng
40
+
41
# Create a new classifier with Support Vector Machine by the SGD optimization.
#
# @param reg_param [Float] The regularization parameter.
# @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
# @param bias_scale [Float] The scale of the bias term.
# @param max_iter [Integer] The maximum number of iterations.
# @param batch_size [Integer] The size of the mini batches.
# @param probability [Boolean] The flag indicating whether to perform probability estimation.
# @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
#   If nil is given, Nadam is used.
# @param random_seed [Integer] The seed value using to initialize the random generator.
def initialize(reg_param: 1.0, fit_bias: false, bias_scale: 1.0,
               max_iter: 1000, batch_size: 20, probability: false, optimizer: nil, random_seed: nil)
  check_params_float(reg_param: reg_param, bias_scale: bias_scale)
  check_params_integer(max_iter: max_iter, batch_size: batch_size)
  check_params_boolean(fit_bias: fit_bias, probability: probability)
  check_params_type_or_nil(Integer, random_seed: random_seed)
  check_params_positive(reg_param: reg_param, bias_scale: bias_scale, max_iter: max_iter, batch_size: batch_size)
  # Forward every option except :probability to the superclass as real keyword
  # arguments. Passing one positional Hash only works through the implicit
  # Hash-to-keywords conversion that Ruby 3 removed.
  super(reg_param: reg_param, fit_bias: fit_bias, bias_scale: bias_scale,
        max_iter: max_iter, batch_size: batch_size, optimizer: optimizer, random_seed: random_seed)
  # @params is expected to have been created by the superclass initializer.
  @params[:probability] = probability
  @prob_param = nil
  @classes = nil
end
66
+
67
# Fit the model with given training data.
#
# With at most two distinct labels a single model is trained, where the smallest
# label is encoded as -1 and any other label as +1. Otherwise one model per class
# is trained (one-vs-the-rest), each with its own sigmoid parameters.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
# @return [SVC] The learned classifier itself.
def fit(x, y)
  check_sample_array(x)
  check_label_array(y)
  check_sample_label_size(x, y)

  @classes = Numo::Int32[*y.to_a.uniq.sort]
  n_classes = @classes.size
  n_features = x.shape[1]

  if n_classes <= 2
    negative_label = y.to_a.uniq.min
    bin_y = Numo::Int32.cast(y.ne(negative_label)) * 2 - 1
    @weight_vec, @bias_term = partial_fit(x, bin_y)
    @prob_param =
      if @params[:probability]
        Rumale::ProbabilisticOutput.fit_sigmoid(x.dot(@weight_vec.transpose) + @bias_term, bin_y)
      else
        # Identity-like sigmoid parameters used when no probability calibration is requested.
        Numo::DFloat[1, 0]
      end
    return self
  end

  @weight_vec = Numo::DFloat.zeros(n_classes, n_features)
  @bias_term = Numo::DFloat.zeros(n_classes)
  @prob_param = Numo::DFloat.zeros(n_classes, 2)
  (0...n_classes).each do |idx|
    # Current class against all others: +1 for the class, -1 for the rest.
    bin_y = Numo::Int32.cast(y.eq(@classes[idx])) * 2 - 1
    @weight_vec[idx, true], @bias_term[idx] = partial_fit(x, bin_y)
    @prob_param[idx, true] =
      if @params[:probability]
        Rumale::ProbabilisticOutput.fit_sigmoid(x.dot(@weight_vec[idx, true].transpose) + @bias_term[idx], bin_y)
      else
        Numo::DFloat[1, 0]
      end
  end

  self
end
107
+
108
# Calculate confidence scores for samples.
#
# The score is the product of the samples with the transposed weight vector,
# shifted by the bias term.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
def decision_function(x)
  check_sample_array(x)
  projection = x.dot(@weight_vec.transpose)
  projection + @bias_term
end
116
+
117
# Predict class labels for samples.
#
# For binary problems a non-negative decision value selects the largest known
# label and a negative value the smallest one, so the result always uses the
# labels stored in @classes (for -1/+1 labels this equals the raw sign). For
# multiclass problems the class with the highest decision value is chosen.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
def predict(x)
  check_sample_array(x)

  decision_values = decision_function(x)
  predicted =
    if @classes.size <= 2
      negative_label = @classes[0]
      # With a single known class the first and last label coincide.
      positive_label = @classes[-1]
      # The comparison result is converted to an Array of 0/1 flags.
      decision_values.ge(0.0).to_a.map { |flag| flag.zero? ? negative_label : positive_label }
    else
      n_samples, = x.shape
      Array.new(n_samples) { |n| @classes[decision_values[n, true].max_index] }
    end
  Numo::Int32.asarray(predicted)
end
130
+
131
# Predict probability for samples.
#
# Decision values are passed through a sigmoid parameterised by @prob_param.
# In the multiclass case the per-class values are normalised so that each row
# sums to one; in the binary case column 1 holds the sigmoid output and
# column 0 its complement.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
def predict_proba(x)
  check_sample_array(x)

  if @classes.size <= 2
    n_samples, = x.shape
    probs = Numo::DFloat.zeros(n_samples, 2)
    probs[true, 1] = 1.0 / (Numo::NMath.exp(@prob_param[0] * decision_function(x) + @prob_param[1]) + 1.0)
    probs[true, 0] = 1.0 - probs[true, 1]
    return probs
  end

  exponent = @prob_param[true, 0] * decision_function(x) + @prob_param[true, 1]
  probs = 1.0 / (Numo::NMath.exp(exponent) + 1.0)
  (probs.transpose / probs.sum(axis: 1)).transpose
end
149
+
150
# Dump marshal data.
#
# Collects the instance variables that make up the state of the classifier,
# keyed by their name without the leading "@".
#
# @return [Hash] The marshal data about SVC.
def marshal_dump
  %i[params weight_vec bias_term prob_param classes rng].map do |attr|
    [attr, instance_variable_get("@#{attr}")]
  end.to_h
end
160
+
161
# Load marshal data.
#
# Restores each instance variable from the entry of the same name in the given Hash.
#
# @param obj [Hash] The marshal data as produced by marshal_dump.
# @return [nil]
def marshal_load(obj)
  %i[params weight_vec bias_term prob_param classes rng].each do |attr|
    instance_variable_set("@#{attr}", obj[attr])
  end
  nil
end
172
+
173
+ private
174
+
175
# Calculate the per-sample gradient of the hinge loss.
#
# Samples whose margin (score times label) is below 1.0 get the gradient -y;
# all other samples get 0. The gradient is sized by the number of rows in x
# rather than by the configured batch size, so a mini batch with fewer rows
# than batch_size still yields a gradient of matching length.
#
# @param x [Numo::DFloat] (shape: [n_rows, n_features]) The samples of the mini batch.
# @param y [Numo::NArray] (shape: [n_rows]) The labels of the mini batch (assumed to be -1/+1).
# @param weight [Numo::DFloat] (shape: [n_features]) The current weight vector.
# @return [Numo::DFloat] (shape: [n_rows]) The loss gradient per sample.
def calc_loss_gradient(x, y, weight)
  target_ids = (x.dot(weight) * y).lt(1.0).where
  grad = Numo::DFloat.zeros(x.shape[0])
  grad[target_ids] = -y[target_ids]
  grad
end
181
+ end
182
+ end
183
+ end
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/linear_model/base_linear_model'
4
+ require 'rumale/base/regressor'
5
+
6
+ module Rumale
7
+ module LinearModel
8
+ # SVR is a class that implements Support Vector Regressor
9
+ # with mini-batch stochastic gradient descent optimization.
10
+ #
11
+ # @example
12
+ # estimator =
13
+ # Rumale::LinearModel::SVR.new(reg_param: 1.0, epsilon: 0.1, max_iter: 1000, batch_size: 20, random_seed: 1)
14
+ # estimator.fit(training_samples, training_target_values)
15
+ # results = estimator.predict(testing_samples)
16
+ #
17
+ # *Reference*
18
+ # 1. S. Shalev-Shwartz and Y. Singer, "Pegasos: Primal Estimated sub-GrAdient SOlver for SVM," Proc. ICML'07, pp. 807--814, 2007.
19
+ class SVR < BaseLinearModel
20
+ include Base::Regressor
21
+
22
+ # Return the weight vector for SVR.
23
+ # @return [Numo::DFloat] (shape: [n_outputs, n_features])
24
+ attr_reader :weight_vec
25
+
26
+ # Return the bias term (a.k.a. intercept) for SVR.
27
+ # @return [Numo::DFloat] (shape: [n_outputs])
28
+ attr_reader :bias_term
29
+
30
+ # Return the random generator for performing random sampling.
31
+ # @return [Random]
32
+ attr_reader :rng
33
+
34
# Create a new regressor with Support Vector Machine by the SGD optimization.
#
# @param reg_param [Float] The regularization parameter.
# @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
# @param bias_scale [Float] The scale of the bias term.
# @param epsilon [Float] The margin of tolerance.
# @param max_iter [Integer] The maximum number of iterations.
# @param batch_size [Integer] The size of the mini batches.
# @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
#   If nil is given, Nadam is used.
# @param random_seed [Integer] The seed value using to initialize the random generator.
def initialize(reg_param: 1.0, fit_bias: false, bias_scale: 1.0, epsilon: 0.1,
               max_iter: 1000, batch_size: 20, optimizer: nil, random_seed: nil)
  check_params_float(reg_param: reg_param, bias_scale: bias_scale, epsilon: epsilon)
  check_params_integer(max_iter: max_iter, batch_size: batch_size)
  check_params_boolean(fit_bias: fit_bias)
  check_params_type_or_nil(Integer, random_seed: random_seed)
  check_params_positive(reg_param: reg_param, bias_scale: bias_scale, epsilon: epsilon,
                        max_iter: max_iter, batch_size: batch_size)
  # Forward every option except :epsilon to the superclass as real keyword
  # arguments. Passing one positional Hash only works through the implicit
  # Hash-to-keywords conversion that Ruby 3 removed.
  super(reg_param: reg_param, fit_bias: fit_bias, bias_scale: bias_scale,
        max_iter: max_iter, batch_size: batch_size, optimizer: optimizer, random_seed: random_seed)
  # @params is expected to have been created by the superclass initializer.
  @params[:epsilon] = epsilon
end
58
+
59
# Fit the model with given training data.
#
# A one-dimensional target (or a single output column) trains one model;
# otherwise one model is trained per output column.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
# @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
# @return [SVR] The learned regressor itself.
def fit(x, y)
  check_sample_array(x)
  check_tvalue_array(y)
  check_sample_tvalue_size(x, y)

  # A one-dimensional y has no second shape entry and counts as a single output.
  n_outputs = y.shape[1] || 1
  n_features = x.shape[1]

  if n_outputs <= 1
    @weight_vec, @bias_term = partial_fit(x, y)
    return self
  end

  @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
  @bias_term = Numo::DFloat.zeros(n_outputs)
  (0...n_outputs).each do |idx|
    @weight_vec[idx, true], @bias_term[idx] = partial_fit(x, y[true, idx])
  end

  self
end
82
+
83
# Predict values for samples.
#
# The prediction is the product of the samples with the transposed weight
# vector, shifted by the bias term.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
# @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
def predict(x)
  check_sample_array(x)
  projection = x.dot(@weight_vec.transpose)
  projection + @bias_term
end
91
+
92
# Dump marshal data.
#
# Collects the instance variables that make up the state of the regressor,
# keyed by their name without the leading "@".
#
# @return [Hash] The marshal data about SVR.
def marshal_dump
  %i[params weight_vec bias_term rng].map do |attr|
    [attr, instance_variable_get("@#{attr}")]
  end.to_h
end
100
+
101
# Load marshal data.
#
# Restores each instance variable from the entry of the same name in the given Hash.
#
# @param obj [Hash] The marshal data as produced by marshal_dump.
# @return [nil]
def marshal_load(obj)
  %i[params weight_vec bias_term rng].each do |attr|
    instance_variable_set("@#{attr}", obj[attr])
  end
  nil
end
110
+
111
+ private
112
+
113
# Calculate the per-sample gradient of the epsilon-insensitive loss.
#
# The gradient is +1 where the prediction exceeds the target by more than
# epsilon, -1 where it falls short by more than epsilon, and 0 otherwise.
# It is sized by the number of rows in x rather than by the configured batch
# size, so a mini batch with fewer rows than batch_size still yields a
# gradient of matching length.
#
# @param x [Numo::DFloat] (shape: [n_rows, n_features]) The samples of the mini batch.
# @param y [Numo::DFloat] (shape: [n_rows]) The target values of the mini batch.
# @param weight [Numo::DFloat] (shape: [n_features]) The current weight vector.
# @return [Numo::DFloat] (shape: [n_rows]) The loss gradient per sample.
def calc_loss_gradient(x, y, weight)
  z = x.dot(weight)
  grad = Numo::DFloat.zeros(x.shape[0])
  grad[(z - y).gt(@params[:epsilon]).where] = 1
  grad[(y - z).gt(@params[:epsilon]).where] = -1
  grad
end
120
+ end
121
+ end
122
+ end
@@ -0,0 +1,123 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/validation'
4
+ require 'rumale/base/base_estimator'
5
+ require 'rumale/base/classifier'
6
+ require 'rumale/base/regressor'
7
+ require 'rumale/base/splitter'
8
+ require 'rumale/base/evaluator'
9
+ require 'rumale/evaluation_measure/log_loss'
10
+
11
+ module Rumale
12
+ # This module consists of the classes for model validation techniques.
13
+ module ModelSelection
14
+ # CrossValidation is a class that evaluates a given classifier with cross-validation method.
15
+ #
16
+ # @example
17
+ # svc = Rumale::LinearModel::SVC.new
18
+ # kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5)
19
+ # cv = Rumale::ModelSelection::CrossValidation.new(estimator: svc, splitter: kf)
20
+ # report = cv.perform(samples, labels)
21
+ # mean_test_score = report[:test_score].inject(:+) / kf.n_splits
22
+ #
23
+ class CrossValidation
24
+ include Validation
25
+
26
+ # Return the classifier of which performance is evaluated.
27
+ # @return [Classifier]
28
+ attr_reader :estimator
29
+
30
+ # Return the splitter that divides dataset.
31
+ # @return [Splitter]
32
+ attr_reader :splitter
33
+
34
+ # Return the evaluator that calculates score.
35
+ # @return [Evaluator]
36
+ attr_reader :evaluator
37
+
38
+ # Return the flag indicating whether to calculate the score of training dataset.
39
+ # @return [Boolean]
40
+ attr_reader :return_train_score
41
+
42
# Create a new evaluator with cross-validation method.
#
# @param estimator [Base::BaseEstimator] The estimator of which performance is evaluated.
# @param splitter [Splitter] The splitter that divides dataset to training and testing dataset.
# @param evaluator [Evaluator] The evaluator that calculates score of estimator results.
#   May be nil; the score method of the estimator is then used.
# @param return_train_score [Boolean] The flag indicating whether to calculate the score of training dataset.
def initialize(estimator: nil, splitter: nil, evaluator: nil, return_train_score: false)
  check_params_type(Rumale::Base::BaseEstimator, estimator: estimator)
  check_params_type(Rumale::Base::Splitter, splitter: splitter)
  check_params_type_or_nil(Rumale::Base::Evaluator, evaluator: evaluator)
  check_params_boolean(return_train_score: return_train_score)
  @estimator, @splitter = estimator, splitter
  @evaluator, @return_train_score = evaluator, return_train_score
end
58
+
59
# Perform the evaluation of given estimator with cross-validation method.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features])
#   The dataset to be used to evaluate the estimator.
# @param y [Numo::Int32 / Numo::DFloat] (shape: [n_samples] / [n_samples, n_outputs])
#   The labels to be used to evaluate the classifier / The target values to be used to evaluate the regressor.
# @return [Hash] The report summarizing the results of cross-validation.
#   * :fit_time (Array<Float>) The elapsed seconds of fitting the estimator for each split,
#     measured with the monotonic clock.
#   * :test_score (Array<Float>) The scores of testing dataset for each split.
#   * :train_score (Array<Float>) The scores of training dataset for each split. This option is nil if
#     the return_train_score is false.
def perform(x, y)
  check_sample_array(x)
  if @estimator.is_a?(Rumale::Base::Classifier)
    check_label_array(y)
    check_sample_label_size(x, y)
  end
  if @estimator.is_a?(Rumale::Base::Regressor)
    check_tvalue_array(y)
    check_sample_tvalue_size(x, y)
  end
  # Initialize the report of cross validation.
  report = { test_score: [], train_score: nil, fit_time: [] }
  report[:train_score] = [] if @return_train_score
  # Evaluate the estimator on each split.
  @splitter.split(x, y).each do |train_ids, test_ids|
    # Split dataset into training and testing dataset.
    # For kernel machines the columns are restricted to the training indices as
    # well (x is presumably a precomputed kernel matrix in that case); otherwise
    # `true` selects every column.
    feature_ids = !kernel_machine? || train_ids
    train_x = x[train_ids, feature_ids]
    train_y = y.shape[1].nil? ? y[train_ids] : y[train_ids, true]
    test_x = x[test_ids, feature_ids]
    test_y = y.shape[1].nil? ? y[test_ids] : y[test_ids, true]
    # Fit the estimator, timing it with the monotonic clock so that the result
    # keeps sub-second resolution and is unaffected by wall-clock adjustments.
    start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    @estimator.fit(train_x, train_y)
    report[:fit_time].push(Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time)
    # Calculate scores and prepare the report.
    if @evaluator.nil?
      report[:test_score].push(@estimator.score(test_x, test_y))
      report[:train_score].push(@estimator.score(train_x, train_y)) if @return_train_score
    elsif log_loss?
      # Log-loss is computed from class probabilities rather than predicted labels.
      report[:test_score].push(@evaluator.score(test_y, @estimator.predict_proba(test_x)))
      report[:train_score].push(@evaluator.score(train_y, @estimator.predict_proba(train_x))) if @return_train_score
    else
      report[:test_score].push(@evaluator.score(test_y, @estimator.predict(test_x)))
      report[:train_score].push(@evaluator.score(train_y, @estimator.predict(train_x))) if @return_train_score
    end
  end
  report
end
109
+
110
+ private
111
+
112
# Tell whether the evaluated estimator is a kernel machine.
#
# The decision is made on class names: an estimator whose class name contains
# 'Multiclass' is unwrapped through params[:estimator] first, then the class
# name is tested for 'KernelMachine'.
#
# @return [Boolean]
def kernel_machine?
  target = @estimator
  target = target.params[:estimator] if target.class.to_s.include?('Multiclass')
  target.class.to_s.include?('KernelMachine')
end
117
+
118
# Tell whether the configured evaluator is a LogLoss instance.
#
# @return [Boolean]
def log_loss?
  Rumale::EvaluationMeasure::LogLoss === @evaluator
end
121
+ end
122
+ end
123
+ end
@@ -0,0 +1,247 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/validation'
4
+ require 'rumale/base/base_estimator'
5
+ require 'rumale/base/evaluator'
6
+ require 'rumale/base/splitter'
7
+ require 'rumale/pipeline/pipeline'
8
+
9
+ module Rumale
10
+ module ModelSelection
11
+ # GridSearchCV is a class that performs hyperparameter optimization with grid search method.
12
+ #
13
+ # @example
14
+ # rfc = Rumale::Ensemble::RandomForestClassifier.new(random_seed: 1)
15
+ # pg = { n_estimators: [5, 10], max_depth: [3, 5], max_leaf_nodes: [15, 31] }
16
+ # kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5)
17
+ # gs = Rumale::ModelSelection::GridSearchCV.new(estimator: rfc, param_grid: pg, splitter: kf)
18
+ # gs.fit(samples, labels)
19
+ # p gs.cv_results
20
+ # p gs.best_params
21
+ #
22
+ # @example
23
+ # rbf = Rumale::KernelApproximation::RBF.new(random_seed: 1)
24
+ # svc = Rumale::LinearModel::SVC.new(random_seed: 1)
25
+ # pipe = Rumale::Pipeline::Pipeline.new(steps: { rbf: rbf, svc: svc })
26
+ # pg = { rbf__gamma: [32.0, 1.0], rbf__n_components: [4, 128], svc__reg_param: [16.0, 0.1] }
27
+ # kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5)
28
+ # gs = Rumale::ModelSelection::GridSearchCV.new(estimator: pipe, param_grid: pg, splitter: kf)
29
+ # gs.fit(samples, labels)
30
+ # p gs.cv_results
31
+ # p gs.best_params
32
+ #
33
+ class GridSearchCV
34
+ include Base::BaseEstimator
35
+ include Validation
36
+
37
+ # Return the result of cross validation for each parameter.
38
+ # @return [Hash]
39
+ attr_reader :cv_results
40
+
41
+ # Return the score of the estimator learned with the best parameter.
42
+ # @return [Float]
43
+ attr_reader :best_score
44
+
45
+ # Return the best parameter set.
46
+ # @return [Hash]
47
+ attr_reader :best_params
48
+
49
+ # Return the index of the best parameter.
50
+ # @return [Integer]
51
+ attr_reader :best_index
52
+
53
+ # Return the estimator learned with the best parameter.
54
+ # @return [Estimator]
55
+ attr_reader :best_estimator
56
+
57
# Create a new grid search method.
#
# The estimator, splitter and evaluator are deep-copied through Marshal, so the
# objects passed in are not shared with the grid search.
#
# @param estimator [Classifier/Regressor] The estimator to be searched for optimal parameters with grid search method.
# @param param_grid [Array<Hash>] The parameter sets is represented with array of hash that
#   consists of parameter names as keys and array of parameter values as values.
# @param splitter [Splitter] The splitter that divides dataset to training and testing dataset on cross validation.
# @param evaluator [Evaluator] The evaluator that calculates score of estimator results on cross validation.
#   If nil is given, the score method of estimator is used to evaluation.
# @param greater_is_better [Boolean] The flag that indicates whether the estimator is better as
#   evaluation score is larger.
def initialize(estimator: nil, param_grid: nil, splitter: nil, evaluator: nil, greater_is_better: true)
  check_params_type(Rumale::Base::BaseEstimator, estimator: estimator)
  check_params_type(Rumale::Base::Splitter, splitter: splitter)
  check_params_type_or_nil(Rumale::Base::Evaluator, evaluator: evaluator)
  check_params_boolean(greater_is_better: greater_is_better)
  deep_copy = ->(obj) { Marshal.load(Marshal.dump(obj)) }
  @params = {
    param_grid: valid_param_grid(param_grid),
    estimator: deep_copy.call(estimator),
    splitter: deep_copy.call(splitter),
    evaluator: deep_copy.call(evaluator),
    greater_is_better: greater_is_better
  }
  @cv_results = @best_score = @best_params = @best_index = @best_estimator = nil
end
84
+
85
# Fit the model with given training data and all sets of parameters.
#
# Every parameter combination is cross-validated and recorded, the best one is
# selected, and a fresh estimator configured with it is fitted on all the data.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
# @param y [Numo::NArray] (shape: [n_samples, n_outputs]) The target values or labels to be used for fitting the model.
# @return [GridSearchCV] The learned estimator with grid search.
def fit(x, y)
  check_sample_array(x)

  init_attrs

  # param_combinations holds one list of parameter Hashes per grid entry.
  param_combinations.flatten(1).each do |prms|
    store_cv_result(prms, perform_cross_validation(x, y, prms))
  end

  find_best_params

  @best_estimator = configurated_estimator(@best_params)
  @best_estimator.fit(x, y)
  self
end
108
+
109
# Delegate decision_function to the estimator learned with the best parameter.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
# @return [Numo::DFloat] (shape: [n_samples]) Confidence score per sample.
def decision_function(x)
  check_sample_array(x)
  best = @best_estimator
  best.decision_function(x)
end
117
+
118
# Delegate predict to the estimator learned with the best parameter.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to obtain prediction result.
# @return [Numo::NArray] Predicted results.
def predict(x)
  check_sample_array(x)
  best = @best_estimator
  best.predict(x)
end
126
+
127
# Delegate predict_log_proba to the estimator learned with the best parameter.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the log-probabilities.
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted log-probability of each class per sample.
def predict_log_proba(x)
  check_sample_array(x)
  best = @best_estimator
  best.predict_log_proba(x)
end
135
+
136
# Delegate predict_proba to the estimator learned with the best parameter.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
def predict_proba(x)
  check_sample_array(x)
  best = @best_estimator
  best.predict_proba(x)
end
144
+
145
# Delegate score to the estimator learned with the best parameter.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
# @param y [Numo::NArray] (shape: [n_samples, n_outputs]) True target values or labels for testing data.
# @return [Float] The score of estimator.
def score(x, y)
  check_sample_array(x)
  best = @best_estimator
  best.score(x, y)
end
154
+
155
# Dump marshal data.
#
# Collects the instance variables that make up the state of the grid search,
# keyed by their name without the leading "@".
#
# @return [Hash] The marshal data about GridSearchCV.
def marshal_dump
  %i[params cv_results best_score best_params best_index best_estimator].map do |attr|
    [attr, instance_variable_get("@#{attr}")]
  end.to_h
end
165
+
166
# Load marshal data.
#
# Restores each instance variable from the entry of the same name in the given Hash.
#
# @param obj [Hash] The marshal data as produced by marshal_dump.
# @return [nil]
def marshal_load(obj)
  %i[params cv_results best_score best_params best_index best_estimator].each do |attr|
    instance_variable_set("@#{attr}", obj[attr])
  end
  nil
end
177
+
178
+ private
179
+
180
# Validate the parameter grid and normalise it to an Array of Hashes.
#
# @param grid [Hash, Array<Hash>] Parameter names mapped to Arrays of candidate values.
# @return [Array<Hash>] The grid; a single Hash is wrapped into a one-element Array.
# @raise [TypeError] If the grid is neither a Hash nor an Array, if an element
#   is not a Hash, or if a parameter value is not an Array.
def valid_param_grid(grid)
  grid = [grid] if grid.is_a?(Hash)
  raise TypeError, 'Expect class of param_grid to be Hash or Array' unless grid.is_a?(Array)
  grid.each do |candidate|
    raise TypeError, 'Expect class of elements in param_grid to be Hash' unless candidate.is_a?(Hash)
    next if candidate.each_value.all? { |values| values.is_a?(Array) }
    raise TypeError, 'Expect class of parameter values in param_grid to be Array'
  end
  grid
end
189
+
190
# Expand the parameter grid into concrete parameter sets.
#
# For every Hash in @params[:param_grid] the cartesian product of all candidate
# values is built, with parameters taken in sorted key order. An empty Hash
# yields a single empty parameter set instead of failing. The result is memoized.
#
# @return [Array<Array<Hash>>] One list of parameter Hashes per grid entry.
def param_combinations
  @param_combinations ||= @params[:param_grid].map do |prm|
    # [[name, v1], [name, v2], ...] for every parameter.
    pairs = prm.sort.map { |name, values| [name].product(values) }
    head, *tail = pairs
    head.nil? ? [{}] : head.product(*tail).map(&:to_h)
  end
end
196
+
197
# Cross-validate an estimator configured with the given parameter set.
#
# Training scores are always requested from the cross validation.
#
# @param x [Numo::DFloat] The samples.
# @param y [Numo::NArray] The labels or target values.
# @param prms [Hash] The parameter set to evaluate.
# @return [Hash] The report returned by CrossValidation#perform.
def perform_cross_validation(x, y, prms)
  cv_options = {
    estimator: configurated_estimator(prms),
    splitter: @params[:splitter],
    evaluator: @params[:evaluator],
    return_train_score: true
  }
  CrossValidation.new(**cv_options).perform(x, y)
end
203
+
204
# Build a fresh copy of the base estimator with the given parameters applied.
#
# The estimator stored in @params is deep-copied through Marshal, so it is never
# modified. For a pipeline, keys have the form "<step>__<parameter>"; only the
# first "__" separates the step name, so parameter names may contain "__".
#
# @param prms [Hash] The parameter set to apply.
# @return [Object] The configured copy of the estimator.
def configurated_estimator(prms)
  estimator = Marshal.load(Marshal.dump(@params[:estimator]))
  if @params[:estimator].is_a?(Rumale::Pipeline::Pipeline)
    prms.each do |key, value|
      step_name, prm_name = key.to_s.split('__', 2)
      estimator.steps[step_name.to_sym].params[prm_name.to_sym] = value
    end
  else
    prms.each { |key, value| estimator.params[key] = value }
  end
  estimator
end
216
+
217
# Reset the search results: an empty list for every cross-validation
# statistic, and nil for all best_* attributes.
#
# @return [nil]
def init_attrs
  result_keys = %i[mean_test_score std_test_score
                   mean_train_score std_train_score
                   mean_fit_time std_fit_time params]
  @cv_results = result_keys.each_with_object({}) { |key, table| table[key] = [] }
  @best_score = @best_params = @best_index = @best_estimator = nil
end
226
+
227
# Append the summary of one cross-validation report to @cv_results.
#
# For test scores, train scores and fit times the mean and the standard
# deviation over the splits are recorded, followed by the parameter set.
#
# @param prms [Hash] The evaluated parameter set.
# @param report [Hash] The report with :test_score, :train_score and :fit_time.
def store_cv_result(prms, report)
  # Convert all three series first, then record their statistics in order.
  series = %i[test_score train_score fit_time].map { |key| [key, Numo::DFloat[*report[key]]] }
  series.each do |key, values|
    @cv_results[:"mean_#{key}"].push(values.mean)
    @cv_results[:"std_#{key}"].push(values.stddev)
  end
  @cv_results[:params].push(prms)
end
239
+
240
# Select the best parameter set from the recorded mean test scores.
#
# The best score is the maximum when greater_is_better is set and the minimum
# otherwise; on ties the first occurrence wins.
def find_best_params
  scores = @cv_results[:mean_test_score]
  @best_score = @params[:greater_is_better] ? scores.max : scores.min
  @best_index = scores.index(@best_score)
  @best_params = @cv_results[:params][@best_index]
end
245
+ end
246
+ end
247
+ end