rumale 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +1 -0
  3. data/.gitignore +20 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +47 -0
  6. data/.rubocop_todo.yml +58 -0
  7. data/.travis.yml +13 -0
  8. data/CHANGELOG.md +2 -0
  9. data/CODE_OF_CONDUCT.md +74 -0
  10. data/Gemfile +4 -0
  11. data/LICENSE.txt +23 -0
  12. data/README.md +175 -0
  13. data/Rakefile +6 -0
  14. data/bin/console +14 -0
  15. data/bin/setup +8 -0
  16. data/lib/rumale.rb +70 -0
  17. data/lib/rumale/base/base_estimator.rb +13 -0
  18. data/lib/rumale/base/classifier.rb +36 -0
  19. data/lib/rumale/base/cluster_analyzer.rb +31 -0
  20. data/lib/rumale/base/evaluator.rb +17 -0
  21. data/lib/rumale/base/regressor.rb +36 -0
  22. data/lib/rumale/base/splitter.rb +21 -0
  23. data/lib/rumale/base/transformer.rb +22 -0
  24. data/lib/rumale/clustering/dbscan.rb +125 -0
  25. data/lib/rumale/clustering/k_means.rb +138 -0
  26. data/lib/rumale/dataset.rb +110 -0
  27. data/lib/rumale/decomposition/nmf.rb +141 -0
  28. data/lib/rumale/decomposition/pca.rb +148 -0
  29. data/lib/rumale/ensemble/ada_boost_classifier.rb +196 -0
  30. data/lib/rumale/ensemble/ada_boost_regressor.rb +178 -0
  31. data/lib/rumale/ensemble/random_forest_classifier.rb +180 -0
  32. data/lib/rumale/ensemble/random_forest_regressor.rb +141 -0
  33. data/lib/rumale/evaluation_measure/accuracy.rb +29 -0
  34. data/lib/rumale/evaluation_measure/f_score.rb +50 -0
  35. data/lib/rumale/evaluation_measure/log_loss.rb +45 -0
  36. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +29 -0
  37. data/lib/rumale/evaluation_measure/mean_squared_error.rb +29 -0
  38. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +62 -0
  39. data/lib/rumale/evaluation_measure/precision.rb +50 -0
  40. data/lib/rumale/evaluation_measure/precision_recall.rb +91 -0
  41. data/lib/rumale/evaluation_measure/purity.rb +40 -0
  42. data/lib/rumale/evaluation_measure/r2_score.rb +43 -0
  43. data/lib/rumale/evaluation_measure/recall.rb +50 -0
  44. data/lib/rumale/kernel_approximation/rbf.rb +121 -0
  45. data/lib/rumale/kernel_machine/kernel_svc.rb +193 -0
  46. data/lib/rumale/linear_model/base_linear_model.rb +89 -0
  47. data/lib/rumale/linear_model/lasso.rb +136 -0
  48. data/lib/rumale/linear_model/linear_regression.rb +110 -0
  49. data/lib/rumale/linear_model/logistic_regression.rb +159 -0
  50. data/lib/rumale/linear_model/ridge.rb +110 -0
  51. data/lib/rumale/linear_model/svc.rb +183 -0
  52. data/lib/rumale/linear_model/svr.rb +122 -0
  53. data/lib/rumale/model_selection/cross_validation.rb +123 -0
  54. data/lib/rumale/model_selection/grid_search_cv.rb +247 -0
  55. data/lib/rumale/model_selection/k_fold.rb +76 -0
  56. data/lib/rumale/model_selection/stratified_k_fold.rb +94 -0
  57. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +100 -0
  58. data/lib/rumale/naive_bayes/naive_bayes.rb +315 -0
  59. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +111 -0
  60. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +93 -0
  61. data/lib/rumale/optimizer/nadam.rb +90 -0
  62. data/lib/rumale/optimizer/rmsprop.rb +69 -0
  63. data/lib/rumale/optimizer/sgd.rb +65 -0
  64. data/lib/rumale/optimizer/yellow_fin.rb +144 -0
  65. data/lib/rumale/pairwise_metric.rb +91 -0
  66. data/lib/rumale/pipeline/pipeline.rb +197 -0
  67. data/lib/rumale/polynomial_model/base_factorization_machine.rb +99 -0
  68. data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +197 -0
  69. data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +131 -0
  70. data/lib/rumale/preprocessing/l2_normalizer.rb +62 -0
  71. data/lib/rumale/preprocessing/label_encoder.rb +94 -0
  72. data/lib/rumale/preprocessing/min_max_scaler.rb +92 -0
  73. data/lib/rumale/preprocessing/one_hot_encoder.rb +98 -0
  74. data/lib/rumale/preprocessing/standard_scaler.rb +86 -0
  75. data/lib/rumale/probabilistic_output.rb +112 -0
  76. data/lib/rumale/tree/base_decision_tree.rb +153 -0
  77. data/lib/rumale/tree/decision_tree_classifier.rb +163 -0
  78. data/lib/rumale/tree/decision_tree_regressor.rb +135 -0
  79. data/lib/rumale/tree/node.rb +70 -0
  80. data/lib/rumale/utils.rb +37 -0
  81. data/lib/rumale/validation.rb +79 -0
  82. data/lib/rumale/values.rb +13 -0
  83. data/lib/rumale/version.rb +6 -0
  84. data/rumale.gemspec +41 -0
  85. metadata +204 -0
@@ -0,0 +1,183 @@
+ # frozen_string_literal: true
+
+ require 'rumale/linear_model/base_linear_model'
+ require 'rumale/base/classifier'
+ require 'rumale/probabilistic_output'
+
+ module Rumale
+   # This module consists of the classes that implement generalized linear models.
+   module LinearModel
+     # SVC is a class that implements Support Vector Classifier
+     # with mini-batch stochastic gradient descent optimization.
+     # For multiclass classification problems, it uses the one-vs-the-rest strategy.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::LinearModel::SVC.new(reg_param: 1.0, max_iter: 1000, batch_size: 20, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - S. Shalev-Shwartz and Y. Singer, "Pegasos: Primal Estimated sub-GrAdient SOlver for SVM," Proc. ICML'07, pp. 807--814, 2007.
+     class SVC < BaseLinearModel
+       include Base::Classifier
+
+       # Return the weight vector for SVC.
+       # @return [Numo::DFloat] (shape: [n_classes, n_features])
+       attr_reader :weight_vec
+
+       # Return the bias term (a.k.a. intercept) for SVC.
+       # @return [Numo::DFloat] (shape: [n_classes])
+       attr_reader :bias_term
+
+       # Return the class labels.
+       # @return [Numo::Int32] (shape: [n_classes])
+       attr_reader :classes
+
+       # Return the random generator for performing random sampling.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new classifier with Support Vector Machine by SGD optimization.
+       #
+       # @param reg_param [Float] The regularization parameter.
+       # @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
+       # @param bias_scale [Float] The scale of the bias term.
+       # @param max_iter [Integer] The maximum number of iterations.
+       # @param batch_size [Integer] The size of the mini batches.
+       # @param probability [Boolean] The flag indicating whether to perform probability estimation.
+       # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
+       #   If nil is given, Nadam is used.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       def initialize(reg_param: 1.0, fit_bias: false, bias_scale: 1.0,
+                      max_iter: 1000, batch_size: 20, probability: false, optimizer: nil, random_seed: nil)
+         check_params_float(reg_param: reg_param, bias_scale: bias_scale)
+         check_params_integer(max_iter: max_iter, batch_size: batch_size)
+         check_params_boolean(fit_bias: fit_bias, probability: probability)
+         check_params_type_or_nil(Integer, random_seed: random_seed)
+         check_params_positive(reg_param: reg_param, bias_scale: bias_scale, max_iter: max_iter, batch_size: batch_size)
+         keywd_args = method(:initialize).parameters.map { |_t, arg| [arg, binding.local_variable_get(arg)] }.to_h
+         keywd_args.delete(:probability)
+         super(keywd_args)
+         @params[:probability] = probability
+         @prob_param = nil
+         @classes = nil
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [SVC] The learned classifier itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_label_array(y)
+         check_sample_label_size(x, y)
+
+         @classes = Numo::Int32[*y.to_a.uniq.sort]
+         n_classes = @classes.size
+         n_features = x.shape[1]
+
+         if n_classes > 2
+           @weight_vec = Numo::DFloat.zeros(n_classes, n_features)
+           @bias_term = Numo::DFloat.zeros(n_classes)
+           @prob_param = Numo::DFloat.zeros(n_classes, 2)
+           n_classes.times do |n|
+             bin_y = Numo::Int32.cast(y.eq(@classes[n])) * 2 - 1
+             @weight_vec[n, true], @bias_term[n] = partial_fit(x, bin_y)
+             @prob_param[n, true] = if @params[:probability]
+                                      Rumale::ProbabilisticOutput.fit_sigmoid(x.dot(@weight_vec[n, true].transpose) + @bias_term[n], bin_y)
+                                    else
+                                      Numo::DFloat[1, 0]
+                                    end
+           end
+         else
+           negative_label = y.to_a.uniq.min
+           bin_y = Numo::Int32.cast(y.ne(negative_label)) * 2 - 1
+           @weight_vec, @bias_term = partial_fit(x, bin_y)
+           @prob_param = if @params[:probability]
+                           Rumale::ProbabilisticOutput.fit_sigmoid(x.dot(@weight_vec.transpose) + @bias_term, bin_y)
+                         else
+                           Numo::DFloat[1, 0]
+                         end
+         end
+
+         self
+       end
+
+       # Calculate confidence scores for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
+       def decision_function(x)
+         check_sample_array(x)
+         x.dot(@weight_vec.transpose) + @bias_term
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+       def predict(x)
+         check_sample_array(x)
+
+         return Numo::Int32.cast(decision_function(x).ge(0.0)) * 2 - 1 if @classes.size <= 2
+
+         n_samples, = x.shape
+         decision_values = decision_function(x)
+         Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[decision_values[n, true].max_index] })
+       end
+
+       # Predict probability for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+       def predict_proba(x)
+         check_sample_array(x)
+
+         if @classes.size > 2
+           probs = 1.0 / (Numo::NMath.exp(@prob_param[true, 0] * decision_function(x) + @prob_param[true, 1]) + 1.0)
+           return (probs.transpose / probs.sum(axis: 1)).transpose
+         end
+
+         n_samples, = x.shape
+         probs = Numo::DFloat.zeros(n_samples, 2)
+         probs[true, 1] = 1.0 / (Numo::NMath.exp(@prob_param[0] * decision_function(x) + @prob_param[1]) + 1.0)
+         probs[true, 0] = 1.0 - probs[true, 1]
+         probs
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about SVC.
+       def marshal_dump
+         { params: @params,
+           weight_vec: @weight_vec,
+           bias_term: @bias_term,
+           prob_param: @prob_param,
+           classes: @classes,
+           rng: @rng }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @weight_vec = obj[:weight_vec]
+         @bias_term = obj[:bias_term]
+         @prob_param = obj[:prob_param]
+         @classes = obj[:classes]
+         @rng = obj[:rng]
+         nil
+       end
+
+       private
+
+       # Hinge-loss subgradient: nonzero only for samples violating the margin.
+       def calc_loss_gradient(x, y, weight)
+         target_ids = (x.dot(weight) * y).lt(1.0).where
+         grad = Numo::DFloat.zeros(@params[:batch_size])
+         grad[target_ids] = -y[target_ids]
+         grad
+       end
+     end
+   end
+ end
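
The SVC above trains one binary Pegasos model per class (one-vs-the-rest) and, when probability: true is given, fits a sigmoid to each decision function via Rumale::ProbabilisticOutput so that predict_proba returns calibrated class probabilities. A minimal usage sketch of that probability path follows; the synthetic data and variable names are illustrative assumptions, not taken from the gem:

    require 'rumale'

    # Toy linearly separable data (illustrative only).
    x = Numo::DFloat.new(100, 2).rand - 0.5
    y = Numo::Int32.cast(x[true, 0].gt(0.0)) * 2 - 1  # binary labels in {-1, 1}

    estimator = Rumale::LinearModel::SVC.new(
      reg_param: 1.0, max_iter: 1000, batch_size: 20,
      probability: true, random_seed: 1
    )
    estimator.fit(x, y)
    labels = estimator.predict(x)        # Numo::Int32, shape [100]
    probs  = estimator.predict_proba(x)  # Numo::DFloat, shape [100, 2]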
@@ -0,0 +1,122 @@
+ # frozen_string_literal: true
+
+ require 'rumale/linear_model/base_linear_model'
+ require 'rumale/base/regressor'
+
+ module Rumale
+   module LinearModel
+     # SVR is a class that implements Support Vector Regressor
+     # with mini-batch stochastic gradient descent optimization.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::LinearModel::SVR.new(reg_param: 1.0, epsilon: 0.1, max_iter: 1000, batch_size: 20, random_seed: 1)
+     #   estimator.fit(training_samples, training_target_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # 1. S. Shalev-Shwartz and Y. Singer, "Pegasos: Primal Estimated sub-GrAdient SOlver for SVM," Proc. ICML'07, pp. 807--814, 2007.
+     class SVR < BaseLinearModel
+       include Base::Regressor
+
+       # Return the weight vector for SVR.
+       # @return [Numo::DFloat] (shape: [n_outputs, n_features])
+       attr_reader :weight_vec
+
+       # Return the bias term (a.k.a. intercept) for SVR.
+       # @return [Numo::DFloat] (shape: [n_outputs])
+       attr_reader :bias_term
+
+       # Return the random generator for performing random sampling.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new regressor with Support Vector Machine by SGD optimization.
+       #
+       # @param reg_param [Float] The regularization parameter.
+       # @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
+       # @param bias_scale [Float] The scale of the bias term.
+       # @param epsilon [Float] The margin of tolerance.
+       # @param max_iter [Integer] The maximum number of iterations.
+       # @param batch_size [Integer] The size of the mini batches.
+       # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
+       #   If nil is given, Nadam is used.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       def initialize(reg_param: 1.0, fit_bias: false, bias_scale: 1.0, epsilon: 0.1,
+                      max_iter: 1000, batch_size: 20, optimizer: nil, random_seed: nil)
+         check_params_float(reg_param: reg_param, bias_scale: bias_scale, epsilon: epsilon)
+         check_params_integer(max_iter: max_iter, batch_size: batch_size)
+         check_params_boolean(fit_bias: fit_bias)
+         check_params_type_or_nil(Integer, random_seed: random_seed)
+         check_params_positive(reg_param: reg_param, bias_scale: bias_scale, epsilon: epsilon,
+                               max_iter: max_iter, batch_size: batch_size)
+         keywd_args = method(:initialize).parameters.map { |_t, arg| [arg, binding.local_variable_get(arg)] }.to_h
+         keywd_args.delete(:epsilon)
+         super(keywd_args)
+         @params[:epsilon] = epsilon
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+       # @return [SVR] The learned regressor itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_tvalue_array(y)
+         check_sample_tvalue_size(x, y)
+
+         n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+         n_features = x.shape[1]
+
+         if n_outputs > 1
+           @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
+           @bias_term = Numo::DFloat.zeros(n_outputs)
+           n_outputs.times { |n| @weight_vec[n, true], @bias_term[n] = partial_fit(x, y[true, n]) }
+         else
+           @weight_vec, @bias_term = partial_fit(x, y)
+         end
+
+         self
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+       def predict(x)
+         check_sample_array(x)
+         x.dot(@weight_vec.transpose) + @bias_term
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about SVR.
+       def marshal_dump
+         { params: @params,
+           weight_vec: @weight_vec,
+           bias_term: @bias_term,
+           rng: @rng }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @weight_vec = obj[:weight_vec]
+         @bias_term = obj[:bias_term]
+         @rng = obj[:rng]
+         nil
+       end
+
+       private
+
+       # Subgradient of the epsilon-insensitive loss.
+       def calc_loss_gradient(x, y, weight)
+         z = x.dot(weight)
+         grad = Numo::DFloat.zeros(@params[:batch_size])
+         grad[(z - y).gt(@params[:epsilon]).where] = 1
+         grad[(y - z).gt(@params[:epsilon]).where] = -1
+         grad
+       end
+     end
+   end
+ end
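
SVR#calc_loss_gradient above is the subgradient of the epsilon-insensitive loss max(0, |z - y| - epsilon): +1 for samples whose prediction overshoots the target by more than epsilon, -1 for those that undershoot, and 0 inside the tube. A standalone sketch of the same computation, with toy values of z, y, and epsilon chosen for illustration:

    require 'numo/narray'

    # z: predictions, y: targets (toy values, outside the class).
    z = Numo::DFloat[1.0, 0.2, -0.5]
    y = Numo::DFloat[0.0, 0.25, -0.3]
    epsilon = 0.1

    grad = Numo::DFloat.zeros(z.size)
    grad[(z - y).gt(epsilon).where] = 1   # overshoot beyond the tube
    grad[(y - z).gt(epsilon).where] = -1  # undershoot beyond the tube
    # => grad = [1.0, 0.0, -1.0]; zero for the sample inside the +/- epsilon tube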
@@ -0,0 +1,123 @@
+ # frozen_string_literal: true
+
+ require 'rumale/validation'
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/classifier'
+ require 'rumale/base/regressor'
+ require 'rumale/base/splitter'
+ require 'rumale/base/evaluator'
+ require 'rumale/evaluation_measure/log_loss'
+
+ module Rumale
+   # This module consists of the classes for model validation techniques.
+   module ModelSelection
+     # CrossValidation is a class that evaluates a given classifier with the cross-validation method.
+     #
+     # @example
+     #   svc = Rumale::LinearModel::SVC.new
+     #   kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5)
+     #   cv = Rumale::ModelSelection::CrossValidation.new(estimator: svc, splitter: kf)
+     #   report = cv.perform(samples, labels)
+     #   mean_test_score = report[:test_score].inject(:+) / kf.n_splits
+     #
+     class CrossValidation
+       include Validation
+
+       # Return the classifier whose performance is evaluated.
+       # @return [Classifier]
+       attr_reader :estimator
+
+       # Return the splitter that divides the dataset.
+       # @return [Splitter]
+       attr_reader :splitter
+
+       # Return the evaluator that calculates the score.
+       # @return [Evaluator]
+       attr_reader :evaluator
+
+       # Return the flag indicating whether to calculate the score of the training dataset.
+       # @return [Boolean]
+       attr_reader :return_train_score
+
+       # Create a new evaluator with the cross-validation method.
+       #
+       # @param estimator [Classifier] The classifier whose performance is evaluated.
+       # @param splitter [Splitter] The splitter that divides the dataset into training and testing datasets.
+       # @param evaluator [Evaluator] The evaluator that calculates the score of estimator results.
+       # @param return_train_score [Boolean] The flag indicating whether to calculate the score of the training dataset.
+       def initialize(estimator: nil, splitter: nil, evaluator: nil, return_train_score: false)
+         check_params_type(Rumale::Base::BaseEstimator, estimator: estimator)
+         check_params_type(Rumale::Base::Splitter, splitter: splitter)
+         check_params_type_or_nil(Rumale::Base::Evaluator, evaluator: evaluator)
+         check_params_boolean(return_train_score: return_train_score)
+         @estimator = estimator
+         @splitter = splitter
+         @evaluator = evaluator
+         @return_train_score = return_train_score
+       end
+
+       # Perform the evaluation of the given classifier with the cross-validation method.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features])
+       #   The dataset to be used to evaluate the estimator.
+       # @param y [Numo::Int32 / Numo::DFloat] (shape: [n_samples] / [n_samples, n_outputs])
+       #   The labels to be used to evaluate the classifier / The target values to be used to evaluate the regressor.
+       # @return [Hash] The report summarizing the results of cross-validation.
+       #   * :fit_time (Array<Float>) The calculation times of fitting the estimator for each split.
+       #   * :test_score (Array<Float>) The scores of the testing dataset for each split.
+       #   * :train_score (Array<Float>) The scores of the training dataset for each split. This entry is nil if
+       #     return_train_score is false.
+       def perform(x, y)
+         check_sample_array(x)
+         if @estimator.is_a?(Rumale::Base::Classifier)
+           check_label_array(y)
+           check_sample_label_size(x, y)
+         end
+         if @estimator.is_a?(Rumale::Base::Regressor)
+           check_tvalue_array(y)
+           check_sample_tvalue_size(x, y)
+         end
+         # Initialize the report of cross validation.
+         report = { test_score: [], train_score: nil, fit_time: [] }
+         report[:train_score] = [] if @return_train_score
+         # Evaluate the estimator on each split.
+         @splitter.split(x, y).each do |train_ids, test_ids|
+           # Split dataset into training and testing dataset.
+           # For a kernel machine, the columns of the (Gram) matrix are also restricted to the training samples.
+           feature_ids = !kernel_machine? || train_ids
+           train_x = x[train_ids, feature_ids]
+           train_y = y.shape[1].nil? ? y[train_ids] : y[train_ids, true]
+           test_x = x[test_ids, feature_ids]
+           test_y = y.shape[1].nil? ? y[test_ids] : y[test_ids, true]
+           # Fit the estimator.
+           start_time = Time.now.to_i
+           @estimator.fit(train_x, train_y)
+           # Calculate scores and prepare the report.
+           report[:fit_time].push(Time.now.to_i - start_time)
+           if @evaluator.nil?
+             report[:test_score].push(@estimator.score(test_x, test_y))
+             report[:train_score].push(@estimator.score(train_x, train_y)) if @return_train_score
+           elsif log_loss?
+             report[:test_score].push(@evaluator.score(test_y, @estimator.predict_proba(test_x)))
+             report[:train_score].push(@evaluator.score(train_y, @estimator.predict_proba(train_x))) if @return_train_score
+           else
+             report[:test_score].push(@evaluator.score(test_y, @estimator.predict(test_x)))
+             report[:train_score].push(@evaluator.score(train_y, @estimator.predict(train_x))) if @return_train_score
+           end
+         end
+         report
+       end
+
+       private
+
+       def kernel_machine?
+         class_name = @estimator.class.to_s
+         class_name = @estimator.params[:estimator].class.to_s if class_name.include?('Multiclass')
+         class_name.include?('KernelMachine')
+       end
+
+       def log_loss?
+         @evaluator.is_a?(Rumale::EvaluationMeasure::LogLoss)
+       end
+     end
+   end
+ end
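
Although the comments speak of classifiers, perform dispatches on the estimator type, so the same class cross-validates regressors; passing an explicit evaluator overrides the estimator's own score method. A sketch combining SVR with the gem's MeanSquaredError evaluator and KFold splitter (the random data and variable names are illustrative assumptions):

    require 'rumale'

    # Toy regression data (illustrative only).
    x = Numo::DFloat.new(50, 3).rand
    y = x.dot(Numo::DFloat[0.5, -1.0, 2.0])

    svr = Rumale::LinearModel::SVR.new(random_seed: 1)
    kf  = Rumale::ModelSelection::KFold.new(n_splits: 5, random_seed: 1)
    ev  = Rumale::EvaluationMeasure::MeanSquaredError.new
    cv  = Rumale::ModelSelection::CrossValidation.new(estimator: svr, splitter: kf,
                                                      evaluator: ev, return_train_score: true)
    report = cv.perform(x, y)
    mean_test_mse = report[:test_score].inject(:+) / kf.n_splits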
@@ -0,0 +1,247 @@
+ # frozen_string_literal: true
+
+ require 'rumale/validation'
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/evaluator'
+ require 'rumale/base/splitter'
+ require 'rumale/pipeline/pipeline'
+
+ module Rumale
+   module ModelSelection
+     # GridSearchCV is a class that performs hyperparameter optimization with the grid search method.
+     #
+     # @example
+     #   rfc = Rumale::Ensemble::RandomForestClassifier.new(random_seed: 1)
+     #   pg = { n_estimators: [5, 10], max_depth: [3, 5], max_leaf_nodes: [15, 31] }
+     #   kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5)
+     #   gs = Rumale::ModelSelection::GridSearchCV.new(estimator: rfc, param_grid: pg, splitter: kf)
+     #   gs.fit(samples, labels)
+     #   p gs.cv_results
+     #   p gs.best_params
+     #
+     # @example
+     #   rbf = Rumale::KernelApproximation::RBF.new(random_seed: 1)
+     #   svc = Rumale::LinearModel::SVC.new(random_seed: 1)
+     #   pipe = Rumale::Pipeline::Pipeline.new(steps: { rbf: rbf, svc: svc })
+     #   pg = { rbf__gamma: [32.0, 1.0], rbf__n_components: [4, 128], svc__reg_param: [16.0, 0.1] }
+     #   kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5)
+     #   gs = Rumale::ModelSelection::GridSearchCV.new(estimator: pipe, param_grid: pg, splitter: kf)
+     #   gs.fit(samples, labels)
+     #   p gs.cv_results
+     #   p gs.best_params
+     #
+     class GridSearchCV
+       include Base::BaseEstimator
+       include Validation
+
+       # Return the result of cross validation for each parameter set.
+       # @return [Hash]
+       attr_reader :cv_results
+
+       # Return the score of the estimator learned with the best parameter.
+       # @return [Float]
+       attr_reader :best_score
+
+       # Return the best parameter set.
+       # @return [Hash]
+       attr_reader :best_params
+
+       # Return the index of the best parameter.
+       # @return [Integer]
+       attr_reader :best_index
+
+       # Return the estimator learned with the best parameter.
+       # @return [Estimator]
+       attr_reader :best_estimator
+
+       # Create a new grid search method.
+       #
+       # @param estimator [Classifier/Regressor] The estimator to be searched for optimal parameters with the grid search method.
+       # @param param_grid [Array<Hash>] The parameter sets, represented as an array of hashes that
+       #   consist of parameter names as keys and arrays of parameter values as values.
+       # @param splitter [Splitter] The splitter that divides the dataset into training and testing datasets on cross validation.
+       # @param evaluator [Evaluator] The evaluator that calculates the score of estimator results on cross validation.
+       #   If nil is given, the score method of the estimator is used for evaluation.
+       # @param greater_is_better [Boolean] The flag that indicates whether a larger evaluation score means a better estimator.
+       def initialize(estimator: nil, param_grid: nil, splitter: nil, evaluator: nil, greater_is_better: true)
+         check_params_type(Rumale::Base::BaseEstimator, estimator: estimator)
+         check_params_type(Rumale::Base::Splitter, splitter: splitter)
+         check_params_type_or_nil(Rumale::Base::Evaluator, evaluator: evaluator)
+         check_params_boolean(greater_is_better: greater_is_better)
+         @params = {}
+         @params[:param_grid] = valid_param_grid(param_grid)
+         @params[:estimator] = Marshal.load(Marshal.dump(estimator))
+         @params[:splitter] = Marshal.load(Marshal.dump(splitter))
+         @params[:evaluator] = Marshal.load(Marshal.dump(evaluator))
+         @params[:greater_is_better] = greater_is_better
+         @cv_results = nil
+         @best_score = nil
+         @best_params = nil
+         @best_index = nil
+         @best_estimator = nil
+       end
+
+       # Fit the model with given training data and all sets of parameters.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) The target values or labels to be used for fitting the model.
+       # @return [GridSearchCV] The learned estimator with grid search.
+       def fit(x, y)
+         check_sample_array(x)
+
+         init_attrs
+
+         param_combinations.each do |prm_set|
+           prm_set.each do |prms|
+             report = perform_cross_validation(x, y, prms)
+             store_cv_result(prms, report)
+           end
+         end
+
+         find_best_params
+
+         @best_estimator = configurated_estimator(@best_params)
+         @best_estimator.fit(x, y)
+         self
+       end
+
+       # Call the decision_function method of the estimator learned with the best parameter.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
+       # @return [Numo::DFloat] (shape: [n_samples]) Confidence score per sample.
+       def decision_function(x)
+         check_sample_array(x)
+         @best_estimator.decision_function(x)
+       end
+
+       # Call the predict method of the estimator learned with the best parameter.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to obtain prediction results.
+       # @return [Numo::NArray] Predicted results.
+       def predict(x)
+         check_sample_array(x)
+         @best_estimator.predict(x)
+       end
+
+       # Call the predict_log_proba method of the estimator learned with the best parameter.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the log-probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted log-probability of each class per sample.
+       def predict_log_proba(x)
+         check_sample_array(x)
+         @best_estimator.predict_log_proba(x)
+       end
+
+       # Call the predict_proba method of the estimator learned with the best parameter.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+       def predict_proba(x)
+         check_sample_array(x)
+         @best_estimator.predict_proba(x)
+       end
+
+       # Call the score method of the estimator learned with the best parameter.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
+       # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) True target values or labels for testing data.
+       # @return [Float] The score of the estimator.
+       def score(x, y)
+         check_sample_array(x)
+         @best_estimator.score(x, y)
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about GridSearchCV.
+       def marshal_dump
+         { params: @params,
+           cv_results: @cv_results,
+           best_score: @best_score,
+           best_params: @best_params,
+           best_index: @best_index,
+           best_estimator: @best_estimator }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @cv_results = obj[:cv_results]
+         @best_score = obj[:best_score]
+         @best_params = obj[:best_params]
+         @best_index = obj[:best_index]
+         @best_estimator = obj[:best_estimator]
+         nil
+       end
+
+       private
+
+       def valid_param_grid(grid)
+         raise TypeError, 'Expect class of param_grid to be Hash or Array' unless grid.is_a?(Hash) || grid.is_a?(Array)
+         grid = [grid] if grid.is_a?(Hash)
+         grid.each do |h|
+           raise TypeError, 'Expect class of elements in param_grid to be Hash' unless h.is_a?(Hash)
+           raise TypeError, 'Expect class of parameter values in param_grid to be Array' unless h.values.all? { |v| v.is_a?(Array) }
+         end
+         grid
+       end
+
+       # Expand each parameter grid into the Cartesian product of its value arrays.
+       def param_combinations
+         @param_combinations ||= @params[:param_grid].map do |prm|
+           x = Hash[prm.sort].map { |k, v| [k].product(v) }
+           x[0].product(*x[1...x.size]).map { |v| Hash[v] }
+         end
+       end
+
+       def perform_cross_validation(x, y, prms)
+         est = configurated_estimator(prms)
+         cv = CrossValidation.new(estimator: est, splitter: @params[:splitter],
+                                  evaluator: @params[:evaluator], return_train_score: true)
+         cv.perform(x, y)
+       end
+
+       def configurated_estimator(prms)
+         estimator = Marshal.load(Marshal.dump(@params[:estimator]))
+         if @params[:estimator].is_a?(Rumale::Pipeline::Pipeline)
+           prms.each do |k, v|
+             est_name, prm_name = k.to_s.split('__')
+             estimator.steps[est_name.to_sym].params[prm_name.to_sym] = v
+           end
+         else
+           prms.each { |k, v| estimator.params[k] = v }
+         end
+         estimator
+       end
+
+       def init_attrs
+         @cv_results = %i[mean_test_score std_test_score
+                          mean_train_score std_train_score
+                          mean_fit_time std_fit_time params].map { |v| [v, []] }.to_h
+         @best_score = nil
+         @best_params = nil
+         @best_index = nil
+         @best_estimator = nil
+       end
+
+       def store_cv_result(prms, report)
+         test_scores = Numo::DFloat[*report[:test_score]]
+         train_scores = Numo::DFloat[*report[:train_score]]
+         fit_times = Numo::DFloat[*report[:fit_time]]
+         @cv_results[:mean_test_score].push(test_scores.mean)
+         @cv_results[:std_test_score].push(test_scores.stddev)
+         @cv_results[:mean_train_score].push(train_scores.mean)
+         @cv_results[:std_train_score].push(train_scores.stddev)
+         @cv_results[:mean_fit_time].push(fit_times.mean)
+         @cv_results[:std_fit_time].push(fit_times.stddev)
+         @cv_results[:params].push(prms)
+       end
+
+       def find_best_params
+         @best_score = @params[:greater_is_better] ? @cv_results[:mean_test_score].max : @cv_results[:mean_test_score].min
+         @best_index = @cv_results[:mean_test_score].index(@best_score)
+         @best_params = @cv_results[:params][@best_index]
+       end
+     end
+   end
+ end
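
param_combinations above expands each hash in param_grid into the Cartesian product of its value arrays; every resulting hash is one candidate parameter set that fit cross-validates. The same expansion in isolation, using a toy grid and hypothetical local names (pairs, combos) that mirror the method's logic:

    # Standalone sketch of the expansion performed by GridSearchCV#param_combinations.
    param_grid = { n_estimators: [5, 10], max_depth: [3, 5] }

    pairs  = Hash[param_grid.sort].map { |k, v| [k].product(v) }
    combos = pairs[0].product(*pairs[1..-1]).map { |v| Hash[v] }
    # => [{ max_depth: 3, n_estimators: 5 }, { max_depth: 3, n_estimators: 10 },
    #     { max_depth: 5, n_estimators: 5 }, { max_depth: 5, n_estimators: 10 }]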