rumale 0.23.3 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
@@ -1,237 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/values'
4
- require 'rumale/base/base_estimator'
5
- require 'rumale/base/regressor'
6
- require 'rumale/tree/gradient_tree_regressor'
7
-
8
- module Rumale
9
- module Ensemble
10
- # GradientBoostingRegressor is a class that implements gradient tree boosting for regression.
11
- # The class uses L2 loss as the loss function.
12
- #
13
- # @example
14
- # estimator =
15
- # Rumale::Ensemble::GradientBoostingRegressor.new(
16
- # n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
17
- # estimator.fit(training_samples, training_values)
18
- # results = estimator.predict(testing_samples)
19
- #
20
- # *Reference*
21
- # - Friedman, J H. "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
22
- # - Friedman, J H. "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
23
- # - Chen, T., and Guestrin, C., "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
24
- #
25
- class GradientBoostingRegressor
26
- include Base::BaseEstimator
27
- include Base::Regressor
28
-
29
- # Return the set of estimators.
30
- # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>] (one inner array per output)
31
- attr_reader :estimators
32
-
33
- # Return the importance for each feature.
34
- # The feature importances are calculated based on the numbers of times the feature is used for splitting.
35
- # @return [Numo::DFloat] (size: n_features)
36
- attr_reader :feature_importances
37
-
38
- # Return the random generator for random selection of feature index.
39
- # @return [Random]
40
- attr_reader :rng
41
-
42
- # Create a new regressor with gradient tree boosting.
43
- #
44
- # @param n_estimators [Integer] The number of trees for constructing the regressor.
45
- # @param learning_rate [Float] The boosting learning rate.
46
- # @param reg_lambda [Float] The L2 regularization term on weight.
47
- # @param subsample [Float] The subsampling ratio of the training samples.
48
- # @param max_depth [Integer] The maximum depth of the tree.
49
- # If nil is given, decision tree grows without concern for depth.
50
- # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
51
- # If nil is given, number of leaves is not limited.
52
- # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
53
- # @param max_features [Integer] The number of features to consider when searching optimal split point.
54
- # If nil is given, split process considers all features.
55
- # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
56
- # If nil is given, the methods do not execute in parallel.
57
- # If zero or less is given, it becomes equal to the number of processors.
58
- # This parameter is ignored if the Parallel gem is not loaded.
59
- # @param random_seed [Integer] The seed value used to initialize the random generator.
60
- # It is used to randomly determine the order of features when deciding the splitting point.
61
- def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
62
- max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
63
- max_features: nil, n_jobs: nil, random_seed: nil)
64
- check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
65
- max_features: max_features, n_jobs: n_jobs, random_seed: random_seed)
66
- check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf,
67
- learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
68
- check_params_positive(n_estimators: n_estimators, learning_rate: learning_rate, reg_lambda: reg_lambda,
69
- subsample: subsample, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
70
- min_samples_leaf: min_samples_leaf, max_features: max_features)
71
- @params = {}
72
- @params[:n_estimators] = n_estimators
73
- @params[:learning_rate] = learning_rate
74
- @params[:reg_lambda] = reg_lambda
75
- @params[:subsample] = subsample
76
- @params[:max_depth] = max_depth
77
- @params[:max_leaf_nodes] = max_leaf_nodes
78
- @params[:min_samples_leaf] = min_samples_leaf
79
- @params[:max_features] = max_features
80
- @params[:n_jobs] = n_jobs
81
- @params[:random_seed] = random_seed
82
- @params[:random_seed] ||= srand
83
- @estimators = nil
84
- @base_predictions = nil
85
- @feature_importances = nil
86
- @rng = Random.new(@params[:random_seed])
87
- end
88
-
89
- # Fit the model with given training data.
90
- #
91
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
92
- # @param y [Numo::DFloat] (shape: [n_samples] or [n_samples, n_outputs]) The target values to be used for fitting the model.
93
- # @return [GradientBoostingRegressor] The learned regressor itself.
94
- def fit(x, y)
95
- x = check_convert_sample_array(x)
96
- y = check_convert_tvalue_array(y)
97
- check_sample_tvalue_size(x, y)
98
- # resolve max_features against the actual number of features and detect the number of outputs.
99
- n_features = x.shape[1]
100
- @params[:max_features] = n_features if @params[:max_features].nil?
101
- @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
102
- n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
103
- # use the mean of the target values as the initial prediction, then boost trees on top of it.
104
- @base_predictions = n_outputs > 1 ? y.mean(0) : y.mean
105
- @estimators = if n_outputs > 1
106
- multivar_estimators(x, y)
107
- else
108
- partial_fit(x, y, @base_predictions)
109
- end
110
- # sum the per-tree feature importances (over all outputs in the multi-output case).
111
- @feature_importances = if n_outputs > 1
112
- multivar_feature_importances
113
- else
114
- @estimators.map(&:feature_importances).reduce(&:+)
115
- end
116
- self
117
- end
118
-
119
- # Predict values for samples.
120
- #
121
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
122
- # @return [Numo::DFloat] (shape: [n_samples] or [n_samples, n_outputs]) Predicted values per sample.
123
- def predict(x)
124
- x = check_convert_sample_array(x)
125
- n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
126
- if n_outputs > 1
127
- multivar_predict(x)
128
- elsif enable_parallel?
129
- parallel_map(@params[:n_estimators]) { |n| @estimators[n].predict(x) }.reduce(&:+) + @base_predictions
130
- else
131
- @estimators.map { |tree| tree.predict(x) }.reduce(&:+) + @base_predictions
132
- end
133
- end
134
-
135
- # Return the index of the leaf that each sample reached.
136
- #
137
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
138
- # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
139
- def apply(x)
140
- x = check_convert_sample_array(x)
141
- n_outputs = @estimators.first.is_a?(Array) ? @estimators.size : 1
142
- leaf_ids = if n_outputs > 1
143
- Array.new(n_outputs) { |n| @estimators[n].map { |tree| tree.apply(x) } }
144
- else
145
- @estimators.map { |tree| tree.apply(x) }
146
- end
147
- Numo::Int32[*leaf_ids].transpose.dup
148
- end
149
-
150
- private
151
-
152
- def partial_fit(x, y, init_pred)
153
- # set up the subsample size, the pool of sample indices, and the running predictions.
154
- estimators = []
155
- n_samples = x.shape[0]
156
- n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
157
- whole_ids = Array.new(n_samples) { |v| v }
158
- y_pred = Numo::DFloat.ones(n_samples) * init_pred
159
- sub_rng = @rng.dup
160
- # grow trees one after another.
161
- @params[:n_estimators].times do |_t|
162
- # draw a random subsample of the training rows (without replacement)
163
- ids = whole_ids.sample(n_sub_samples, random: sub_rng)
164
- x_sub = x[ids, true]
165
- y_sub = y[ids]
166
- y_pred_sub = y_pred[ids]
167
- # fit a tree on the gradient and hessian computed for the subsample
168
- g = gradient(y_sub, y_pred_sub)
169
- h = hessian(n_sub_samples)
170
- tree = plant_tree(sub_rng)
171
- tree.fit(x_sub, y_sub, g, h)
172
- estimators.push(tree)
173
- # add the new tree's output for all samples to the running predictions
174
- y_pred += tree.predict(x)
175
- end
176
- estimators
177
- end
178
-
179
- # for debug
180
- #
181
- # def loss(y_true, y_pred)
182
- # ((y_true - y_pred)**2).mean
183
- # end
184
-
185
- def gradient(y_true, y_pred)
186
- y_pred - y_true
187
- end
188
-
189
- def hessian(n_samples)
190
- Numo::DFloat.ones(n_samples)
191
- end
192
-
193
- def plant_tree(sub_rng)
194
- Rumale::Tree::GradientTreeRegressor.new(
195
- reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
196
- max_depth: @params[:max_depth],
197
- max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
198
- max_features: @params[:max_features], random_seed: sub_rng.rand(Rumale::Values.int_max)
199
- )
200
- end
201
-
202
- def multivar_estimators(x, y)
203
- n_outputs = y.shape[1]
204
- if enable_parallel?
205
- parallel_map(n_outputs) { |n| partial_fit(x, y[true, n], @base_predictions[n]) }
206
- else
207
- Array.new(n_outputs) { |n| partial_fit(x, y[true, n], @base_predictions[n]) }
208
- end
209
- end
210
-
211
- def multivar_feature_importances
212
- n_outputs = @estimators.size
213
- if enable_parallel?
214
- parallel_map(n_outputs) { |n| @estimators[n].map(&:feature_importances).reduce(&:+) }.reduce(&:+)
215
- else
216
- Array.new(n_outputs) { |n| @estimators[n].map(&:feature_importances).reduce(&:+) }.reduce(&:+)
217
- end
218
- end
219
-
220
- def multivar_predict(x)
221
- n_outputs = @estimators.size
222
- p = if enable_parallel?
223
- # :nocov:
224
- parallel_map(n_outputs) do |n|
225
- @estimators[n].map { |tree| tree.predict(x) }.reduce(&:+)
226
- end
227
- # :nocov:
228
- else
229
- Array.new(n_outputs) do |n|
230
- @estimators[n].map { |tree| tree.predict(x) }.reduce(&:+)
231
- end
232
- end
233
- Numo::DFloat.asarray(p).transpose + @base_predictions
234
- end
235
- end
236
- end
237
- end
@@ -1,189 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/values'
4
- require 'rumale/base/base_estimator'
5
- require 'rumale/base/classifier'
6
- require 'rumale/tree/decision_tree_classifier'
7
-
8
- module Rumale
9
- # This module consists of the classes that implement ensemble-based methods.
10
- module Ensemble
11
- # RandomForestClassifier is a class that implements random forest for classification.
12
- #
13
- # @example
14
- # estimator =
15
- # Rumale::Ensemble::RandomForestClassifier.new(
16
- # n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
17
- # estimator.fit(training_samples, training_labels)
18
- # results = estimator.predict(testing_samples)
19
- #
20
- class RandomForestClassifier
21
- include Base::BaseEstimator
22
- include Base::Classifier
23
-
24
- # Return the set of estimators.
25
- # @return [Array<DecisionTreeClassifier>]
26
- attr_reader :estimators
27
-
28
- # Return the class labels.
29
- # @return [Numo::Int32] (size: n_classes)
30
- attr_reader :classes
31
-
32
- # Return the importance for each feature.
33
- # @return [Numo::DFloat] (size: n_features)
34
- attr_reader :feature_importances
35
-
36
- # Return the random generator for random selection of feature index.
37
- # @return [Random]
38
- attr_reader :rng
39
-
40
- # Create a new classifier with random forest.
41
- #
42
- # @param n_estimators [Integer] The number of decision trees for constructing the random forest.
43
- # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
44
- # @param max_depth [Integer] The maximum depth of the tree.
45
- # If nil is given, decision tree grows without concern for depth.
46
- # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
47
- # If nil is given, number of leaves is not limited.
48
- # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
49
- # @param max_features [Integer] The number of features to consider when searching optimal split point.
50
- # If nil is given, split process considers 'Math.sqrt(n_features)' features.
51
- # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
52
- # If nil is given, the methods do not execute in parallel.
53
- # If zero or less is given, it becomes equal to the number of processors.
54
- # This parameter is ignored if the Parallel gem is not loaded.
55
- # @param random_seed [Integer] The seed value used to initialize the random generator.
56
- # It is used to randomly determine the order of features when deciding the splitting point.
57
- def initialize(n_estimators: 10,
58
- criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
59
- max_features: nil, n_jobs: nil, random_seed: nil)
60
- check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
61
- max_features: max_features, n_jobs: n_jobs, random_seed: random_seed)
62
- check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
63
- check_params_string(criterion: criterion)
64
- check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
65
- max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
66
- max_features: max_features)
67
- @params = {}
68
- @params[:n_estimators] = n_estimators
69
- @params[:criterion] = criterion
70
- @params[:max_depth] = max_depth
71
- @params[:max_leaf_nodes] = max_leaf_nodes
72
- @params[:min_samples_leaf] = min_samples_leaf
73
- @params[:max_features] = max_features
74
- @params[:n_jobs] = n_jobs
75
- @params[:random_seed] = random_seed
76
- @params[:random_seed] ||= srand
77
- @estimators = nil
78
- @classes = nil
79
- @feature_importances = nil
80
- @rng = Random.new(@params[:random_seed])
81
- end
82
-
83
- # Fit the model with given training data.
84
- #
85
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
86
- # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
87
- # @return [RandomForestClassifier] The learned classifier itself.
88
- def fit(x, y) # rubocop:disable Metrics/AbcSize
89
- x = check_convert_sample_array(x)
90
- y = check_convert_label_array(y)
91
- check_sample_label_size(x, y)
92
- # Resolve max_features, collect the sorted class labels, and derive one random generator per tree.
93
- n_samples, n_features = x.shape
94
- @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
95
- @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
96
- @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
97
- sub_rng = @rng.dup
98
- rngs = Array.new(@params[:n_estimators]) { Random.new(sub_rng.rand(Rumale::Values.int_max)) }
99
- # Construct forest: each tree is fit on a bootstrap sample drawn with its own generator.
100
- @estimators =
101
- if enable_parallel?
102
- # :nocov:
103
- parallel_map(@params[:n_estimators]) do |n|
104
- bootstrap_ids = Array.new(n_samples) { rngs[n].rand(0...n_samples) }
105
- plant_tree(rngs[n].rand(Rumale::Values.int_max)).fit(x[bootstrap_ids, true], y[bootstrap_ids])
106
- end
107
- # :nocov:
108
- else
109
- Array.new(@params[:n_estimators]) do |n|
110
- bootstrap_ids = Array.new(n_samples) { rngs[n].rand(0...n_samples) }
111
- plant_tree(rngs[n].rand(Rumale::Values.int_max)).fit(x[bootstrap_ids, true], y[bootstrap_ids])
112
- end
113
- end
114
- @feature_importances =
115
- if enable_parallel?
116
- parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.reduce(&:+)
117
- else
118
- @estimators.map(&:feature_importances).reduce(&:+)
119
- end
120
- @feature_importances /= @feature_importances.sum
121
- self
122
- end
123
-
124
- # Predict class labels for samples by majority vote over the trees.
125
- #
126
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
127
- # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
128
- def predict(x)
129
- x = check_convert_sample_array(x)
130
- n_samples = x.shape[0]
131
- n_estimators = @estimators.size
132
- predicted = if enable_parallel?
133
- predict_set = parallel_map(n_estimators) { |n| @estimators[n].predict(x).to_a }.transpose
134
- parallel_map(n_samples) { |n| predict_set[n].group_by { |v| v }.max_by { |_k, v| v.size }.first }
135
- else
136
- predict_set = @estimators.map { |tree| tree.predict(x).to_a }.transpose
137
- Array.new(n_samples) { |n| predict_set[n].group_by { |v| v }.max_by { |_k, v| v.size }.first }
138
- end
139
- Numo::Int32.asarray(predicted)
140
- end
141
-
142
- # Predict probability for samples by averaging the per-tree class probabilities.
143
- #
144
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
145
- # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
146
- def predict_proba(x)
147
- x = check_convert_sample_array(x)
148
- n_estimators = @estimators.size
149
- if enable_parallel?
150
- parallel_map(n_estimators) { |n| predict_proba_tree(@estimators[n], x) }.reduce(&:+) / n_estimators
151
- else
152
- @estimators.map { |tree| predict_proba_tree(tree, x) }.reduce(&:+) / n_estimators
153
- end
154
- end
155
-
156
- # Return the index of the leaf that each sample reached.
157
- #
158
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
159
- # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
160
- def apply(x)
161
- x = check_convert_sample_array(x)
162
- Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose.dup
163
- end
164
-
165
- private
166
-
167
- def plant_tree(rnd_seed)
168
- Tree::DecisionTreeClassifier.new(
169
- criterion: @params[:criterion], max_depth: @params[:max_depth],
170
- max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
171
- max_features: @params[:max_features], random_seed: rnd_seed
172
- )
173
- end
174
-
175
- def predict_proba_tree(tree, x)
176
- # map each class known to the tree onto its column index in the forest-wide class list.
177
- n_samples = x.shape[0]
178
- base_classes = @classes.to_a
179
- n_classes = base_classes.size
180
- class_ids = tree.classes.map { |c| base_classes.index(c) }
181
- # copy the tree's probabilities into those columns; columns for other classes stay zero.
182
- probs = Numo::DFloat.zeros(n_samples, n_classes)
183
- tree_probs = tree.predict_proba(x)
184
- class_ids.each_with_index { |i, j| probs[true, i] = tree_probs[true, j] }
185
- probs
186
- end
187
- end
188
- end
189
- end
@@ -1,153 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/values'
4
- require 'rumale/base/base_estimator'
5
- require 'rumale/base/regressor'
6
- require 'rumale/tree/decision_tree_regressor'
7
-
8
- module Rumale
9
- module Ensemble
10
- # RandomForestRegressor is a class that implements random forest for regression.
11
- #
12
- # @example
13
- # estimator =
14
- # Rumale::Ensemble::RandomForestRegressor.new(
15
- # n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
16
- # estimator.fit(training_samples, training_values)
17
- # results = estimator.predict(testing_samples)
18
- #
19
- class RandomForestRegressor
20
- include Base::BaseEstimator
21
- include Base::Regressor
22
-
23
- # Return the set of estimators.
24
- # @return [Array<DecisionTreeRegressor>]
25
- attr_reader :estimators
26
-
27
- # Return the importance for each feature.
28
- # @return [Numo::DFloat] (size: n_features)
29
- attr_reader :feature_importances
30
-
31
- # Return the random generator for random selection of feature index.
32
- # @return [Random]
33
- attr_reader :rng
34
-
35
- # Create a new regressor with random forest.
36
- #
37
- # @param n_estimators [Integer] The number of decision trees for constructing the random forest.
38
- # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mse' and 'mae'.
39
- # @param max_depth [Integer] The maximum depth of the tree.
40
- # If nil is given, decision tree grows without concern for depth.
41
- # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
42
- # If nil is given, number of leaves is not limited.
43
- # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
44
- # @param max_features [Integer] The number of features to consider when searching optimal split point.
45
- # If nil is given, split process considers 'Math.sqrt(n_features)' features.
46
- # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
47
- # If nil is given, the methods do not execute in parallel.
48
- # If zero or less is given, it becomes equal to the number of processors.
49
- # This parameter is ignored if the Parallel gem is not loaded.
50
- # @param random_seed [Integer] The seed value used to initialize the random generator.
51
- # It is used to randomly determine the order of features when deciding the splitting point.
52
- def initialize(n_estimators: 10,
53
- criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
54
- max_features: nil, n_jobs: nil, random_seed: nil)
55
- check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
56
- max_features: max_features, n_jobs: n_jobs, random_seed: random_seed)
57
- check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
58
- check_params_string(criterion: criterion)
59
- check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
60
- max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
61
- max_features: max_features)
62
- @params = {}
63
- @params[:n_estimators] = n_estimators
64
- @params[:criterion] = criterion
65
- @params[:max_depth] = max_depth
66
- @params[:max_leaf_nodes] = max_leaf_nodes
67
- @params[:min_samples_leaf] = min_samples_leaf
68
- @params[:max_features] = max_features
69
- @params[:n_jobs] = n_jobs
70
- @params[:random_seed] = random_seed
71
- @params[:random_seed] ||= srand
72
- @estimators = nil
73
- @feature_importances = nil
74
- @rng = Random.new(@params[:random_seed])
75
- end
76
-
77
- # Fit the model with given training data.
78
- #
79
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
80
- # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
81
- # @return [RandomForestRegressor] The learned regressor itself.
82
- def fit(x, y) # rubocop:disable Metrics/AbcSize
83
- x = check_convert_sample_array(x)
84
- y = check_convert_tvalue_array(y)
85
- check_sample_tvalue_size(x, y)
86
- # Resolve max_features, detect single-target input, and derive one random generator per tree.
87
- n_samples, n_features = x.shape
88
- @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
89
- @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
90
- single_target = y.shape[1].nil?
91
- sub_rng = @rng.dup
92
- rngs = Array.new(@params[:n_estimators]) { Random.new(sub_rng.rand(Rumale::Values.int_max)) }
93
- # Construct forest: each tree is fit on a bootstrap sample drawn with its own generator.
94
- @estimators =
95
- if enable_parallel?
96
- # :nocov:
97
- parallel_map(@params[:n_estimators]) do |n|
98
- bootstrap_ids = Array.new(n_samples) { rngs[n].rand(0...n_samples) }
99
- tree = plant_tree(rngs[n].rand(Rumale::Values.int_max))
100
- tree.fit(x[bootstrap_ids, true], single_target ? y[bootstrap_ids] : y[bootstrap_ids, true])
101
- end
102
- # :nocov:
103
- else
104
- Array.new(@params[:n_estimators]) do |n|
105
- bootstrap_ids = Array.new(n_samples) { rngs[n].rand(0...n_samples) }
106
- tree = plant_tree(rngs[n].rand(Rumale::Values.int_max))
107
- tree.fit(x[bootstrap_ids, true], single_target ? y[bootstrap_ids] : y[bootstrap_ids, true])
108
- end
109
- end
110
- @feature_importances =
111
- if enable_parallel?
112
- parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.reduce(&:+)
113
- else
114
- @estimators.map(&:feature_importances).reduce(&:+)
115
- end
116
- @feature_importances /= @feature_importances.sum
117
- self
118
- end
119
-
120
- # Predict values for samples by averaging the predictions of the trees.
121
- #
122
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
123
- # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
124
- def predict(x)
125
- x = check_convert_sample_array(x)
126
- if enable_parallel?
127
- parallel_map(@params[:n_estimators]) { |n| @estimators[n].predict(x) }.reduce(&:+) / @params[:n_estimators]
128
- else
129
- @estimators.map { |tree| tree.predict(x) }.reduce(&:+) / @params[:n_estimators]
130
- end
131
- end
132
-
133
- # Return the index of the leaf that each sample reached.
134
- #
135
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
136
- # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
137
- def apply(x)
138
- x = check_convert_sample_array(x)
139
- Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose.dup
140
- end
141
-
142
- private
143
-
144
- def plant_tree(rnd_seed)
145
- Tree::DecisionTreeRegressor.new(
146
- criterion: @params[:criterion], max_depth: @params[:max_depth],
147
- max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
148
- max_features: @params[:max_features], random_seed: rnd_seed
149
- )
150
- end
151
- end
152
- end
153
- end