rumale 0.23.3 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
@@ -1,124 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/utils'
4
- require 'rumale/base/base_estimator'
5
- require 'rumale/base/transformer'
6
-
7
- module Rumale
8
- module Decomposition
9
- # NMF is a class that implements Non-negative Matrix Factorization.
10
- #
11
- # @example
12
- # decomposer = Rumale::Decomposition::NMF.new(n_components: 2)
13
- # representation = decomposer.fit_transform(samples)
14
- #
15
- # *Reference*
16
- # - Xu, W., Liu, X., and Gong, Y., "Document Clustering Based On Non-negative Matrix Factorization," Proc. SIGIR' 03 , pp. 267--273, 2003.
17
- class NMF
18
- include Base::BaseEstimator
19
- include Base::Transformer
20
-
21
- # Returns the factorization matrix.
22
- # @return [Numo::DFloat] (shape: [n_components, n_features])
23
- attr_reader :components
24
-
25
- # Return the random generator.
26
- # @return [Random]
27
- attr_reader :rng
28
-
29
# Build an NMF transformer and remember its hyper-parameters.
#
# @param n_components [Integer] The number of components.
# @param max_iter [Integer] The maximum number of iterations.
# @param tol [Float] The tolerance of termination criterion.
# @param eps [Float] A small value close to zero to avoid zero division error.
# @param random_seed [Integer] The seed value using to initialize the random generator.
#   When nil is given, the value returned by Kernel#srand is stored and used instead.
def initialize(n_components: 2, max_iter: 500, tol: 1.0e-4, eps: 1.0e-16, random_seed: nil)
  numeric_params = { n_components: n_components, max_iter: max_iter, tol: tol, eps: eps }
  check_params_numeric(**numeric_params)
  check_params_numeric_or_nil(random_seed: random_seed)
  check_params_positive(**numeric_params)
  @params = numeric_params.merge(random_seed: random_seed || srand)
  @components = nil
  @rng = Random.new(@params[:random_seed])
end
50
-
51
# Learn the factorization matrix from the given training data.
#
# @overload fit(x) -> NMF
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
# @return [NMF] The learned transformer itself.
def fit(x, _y = nil)
  partial_fit(check_convert_sample_array(x))
  self
end
62
-
63
# Learn the factorization matrix from the training data and return the
# coefficients obtained for that same data.
#
# @overload fit_transform(x) -> Numo::DFloat
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
def fit_transform(x, _y = nil)
  samples = check_convert_sample_array(x)
  partial_fit(samples, update_comps: true)
end
73
-
74
# Compute the coefficients of the given data while keeping the learned
# factorization matrix fixed.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
def transform(x)
  samples = check_convert_sample_array(x)
  partial_fit(samples, update_comps: false)
end
82
-
83
# Map transformed data back to the original space by multiplying it with the
# learned factorization matrix.
#
# @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
def inverse_transform(z)
  check_convert_sample_array(z).dot(@components)
end
91
-
92
- private
93
-
94
# Run the multiplicative update rule on x and return the coefficient matrix.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be factorized.
# @param update_comps [Boolean] Whether @components is re-initialized and updated.
#   When false, the stored @components is used as-is and only the coefficients are optimized.
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The coefficient matrix.
def partial_fit(x, update_comps: true)
  n_samples, n_features = x.shape
  n_components = @params[:n_components]
  eps = @params[:eps]
  init_scale = Math.sqrt(x.mean / n_components)
  # Draw from a copy so that @rng itself is never advanced.
  sub_rng = @rng.dup
  @components = Rumale::Utils.rand_uniform([n_components, n_features], sub_rng) * init_scale if update_comps
  weights = Rumale::Utils.rand_uniform([n_samples, n_components], sub_rng) * init_scale
  @params[:max_iter].times do
    if update_comps
      weights_t = weights.transpose
      @components *= (weights_t.dot(x) / (weights_t.dot(weights).dot(@components) + eps))
    end
    comps_t = @components.transpose
    weights *= (x.dot(comps_t) / (weights.dot(@components).dot(comps_t) + eps))
    # Rescale by the length of each component row; the coefficients absorb the scale.
    row_norms = Numo::NMath.sqrt((@components**2).sum(1)) + eps
    @components /= row_norms.expand_dims(1) if update_comps
    weights *= row_norms
    # Stop once the mean squared reconstruction error per sample drops below tol.
    residual = x - weights.dot(@components)
    break if (residual**2).sum(1).mean < @params[:tol]
  end
  weights
end
122
- end
123
- end
124
- end
@@ -1,159 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/transformer'
5
-
6
- module Rumale
7
- # Module for matrix decomposition algorithms.
8
- module Decomposition
9
- # PCA is a class that implements Principal Component Analysis.
10
- #
11
- # @example
12
- # decomposer = Rumale::Decomposition::PCA.new(n_components: 2, solver: 'fpt')
13
- # representaion = decomposer.fit_transform(samples)
14
- #
15
- # # If Numo::Linalg is installed, you can specify 'evd' for the solver option.
16
- # require 'numo/linalg/autoloader'
17
- # decomposer = Rumale::Decomposition::PCA.new(n_components: 2, solver: 'evd')
18
- # representaion = decomposer.fit_transform(samples)
19
- #
20
- # # If Numo::Linalg is loaded and the solver option is not given,
21
- # # the 'evd' solver is chosen automatically.
22
- # decomposer = Rumale::Decomposition::PCA.new(n_components: 2)
23
- # representaion = decomposer.fit_transform(samples)
24
- #
25
- # *Reference*
26
- # - Sharma, A., and Paliwal, K K., "Fast principal component analysis using fixed-point algorithm," Pattern Recognition Letters, 28, pp. 1151--1155, 2007.
27
- class PCA
28
- include Base::BaseEstimator
29
- include Base::Transformer
30
-
31
- # Returns the principal components.
32
- # @return [Numo::DFloat] (shape: [n_components, n_features])
33
- attr_reader :components
34
-
35
- # Returns the mean vector.
36
- # @return [Numo::DFloat] (shape: [n_features])
37
- attr_reader :mean
38
-
39
- # Return the random generator.
40
- # @return [Random]
41
- attr_reader :rng
42
-
43
# Build a PCA transformer and remember its hyper-parameters.
#
# @param n_components [Integer] The number of principal components.
# @param solver [String] The algorithm for the optimization ('auto', 'fpt' or 'evd').
#   'auto' chooses the 'evd' solver if Numo::Linalg is loaded. Otherwise, it chooses the 'fpt' solver.
#   'fpt' uses the fixed-point algorithm.
#   'evd' performs eigen value decomposition of the covariance matrix of samples.
#   Any other string is treated as 'fpt'.
# @param max_iter [Integer] The maximum number of iterations. If solver = 'evd', this parameter is ignored.
# @param tol [Float] The tolerance of termination criterion. If solver = 'evd', this parameter is ignored.
# @param random_seed [Integer] The seed value using to initialize the random generator.
#   When nil is given, the value returned by Kernel#srand is stored and used instead.
def initialize(n_components: 2, solver: 'auto', max_iter: 100, tol: 1.0e-4, random_seed: nil)
  check_params_numeric(n_components: n_components, max_iter: max_iter, tol: tol)
  check_params_string(solver: solver)
  check_params_numeric_or_nil(random_seed: random_seed)
  check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol)
  chosen_solver = case solver
                  when 'auto' then load_linalg? ? 'evd' : 'fpt'
                  when 'evd' then 'evd'
                  else 'fpt'
                  end
  @params = {
    solver: chosen_solver,
    n_components: n_components,
    max_iter: max_iter,
    tol: tol,
    random_seed: random_seed || srand
  }
  @components = nil
  @mean = nil
  @rng = Random.new(@params[:random_seed])
end
73
-
74
# Learn the mean vector and the principal components from the training data.
#
# @overload fit(x) -> PCA
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
# @return [PCA] The learned transformer itself.
def fit(x, _y = nil)
  x = check_convert_sample_array(x)
  @components = nil
  n_samples, n_features = x.shape
  n_components = @params[:n_components]
  # Draw from a copy so that @rng itself is never advanced.
  sub_rng = @rng.dup
  @mean = x.mean(0)
  deviations = x - @mean
  scatter = deviations.transpose.dot(deviations) / (n_samples - 1)
  if @params[:solver] == 'evd' && enable_linalg?
    # Request the last n_components eigenpairs and store them in reversed order.
    _, evecs = Numo::Linalg.eigh(scatter, vals_range: (n_features - n_components)...n_features)
    comps = evecs.reverse(1).transpose
    @components = n_components == 1 ? comps[0, true].dup : comps.dup
    return self
  end
  # Fixed-point iteration: one component at a time, each one orthogonalized
  # against the components found before it.
  n_components.times do
    direction = Rumale::Utils.rand_uniform(n_features, sub_rng)
    @params[:max_iter].times do
      candidate = orthogonalize(scatter.dot(direction))
      break if (candidate.dot(direction) - 1).abs < @params[:tol]

      direction = candidate
    end
    @components = @components.nil? ? direction : Numo::NArray.vstack([@components, direction])
  end
  self
end
109
-
110
# Learn the model from the training data and return that data projected onto
# the learned components.
#
# @overload fit_transform(x) -> Numo::DFloat
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
def fit_transform(x, _y = nil)
  samples = check_convert_sample_array(x)
  fit(samples)
  transform(samples)
end
120
-
121
# Center the given data with the learned mean and project it onto the learned components.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
def transform(x)
  centered = check_convert_sample_array(x) - @mean
  centered.dot(@components.transpose)
end
129
-
130
# Map transformed data back to the original space using the learned components and mean.
#
# @param z [Numo::DFloat] (shape: [n_samples, n_components]) The data to be restored into original space with the learned model.
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The restored data.
def inverse_transform(z)
  z = check_convert_sample_array(z)
  basis = @components
  # A single stored component is a 1-D vector; promote it to a one-row matrix.
  basis = basis.expand_dims(0) if basis.shape[1].nil?
  z.dot(basis) + @mean
end
139
-
140
- private
141
-
142
# Report whether a sufficiently recent Numo::Linalg is loaded.
#
# The versions are compared with Gem::Version rather than as plain strings:
# a String comparison is lexicographic, so '0.1.10' would sort before '0.1.4'
# and a newer library would be rejected.
#
# @return [Boolean] true if Numo::Linalg is defined and its VERSION is 0.1.4 or later.
def load_linalg?
  return false unless defined?(Numo::Linalg)

  Gem::Version.new(Numo::Linalg::VERSION) >= Gem::Version.new('0.1.4')
end
148
-
149
# Remove from pcvec its projections onto the components found so far, then
# scale the result to unit length.
#
# @param pcvec [Numo::DFloat] (shape: [n_features]) The candidate component vector.
# @return [Numo::DFloat] (shape: [n_features]) The orthogonalized and normalized vector.
def orthogonalize(pcvec)
  unless @components.nil?
    delta = @components.dot(pcvec) * @components.transpose
    # delta is 2-D once @components is a matrix; sum over axis 1 to get one vector.
    delta = delta.sum(1) unless delta.shape[1].nil?
    pcvec -= delta
  end
  # The small constant guards the division, so it must be part of the
  # denominator; adding it after the division would shift every element
  # instead and leave a zero-length vector dividing by zero.
  norm = Math.sqrt((pcvec**2).sum.abs)
  pcvec / (norm + 1.0e-12)
end
157
- end
158
- end
159
- end
@@ -1,179 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/values'
4
- require 'rumale/utils'
5
- require 'rumale/base/base_estimator'
6
- require 'rumale/base/classifier'
7
- require 'rumale/tree/decision_tree_classifier'
8
-
9
- module Rumale
10
- module Ensemble
11
- # AdaBoostClassifier is a class that implements AdaBoost (SAMME.R) for classification.
12
- # This class uses decision tree for a weak learner.
13
- #
14
- # @example
15
- # estimator =
16
- # Rumale::Ensemble::AdaBoostClassifier.new(
17
- # n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
18
- # estimator.fit(training_samples, training_labels)
19
- # results = estimator.predict(testing_samples)
20
- #
21
- # *Reference*
22
- # - Zhu, J., Rosset, S., Zou, H., and Hastie, T., "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
23
- class AdaBoostClassifier
24
- include Base::BaseEstimator
25
- include Base::Classifier
26
-
27
- # Return the set of estimators.
28
- # @return [Array<DecisionTreeClassifier>]
29
- attr_reader :estimators
30
-
31
- # Return the class labels.
32
- # @return [Numo::Int32] (size: n_classes)
33
- attr_reader :classes
34
-
35
- # Return the importance for each feature.
36
- # @return [Numo::DFloat] (size: n_features)
37
- attr_reader :feature_importances
38
-
39
- # Return the random generator for random selection of feature index.
40
- # @return [Random]
41
- attr_reader :rng
42
-
43
# Build an AdaBoost classifier and remember its hyper-parameters.
#
# @param n_estimators [Integer] The number of decision trees for constructing AdaBoost classifier.
# @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
# @param max_depth [Integer] The maximum depth of the tree.
#   If nil is given, decision tree grows without concern for depth.
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
#   If nil is given, number of leaves is not limited.
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
# @param max_features [Integer] The number of features to consider when searching optimal split point.
#   If nil is given, split process considers all features.
# @param random_seed [Integer] The seed value using to initialize the random generator.
#   It is used to randomly determine the order of features when deciding splitting point.
#   When nil is given, the value returned by Kernel#srand is stored and used instead.
def initialize(n_estimators: 50,
               criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
               max_features: nil, random_seed: nil)
  check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                              max_features: max_features, random_seed: random_seed)
  check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
  check_params_string(criterion: criterion)
  check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
                        max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
                        max_features: max_features)
  @params = {
    n_estimators: n_estimators,
    criterion: criterion,
    max_depth: max_depth,
    max_leaf_nodes: max_leaf_nodes,
    min_samples_leaf: min_samples_leaf,
    max_features: max_features,
    random_seed: random_seed || srand
  }
  @estimators = nil
  @classes = nil
  @feature_importances = nil
  @rng = Random.new(@params[:random_seed])
end
80
-
81
# Fit the model with given training data.
#
# Boosting stops early when a resampled training set no longer contains every
# class, when a tree makes no weighted error, or when the observation weights
# sum to zero, so fewer than n_estimators trees may be kept.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
# @return [AdaBoostClassifier] The learned classifier itself.
def fit(x, y) # rubocop:disable Metrics/AbcSize
  x = check_convert_sample_array(x)
  y = check_convert_label_array(y)
  check_sample_label_size(x, y)
  ## Initialize some variables.
  n_samples, n_features = x.shape
  @estimators = []
  @feature_importances = Numo::DFloat.zeros(n_features)
  @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
  @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
  @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
  n_classes = @classes.shape[0]
  # Draw from a copy so that @rng itself is never advanced.
  sub_rng = @rng.dup
  ## Boosting.
  # Label -> column lookup, built once instead of scanning the class list per sample.
  class_index = @classes.to_a.each_with_index.to_h
  y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
  n_samples.times { |n| y_codes[n, class_index[y[n]]] = 1.0 }
  observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
  @params[:n_estimators].times do |_t|
    # Fit classifier on a weighted resample of the training data.
    ids = Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
    break if y[ids].to_a.uniq.size != n_classes

    tree = Tree::DecisionTreeClassifier.new(
      criterion: @params[:criterion], max_depth: @params[:max_depth],
      max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
      max_features: @params[:max_features], random_seed: sub_rng.rand(Rumale::Values.int_max)
    )
    tree.fit(x[ids, true], y[ids])
    # Calculate estimator error.
    proba = tree.predict_proba(x).clip(1.0e-15, nil)
    predicted = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
    inds = predicted.ne(y)
    error = (observation_weights * inds).sum / observation_weights.sum
    # Store model.
    @estimators.push(tree)
    @feature_importances += tree.feature_importances
    break if error.zero?

    # Update observation weights.
    log_proba = Numo::NMath.log(proba)
    observation_weights *= Numo::NMath.exp(-1.0 * (n_classes - 1).fdiv(n_classes) * (y_codes * log_proba).sum(1))
    observation_weights = observation_weights.clip(1.0e-15, nil)
    sum_observation_weights = observation_weights.sum
    break if sum_observation_weights.zero?

    observation_weights /= sum_observation_weights
  end
  # Normalize only when there is something to normalize: with no tree kept, or
  # trees reporting all-zero importances, dividing by the sum would yield NaN.
  importance_sum = @feature_importances.sum
  @feature_importances /= importance_sum unless importance_sum.zero?
  self
end
137
-
138
# Compute per-class confidence scores, averaged over the stored trees.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
def decision_function(x)
  x = check_convert_sample_array(x)
  n_samples, = x.shape
  n_classes = @classes.size
  scores = Numo::DFloat.zeros(n_samples, n_classes)
  @estimators.each do |tree|
    log_proba = Numo::NMath.log(tree.predict_proba(x).clip(1.0e-15, nil))
    # Column vector holding the mean log-probability of each sample.
    row_means = 1.fdiv(n_classes) * Numo::DFloat[log_proba.sum(1)].transpose
    scores += (n_classes - 1) * (log_proba - row_means)
  end
  scores / @estimators.size
end
153
-
154
# Predict the class label with the highest confidence score for each sample.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
def predict(x)
  x = check_convert_sample_array(x)
  scores = decision_function(x)
  labels = Array.new(x.shape[0]) { |row| @classes[scores[row, true].max_index] }
  Numo::Int32.asarray(labels)
end
164
-
165
# Turn the confidence scores into per-class probabilities that sum to one for each sample.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
def predict_proba(x)
  x = check_convert_sample_array(x)
  n_classes = @classes.size
  unnormalized = Numo::NMath.exp(1.fdiv(n_classes - 1) * decision_function(x))
  unnormalized / Numo::DFloat[unnormalized.sum(1)].transpose
end
177
- end
178
- end
179
- end
@@ -1,160 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/values'
4
- require 'rumale/base/base_estimator'
5
- require 'rumale/base/regressor'
6
- require 'rumale/tree/decision_tree_regressor'
7
-
8
- module Rumale
9
- module Ensemble
10
- # AdaBoostRegressor is a class that implements AdaBoost.RT for regression.
11
- # This class uses decision tree for a weak learner.
12
- #
13
- # @example
14
- # estimator =
15
- # Rumale::Ensemble::AdaBoostRegressor.new(
16
- # n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
17
- # estimator.fit(training_samples, training_values)
18
- # results = estimator.predict(testing_samples)
19
- #
20
- # *Reference*
21
- # - Shrestha, D. L., and Solomatine, D. P., "Experiments with AdaBoost.RT, an Improved Boosting Scheme for Regression," Neural Computation 18 (7), pp. 1678--1710, 2006.
22
- class AdaBoostRegressor
23
- include Base::BaseEstimator
24
- include Base::Regressor
25
-
26
- # Return the set of estimators.
27
- # @return [Array<DecisionTreeRegressor>]
28
- attr_reader :estimators
29
-
30
- # Return the weight for each weak learner.
31
- # @return [Numo::DFloat] (size: n_estimators)
32
- attr_reader :estimator_weights
33
-
34
- # Return the importance for each feature.
35
- # @return [Numo::DFloat] (size: n_features)
36
- attr_reader :feature_importances
37
-
38
- # Return the random generator for random selection of feature index.
39
- # @return [Random]
40
- attr_reader :rng
41
-
42
# Create a new regressor with AdaBoost.RT.
#
# @param n_estimators [Integer] The number of decision trees for constructing AdaBoost regressor.
# @param threshold [Float] The threshold for delimiting correct and incorrect predictions. That is constrained to [0, 1]
# @param exponent [Float] The exponent for the weight of each weak learner.
# @param criterion [String] The function to evaluate splitting point.
#   It is passed as-is to Tree::DecisionTreeRegressor; the default is 'mse'.
# @param max_depth [Integer] The maximum depth of the tree.
#   If nil is given, decision tree grows without concern for depth.
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
#   If nil is given, number of leaves is not limited.
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
# @param max_features [Integer] The number of features to consider when searching optimal split point.
#   If nil is given, split process considers all features.
# @param random_seed [Integer] The seed value using to initialize the random generator.
#   It is used to randomly determine the order of features when deciding splitting point.
#   When nil is given, the value returned by Kernel#srand is stored and used instead.
def initialize(n_estimators: 10, threshold: 0.2, exponent: 1.0,
               criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
               max_features: nil, random_seed: nil)
  check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                              max_features: max_features, random_seed: random_seed)
  check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf,
                       threshold: threshold, exponent: exponent)
  check_params_string(criterion: criterion)
  check_params_positive(n_estimators: n_estimators, threshold: threshold, exponent: exponent,
                        max_depth: max_depth,
                        max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
                        max_features: max_features)
  @params = {}
  @params[:n_estimators] = n_estimators
  @params[:threshold] = threshold
  @params[:exponent] = exponent
  @params[:criterion] = criterion
  @params[:max_depth] = max_depth
  @params[:max_leaf_nodes] = max_leaf_nodes
  @params[:min_samples_leaf] = min_samples_leaf
  @params[:max_features] = max_features
  @params[:random_seed] = random_seed
  @params[:random_seed] ||= srand
  @estimators = nil
  # Initialized like the other reader-backed attributes so that
  # estimator_weights is explicitly nil before fit is called.
  @estimator_weights = nil
  @feature_importances = nil
  @rng = Random.new(@params[:random_seed])
end
84
-
85
# Fit the model with given training data.
#
# Boosting stops early when a weak learner has no prediction whose relative
# error exceeds the threshold (that learner is not stored), or when the
# observation weights sum to zero, so fewer than n_estimators trees may be kept.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
# @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
# @raise [ArgumentError] If y is not a 1-D array.
# @return [AdaBoostRegressor] The learned regressor itself.
def fit(x, y) # rubocop:disable Metrics/AbcSize
  x = check_convert_sample_array(x)
  y = check_convert_tvalue_array(y)
  check_sample_tvalue_size(x, y)
  # Check target values
  raise ArgumentError, 'Expect target value vector to be 1-D arrray' unless y.shape.size == 1

  # Initialize some variables.
  n_samples, n_features = x.shape
  @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
  @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
  observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
  @estimators = []
  @estimator_weights = []
  @feature_importances = Numo::DFloat.zeros(n_features)
  # Draw from a copy so that @rng itself is never advanced.
  sub_rng = @rng.dup
  # Construct forest.
  @params[:n_estimators].times do |_t|
    # Fit weak learner on a weighted resample of the training data.
    ids = Rumale::Utils.choice_ids(n_samples, observation_weights, sub_rng)
    tree = Tree::DecisionTreeRegressor.new(
      criterion: @params[:criterion], max_depth: @params[:max_depth],
      max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
      max_features: @params[:max_features], random_seed: sub_rng.rand(Rumale::Values.int_max)
    )
    tree.fit(x[ids, true], y[ids])
    predicted = tree.predict(x)
    # Calculate errors: total weight of the samples whose relative error exceeds the threshold.
    abs_err = ((predicted - y) / y).abs
    err = observation_weights[abs_err.gt(@params[:threshold])].sum
    break if err <= 0.0

    # Calculate weight.
    beta = err**@params[:exponent]
    weight = Math.log(1.fdiv(beta))
    # Store model.
    @estimators.push(tree)
    @estimator_weights.push(weight)
    @feature_importances += weight * tree.feature_importances
    # Update observation weights: shrink the weight of well-predicted samples.
    update = Numo::DFloat.ones(n_samples)
    update[abs_err.le(@params[:threshold])] = beta
    observation_weights *= update
    observation_weights = observation_weights.clip(1.0e-15, nil)
    sum_observation_weights = observation_weights.sum
    break if sum_observation_weights.zero?

    observation_weights /= sum_observation_weights
  end
  @estimator_weights = Numo::DFloat.asarray(@estimator_weights)
  # Normalize only when the total weight is non-zero: with no learner kept, or
  # only zero-weight learners, the division would fill the importances with NaN.
  weight_sum = @estimator_weights.sum
  @feature_importances /= weight_sum unless weight_sum.zero?
  self
end
143
-
144
# Predict values as the weighted average of the predictions of the stored trees.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
# @return [Numo::DFloat] (shape: [n_samples]) Predicted value per sample.
def predict(x)
  x = check_convert_sample_array(x)
  n_samples, = x.shape
  weighted_sum = @estimators.each_with_index.reduce(Numo::DFloat.zeros(n_samples)) do |acc, (tree, t)|
    acc + @estimator_weights[t] * tree.predict(x)
  end
  weighted_sum / @estimator_weights.sum
end
158
- end
159
- end
160
- end