rumale 0.23.3 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
@@ -1,139 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/tree/extra_tree_classifier'
4
- require 'rumale/ensemble/random_forest_classifier'
5
-
6
- module Rumale
7
- module Ensemble
8
- # ExtraTreesClassifier is a class that implements extremely randomized trees for classification.
9
- # The algorithm of extremely randomized trees is similar to random forest.
10
- # The features of the algorithm of extremely randomized trees are
11
- # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
12
- #
13
- # @example
14
- # estimator =
15
- # Rumale::Ensemble::ExtraTreesClassifier.new(
16
- # n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
17
- # estimator.fit(training_samples, training_labels)
18
- # results = estimator.predict(testing_samples)
19
- #
20
- # *Reference*
21
- # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
22
- class ExtraTreesClassifier < RandomForestClassifier
23
- # Return the set of estimators.
24
- # @return [Array<ExtraTreeClassifier>]
25
- attr_reader :estimators
26
-
27
- # Return the class labels.
28
- # @return [Numo::Int32] (size: n_classes)
29
- attr_reader :classes
30
-
31
- # Return the importance for each feature.
32
- # @return [Numo::DFloat] (size: n_features)
33
- attr_reader :feature_importances
34
-
35
- # Return the random generator for random selection of feature index.
36
- # @return [Random]
37
- attr_reader :rng
38
-
39
- # Create a new classifier with extremely randomized trees.
40
- #
41
- # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
42
- # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
43
- # @param max_depth [Integer] The maximum depth of the tree.
44
- # If nil is given, extra tree grows without concern for depth.
45
- # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
46
- # If nil is given, number of leaves is not limited.
47
- # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
48
- # @param max_features [Integer] The number of features to consider when searching optimal split point.
49
- # If nil is given, split process considers 'Math.sqrt(n_features)' features.
50
- # @param n_jobs [Integer] The number of jobs for running the fit method in parallel.
51
- # If nil is given, the method does not execute in parallel.
52
- # If zero or less is given, it becomes equal to the number of processors.
53
- # This parameter is ignored if the Parallel gem is not loaded.
54
- # @param random_seed [Integer] The seed value used to initialize the random generator.
55
- # It is used to randomly determine the order of features when deciding splitting point.
56
- def initialize(n_estimators: 10,
57
- criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
58
- max_features: nil, n_jobs: nil, random_seed: nil)
59
- check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
60
- max_features: max_features, n_jobs: n_jobs, random_seed: random_seed)
61
- check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
62
- check_params_string(criterion: criterion)
63
- check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
64
- max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
65
- max_features: max_features)
66
- super
67
- end
68
-
69
- # Fit the model with given training data.
70
- #
71
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
72
- # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
73
- # @return [ExtraTreesClassifier] The learned classifier itself.
74
- def fit(x, y)
75
- x = check_convert_sample_array(x)
76
- y = check_convert_label_array(y)
77
- check_sample_label_size(x, y)
78
- # Initialize some variables.
79
- n_features = x.shape[1]
80
- @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
81
- @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
82
- @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
83
- sub_rng = @rng.dup
84
- # Construct trees.
85
- rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(Rumale::Values.int_max) }
86
- @estimators = if enable_parallel?
87
- parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
88
- else
89
- Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
90
- end
91
- @feature_importances =
92
- if enable_parallel?
93
- parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.reduce(&:+)
94
- else
95
- @estimators.map(&:feature_importances).reduce(&:+)
96
- end
97
- @feature_importances /= @feature_importances.sum
98
- self
99
- end
100
-
101
- # Predict class labels for samples.
102
- #
103
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
104
- # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
105
- def predict(x)
106
- x = check_convert_sample_array(x)
107
- super
108
- end
109
-
110
- # Predict probability for samples.
111
- #
112
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
113
- # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
114
- def predict_proba(x)
115
- x = check_convert_sample_array(x)
116
- super
117
- end
118
-
119
- # Return the index of the leaf that each sample reached.
120
- #
121
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
122
- # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
123
- def apply(x)
124
- x = check_convert_sample_array(x)
125
- super
126
- end
127
-
128
- private
129
-
130
- def plant_tree(rnd_seed)
131
- Tree::ExtraTreeClassifier.new(
132
- criterion: @params[:criterion], max_depth: @params[:max_depth],
133
- max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
134
- max_features: @params[:max_features], random_seed: rnd_seed
135
- )
136
- end
137
- end
138
- end
139
- end
@@ -1,125 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/tree/extra_tree_regressor'
4
- require 'rumale/ensemble/random_forest_regressor'
5
-
6
- module Rumale
7
- module Ensemble
8
- # ExtraTreesRegressor is a class that implements extremely randomized trees for regression.
9
- # The algorithm of extremely randomized trees is similar to random forest.
10
- # The features of the algorithm of extremely randomized trees are
11
- # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
12
- #
13
- # @example
14
- # estimator =
15
- # Rumale::Ensemble::ExtraTreesRegressor.new(
16
- # n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
17
- # estimator.fit(training_samples, training_values)
18
- # results = estimator.predict(testing_samples)
19
- #
20
- # *Reference*
21
- # - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
22
- class ExtraTreesRegressor < RandomForestRegressor
23
- # Return the set of estimators.
24
- # @return [Array<ExtraTreeRegressor>]
25
- attr_reader :estimators
26
-
27
- # Return the importance for each feature.
28
- # @return [Numo::DFloat] (size: n_features)
29
- attr_reader :feature_importances
30
-
31
- # Return the random generator for random selection of feature index.
32
- # @return [Random]
33
- attr_reader :rng
34
-
35
- # Create a new regressor with extremely randomized trees.
36
- #
37
- # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
38
- # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mse' and 'mae'.
39
- # @param max_depth [Integer] The maximum depth of the tree.
40
- # If nil is given, extra tree grows without concern for depth.
41
- # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
42
- # If nil is given, number of leaves is not limited.
43
- # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
44
- # @param max_features [Integer] The number of features to consider when searching optimal split point.
45
- # If nil is given, split process considers 'Math.sqrt(n_features)' features.
46
- # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
47
- # If nil is given, the methods do not execute in parallel.
48
- # If zero or less is given, it becomes equal to the number of processors.
49
- # This parameter is ignored if the Parallel gem is not loaded.
50
- # @param random_seed [Integer] The seed value used to initialize the random generator.
51
- # It is used to randomly determine the order of features when deciding splitting point.
52
- def initialize(n_estimators: 10,
53
- criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
54
- max_features: nil, n_jobs: nil, random_seed: nil)
55
- check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
56
- max_features: max_features, n_jobs: n_jobs, random_seed: random_seed)
57
- check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
58
- check_params_string(criterion: criterion)
59
- check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
60
- max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
61
- max_features: max_features)
62
- super
63
- end
64
-
65
- # Fit the model with given training data.
66
- #
67
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
68
- # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
69
- # @return [ExtraTreesRegressor] The learned regressor itself.
70
- def fit(x, y)
71
- x = check_convert_sample_array(x)
72
- y = check_convert_tvalue_array(y)
73
- check_sample_tvalue_size(x, y)
74
- # Initialize some variables.
75
- n_features = x.shape[1]
76
- @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
77
- @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
78
- sub_rng = @rng.dup
79
- # Construct forest.
80
- rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(Rumale::Values.int_max) }
81
- @estimators = if enable_parallel?
82
- parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
83
- else
84
- Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
85
- end
86
- @feature_importances =
87
- if enable_parallel?
88
- parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.reduce(&:+)
89
- else
90
- @estimators.map(&:feature_importances).reduce(&:+)
91
- end
92
- @feature_importances /= @feature_importances.sum
93
- self
94
- end
95
-
96
- # Predict values for samples.
97
- #
98
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
99
- # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
100
- def predict(x)
101
- x = check_convert_sample_array(x)
102
- super
103
- end
104
-
105
- # Return the index of the leaf that each sample reached.
106
- #
107
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
108
- # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
109
- def apply(x)
110
- x = check_convert_sample_array(x)
111
- super
112
- end
113
-
114
- private
115
-
116
- def plant_tree(rnd_seed)
117
- Tree::ExtraTreeRegressor.new(
118
- criterion: @params[:criterion], max_depth: @params[:max_depth],
119
- max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
120
- max_features: @params[:max_features], random_seed: rnd_seed
121
- )
122
- end
123
- end
124
- end
125
- end
@@ -1,306 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/values'
4
- require 'rumale/base/base_estimator'
5
- require 'rumale/base/classifier'
6
- require 'rumale/tree/gradient_tree_regressor'
7
-
8
- module Rumale
9
- module Ensemble
10
- # GradientBoostingClassifier is a class that implements gradient tree boosting for classification.
11
- # The class uses negative binomial log-likelihood for the loss function.
12
- # For multiclass classification problem, it uses one-vs-the-rest strategy.
13
- #
14
- # @example
15
- # estimator =
16
- # Rumale::Ensemble::GradientBoostingClassifier.new(
17
- # n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
18
- # estimator.fit(training_samples, training_values)
19
- # results = estimator.predict(testing_samples)
20
- #
21
- # *Reference*
22
- # - Friedman, J H., "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
23
- # - Friedman, J H., "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
24
- # - Chen, T., and Guestrin, C., "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
25
- #
26
- class GradientBoostingClassifier
27
- include Base::BaseEstimator
28
- include Base::Classifier
29
-
30
- # Return the set of estimators.
31
- # @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
32
- attr_reader :estimators
33
-
34
- # Return the class labels.
35
- # @return [Numo::Int32] (size: n_classes)
36
- attr_reader :classes
37
-
38
- # Return the importance for each feature.
39
- # The feature importances are calculated based on the numbers of times the feature is used for splitting.
40
- # @return [Numo::DFloat] (size: n_features)
41
- attr_reader :feature_importances
42
-
43
- # Return the random generator for random selection of feature index.
44
- # @return [Random]
45
- attr_reader :rng
46
-
47
- # Create a new classifier with gradient tree boosting.
48
- #
49
- # @param n_estimators [Integer] The number of trees for constructing classifier.
50
- # @param learning_rate [Float] The boosting learning rate.
51
- # @param reg_lambda [Float] The L2 regularization term on weight.
52
- # @param subsample [Float] The subsampling ratio of the training samples.
53
- # @param max_depth [Integer] The maximum depth of the tree.
54
- # If nil is given, decision tree grows without concern for depth.
55
- # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
56
- # If nil is given, number of leaves is not limited.
57
- # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
58
- # @param max_features [Integer] The number of features to consider when searching optimal split point.
59
- # If nil is given, split process considers all features.
60
- # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
61
- # If nil is given, the methods do not execute in parallel.
62
- # If zero or less is given, it becomes equal to the number of processors.
63
- # This parameter is ignored if the Parallel gem is not loaded.
64
- # @param random_seed [Integer] The seed value used to initialize the random generator.
65
- # It is used to randomly determine the order of features when deciding splitting point.
66
- def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
67
- max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
68
- max_features: nil, n_jobs: nil, random_seed: nil)
69
- check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
70
- max_features: max_features, n_jobs: n_jobs, random_seed: random_seed)
71
- check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf,
72
- learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
73
- check_params_positive(n_estimators: n_estimators, learning_rate: learning_rate, reg_lambda: reg_lambda,
74
- subsample: subsample, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
75
- min_samples_leaf: min_samples_leaf, max_features: max_features)
76
- @params = {}
77
- @params[:n_estimators] = n_estimators
78
- @params[:learning_rate] = learning_rate
79
- @params[:reg_lambda] = reg_lambda
80
- @params[:subsample] = subsample
81
- @params[:max_depth] = max_depth
82
- @params[:max_leaf_nodes] = max_leaf_nodes
83
- @params[:min_samples_leaf] = min_samples_leaf
84
- @params[:max_features] = max_features
85
- @params[:n_jobs] = n_jobs
86
- @params[:random_seed] = random_seed
87
- @params[:random_seed] ||= srand
88
- @estimators = nil
89
- @classes = nil
90
- @base_predictions = nil
91
- @feature_importances = nil
92
- @rng = Random.new(@params[:random_seed])
93
- end
94
-
95
- # Fit the model with given training data.
96
- #
97
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
98
- # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
99
- # @return [GradientBoostingClassifier] The learned classifier itself.
100
- def fit(x, y)
101
- x = check_convert_sample_array(x)
102
- y = check_convert_label_array(y)
103
- check_sample_label_size(x, y)
104
- # initialize some variables.
105
- n_features = x.shape[1]
106
- @params[:max_features] = n_features if @params[:max_features].nil?
107
- @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
108
- @classes = Numo::Int32[*y.to_a.uniq.sort]
109
- n_classes = @classes.size
110
- # train estimator.
111
- if n_classes > 2
112
- @base_predictions = multiclass_base_predictions(y)
113
- @estimators = multiclass_estimators(x, y)
114
- else
115
- negative_label = y.to_a.uniq.min
116
- bin_y = Numo::DFloat.cast(y.ne(negative_label)) * 2 - 1
117
- y_mean = bin_y.mean
118
- @base_predictions = 0.5 * Numo::NMath.log((1.0 + y_mean) / (1.0 - y_mean))
119
- @estimators = partial_fit(x, bin_y, @base_predictions)
120
- end
121
- # calculate feature importances.
122
- @feature_importances = if n_classes > 2
123
- multiclass_feature_importances
124
- else
125
- @estimators.map(&:feature_importances).reduce(&:+)
126
- end
127
- self
128
- end
129
-
130
- # Calculate confidence scores for samples.
131
- #
132
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
133
- # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
134
- def decision_function(x)
135
- x = check_convert_sample_array(x)
136
- n_classes = @classes.size
137
- if n_classes > 2
138
- multiclass_scores(x)
139
- else
140
- @estimators.map { |tree| tree.predict(x) }.reduce(&:+) + @base_predictions
141
- end
142
- end
143
-
144
- # Predict class labels for samples.
145
- #
146
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
147
- # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
148
- def predict(x)
149
- x = check_convert_sample_array(x)
150
- n_samples = x.shape[0]
151
- probs = predict_proba(x)
152
- Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
153
- end
154
-
155
- # Predict probability for samples.
156
- #
157
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
158
- # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
159
- def predict_proba(x)
160
- x = check_convert_sample_array(x)
161
-
162
- proba = 1.0 / (Numo::NMath.exp(-decision_function(x)) + 1.0)
163
-
164
- return (proba.transpose / proba.sum(axis: 1)).transpose.dup if @classes.size > 2
165
-
166
- n_samples, = x.shape
167
- probs = Numo::DFloat.zeros(n_samples, 2)
168
- probs[true, 1] = proba
169
- probs[true, 0] = 1.0 - proba
170
- probs
171
- end
172
-
173
- # Return the index of the leaf that each sample reached.
174
- #
175
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
176
- # @return [Numo::Int32] (shape: [n_samples, n_estimators, n_classes]) Leaf index for sample.
177
- def apply(x)
178
- x = check_convert_sample_array(x)
179
- n_classes = @classes.size
180
- leaf_ids = if n_classes > 2
181
- Array.new(n_classes) { |n| @estimators[n].map { |tree| tree.apply(x) } }
182
- else
183
- @estimators.map { |tree| tree.apply(x) }
184
- end
185
- Numo::Int32[*leaf_ids].transpose.dup
186
- end
187
-
188
- private
189
-
190
- def partial_fit(x, y, init_pred)
191
- # initialize some variables.
192
- estimators = []
193
- n_samples = x.shape[0]
194
- n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
195
- whole_ids = Array.new(n_samples) { |v| v }
196
- y_pred = Numo::DFloat.ones(n_samples) * init_pred
197
- sub_rng = @rng.dup
198
- # grow trees.
199
- @params[:n_estimators].times do |_t|
200
- # subsampling
201
- ids = whole_ids.sample(n_sub_samples, random: sub_rng)
202
- x_sub = x[ids, true]
203
- y_sub = y[ids]
204
- y_pred_sub = y_pred[ids]
205
- # train tree
206
- g = gradient(y_sub, y_pred_sub)
207
- h = hessian(y_sub, y_pred_sub)
208
- tree = plant_tree(sub_rng)
209
- tree.fit(x_sub, y_sub, g, h)
210
- estimators.push(tree)
211
- # update
212
- y_pred += tree.predict(x)
213
- end
214
- estimators
215
- end
216
-
217
- # for debug
218
- #
219
- # def loss(y_true, y_pred)
220
- # # y_true in {-1, 1}
221
- # Numo::NMath.log(1.0 + Numo::NMath.exp(-2.0 * y_true * y_pred)).mean
222
- # end
223
-
224
- def gradient(y_true, y_pred)
225
- # y in {-1, 1}
226
- -2.0 * y_true / (1.0 + Numo::NMath.exp(2.0 * y_true * y_pred))
227
- end
228
-
229
- def hessian(y_true, y_pred)
230
- abs_response = gradient(y_true, y_pred).abs
231
- abs_response * (2.0 - abs_response)
232
- end
233
-
234
- def plant_tree(sub_rng)
235
- Rumale::Tree::GradientTreeRegressor.new(
236
- reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
237
- max_depth: @params[:max_depth],
238
- max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
239
- max_features: @params[:max_features], random_seed: sub_rng.rand(Rumale::Values.int_max)
240
- )
241
- end
242
-
243
- def multiclass_base_predictions(y)
244
- n_classes = @classes.size
245
- b = if enable_parallel?
246
- # :nocov:
247
- parallel_map(n_classes) do |n|
248
- bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
249
- y_mean = bin_y.mean
250
- 0.5 * Math.log((1.0 + y_mean) / (1.0 - y_mean))
251
- end
252
- # :nocov:
253
- else
254
- Array.new(n_classes) do |n|
255
- bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
256
- y_mean = bin_y.mean
257
- 0.5 * Math.log((1.0 + y_mean) / (1.0 - y_mean))
258
- end
259
- end
260
- Numo::DFloat.asarray(b)
261
- end
262
-
263
- def multiclass_estimators(x, y)
264
- n_classes = @classes.size
265
- if enable_parallel?
266
- # :nocov:
267
- parallel_map(n_classes) do |n|
268
- bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
269
- partial_fit(x, bin_y, @base_predictions[n])
270
- end
271
- # :nocov:
272
- else
273
- Array.new(n_classes) do |n|
274
- bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
275
- partial_fit(x, bin_y, @base_predictions[n])
276
- end
277
- end
278
- end
279
-
280
- def multiclass_feature_importances
281
- n_classes = @classes.size
282
- if enable_parallel?
283
- parallel_map(n_classes) { |n| @estimators[n].map(&:feature_importances).reduce(&:+) }.reduce(&:+)
284
- else
285
- Array.new(n_classes) { |n| @estimators[n].map(&:feature_importances).reduce(&:+) }.reduce(&:+)
286
- end
287
- end
288
-
289
- def multiclass_scores(x)
290
- n_classes = @classes.size
291
- s = if enable_parallel?
292
- # :nocov:
293
- parallel_map(n_classes) do |n|
294
- @estimators[n].map { |tree| tree.predict(x) }.reduce(&:+)
295
- end
296
- # :nocov:
297
- else
298
- Array.new(n_classes) do |n|
299
- @estimators[n].map { |tree| tree.predict(x) }.reduce(&:+)
300
- end
301
- end
302
- Numo::DFloat.asarray(s).transpose + @base_predictions
303
- end
304
- end
305
- end
306
- end