rumale 0.23.3 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.txt +5 -1
- data/README.md +3 -288
- data/lib/rumale/version.rb +1 -1
- data/lib/rumale.rb +20 -131
- metadata +252 -150
- data/CHANGELOG.md +0 -643
- data/CODE_OF_CONDUCT.md +0 -74
- data/ext/rumale/extconf.rb +0 -37
- data/ext/rumale/rumaleext.c +0 -545
- data/ext/rumale/rumaleext.h +0 -12
- data/lib/rumale/base/base_estimator.rb +0 -49
- data/lib/rumale/base/classifier.rb +0 -36
- data/lib/rumale/base/cluster_analyzer.rb +0 -31
- data/lib/rumale/base/evaluator.rb +0 -17
- data/lib/rumale/base/regressor.rb +0 -36
- data/lib/rumale/base/splitter.rb +0 -21
- data/lib/rumale/base/transformer.rb +0 -22
- data/lib/rumale/clustering/dbscan.rb +0 -123
- data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
- data/lib/rumale/clustering/hdbscan.rb +0 -291
- data/lib/rumale/clustering/k_means.rb +0 -122
- data/lib/rumale/clustering/k_medoids.rb +0 -141
- data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
- data/lib/rumale/clustering/power_iteration.rb +0 -127
- data/lib/rumale/clustering/single_linkage.rb +0 -203
- data/lib/rumale/clustering/snn.rb +0 -76
- data/lib/rumale/clustering/spectral_clustering.rb +0 -115
- data/lib/rumale/dataset.rb +0 -246
- data/lib/rumale/decomposition/factor_analysis.rb +0 -150
- data/lib/rumale/decomposition/fast_ica.rb +0 -188
- data/lib/rumale/decomposition/nmf.rb +0 -124
- data/lib/rumale/decomposition/pca.rb +0 -159
- data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
- data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
- data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
- data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
- data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
- data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
- data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
- data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
- data/lib/rumale/ensemble/voting_classifier.rb +0 -126
- data/lib/rumale/ensemble/voting_regressor.rb +0 -82
- data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
- data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
- data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
- data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
- data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
- data/lib/rumale/evaluation_measure/f_score.rb +0 -50
- data/lib/rumale/evaluation_measure/function.rb +0 -147
- data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
- data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
- data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
- data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
- data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
- data/lib/rumale/evaluation_measure/precision.rb +0 -50
- data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
- data/lib/rumale/evaluation_measure/purity.rb +0 -40
- data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
- data/lib/rumale/evaluation_measure/recall.rb +0 -50
- data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
- data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
- data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
- data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
- data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
- data/lib/rumale/kernel_approximation/rbf.rb +0 -102
- data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
- data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
- data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
- data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
- data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
- data/lib/rumale/linear_model/base_sgd.rb +0 -285
- data/lib/rumale/linear_model/elastic_net.rb +0 -119
- data/lib/rumale/linear_model/lasso.rb +0 -115
- data/lib/rumale/linear_model/linear_regression.rb +0 -201
- data/lib/rumale/linear_model/logistic_regression.rb +0 -275
- data/lib/rumale/linear_model/nnls.rb +0 -137
- data/lib/rumale/linear_model/ridge.rb +0 -209
- data/lib/rumale/linear_model/svc.rb +0 -213
- data/lib/rumale/linear_model/svr.rb +0 -132
- data/lib/rumale/manifold/mds.rb +0 -155
- data/lib/rumale/manifold/tsne.rb +0 -222
- data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
- data/lib/rumale/metric_learning/mlkr.rb +0 -161
- data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
- data/lib/rumale/model_selection/cross_validation.rb +0 -125
- data/lib/rumale/model_selection/function.rb +0 -42
- data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
- data/lib/rumale/model_selection/group_k_fold.rb +0 -93
- data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
- data/lib/rumale/model_selection/k_fold.rb +0 -81
- data/lib/rumale/model_selection/shuffle_split.rb +0 -90
- data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
- data/lib/rumale/model_selection/time_series_split.rb +0 -91
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
- data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
- data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
- data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
- data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
- data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
- data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
- data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
- data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
- data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
- data/lib/rumale/neural_network/adam.rb +0 -56
- data/lib/rumale/neural_network/base_mlp.rb +0 -248
- data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
- data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
- data/lib/rumale/pairwise_metric.rb +0 -152
- data/lib/rumale/pipeline/feature_union.rb +0 -69
- data/lib/rumale/pipeline/pipeline.rb +0 -175
- data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
- data/lib/rumale/preprocessing/binarizer.rb +0 -60
- data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
- data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
- data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
- data/lib/rumale/preprocessing/label_encoder.rb +0 -79
- data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
- data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
- data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
- data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
- data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
- data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
- data/lib/rumale/probabilistic_output.rb +0 -114
- data/lib/rumale/tree/base_decision_tree.rb +0 -150
- data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
- data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
- data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
- data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
- data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
- data/lib/rumale/tree/node.rb +0 -39
- data/lib/rumale/utils.rb +0 -42
- data/lib/rumale/validation.rb +0 -128
- data/lib/rumale/values.rb +0 -13
|
@@ -1,139 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/tree/extra_tree_classifier'
|
|
4
|
-
require 'rumale/ensemble/random_forest_classifier'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
module Ensemble
|
|
8
|
-
# ExtraTreesClassifier is a class that implements extremely randomized trees for classification.
|
|
9
|
-
# The algorithm of extremely randomized trees is similar to random forest.
|
|
10
|
-
# The features of the algorithm of extremely randomized trees are
|
|
11
|
-
# not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
|
|
12
|
-
#
|
|
13
|
-
# @example
|
|
14
|
-
# estimator =
|
|
15
|
-
# Rumale::Ensemble::ExtraTreesClassifier.new(
|
|
16
|
-
# n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
|
|
17
|
-
# estimator.fit(training_samples, training_labels)
|
|
18
|
-
# results = estimator.predict(testing_samples)
|
|
19
|
-
#
|
|
20
|
-
# *Reference*
|
|
21
|
-
# - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
|
|
22
|
-
class ExtraTreesClassifier < RandomForestClassifier
|
|
23
|
-
# Return the set of estimators.
|
|
24
|
-
# @return [Array<ExtraTreeClassifier>]
|
|
25
|
-
attr_reader :estimators
|
|
26
|
-
|
|
27
|
-
# Return the class labels.
|
|
28
|
-
# @return [Numo::Int32] (size: n_classes)
|
|
29
|
-
attr_reader :classes
|
|
30
|
-
|
|
31
|
-
# Return the importance for each feature.
|
|
32
|
-
# @return [Numo::DFloat] (size: n_features)
|
|
33
|
-
attr_reader :feature_importances
|
|
34
|
-
|
|
35
|
-
# Return the random generator for random selection of feature index.
|
|
36
|
-
# @return [Random]
|
|
37
|
-
attr_reader :rng
|
|
38
|
-
|
|
39
|
-
# Create a new classifier with extremely randomized trees.
|
|
40
|
-
#
|
|
41
|
-
# @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
|
|
42
|
-
# @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
|
|
43
|
-
# @param max_depth [Integer] The maximum depth of the tree.
|
|
44
|
-
# If nil is given, extra tree grows without concern for depth.
|
|
45
|
-
# @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
|
|
46
|
-
# If nil is given, number of leaves is not limited.
|
|
47
|
-
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
|
48
|
-
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
|
49
|
-
# If nil is given, split process considers 'Math.sqrt(n_features)' features.
|
|
50
|
-
# @param n_jobs [Integer] The number of jobs for running the fit method in parallel.
|
|
51
|
-
# If nil is given, the method does not execute in parallel.
|
|
52
|
-
# If zero or less is given, it becomes equal to the number of processors.
|
|
53
|
-
# This parameter is ignored if the Parallel gem is not loaded.
|
|
54
|
-
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
|
55
|
-
# It is used to randomly determine the order of features when deciding splitting point.
|
|
56
|
-
def initialize(n_estimators: 10,
|
|
57
|
-
criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
|
|
58
|
-
max_features: nil, n_jobs: nil, random_seed: nil)
|
|
59
|
-
check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
60
|
-
max_features: max_features, n_jobs: n_jobs, random_seed: random_seed)
|
|
61
|
-
check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
|
|
62
|
-
check_params_string(criterion: criterion)
|
|
63
|
-
check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
|
|
64
|
-
max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
|
|
65
|
-
max_features: max_features)
|
|
66
|
-
super
|
|
67
|
-
end
|
|
68
|
-
|
|
69
|
-
# Fit the model with given training data.
|
|
70
|
-
#
|
|
71
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
72
|
-
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
|
73
|
-
# @return [ExtraTreesClassifier] The learned classifier itself.
|
|
74
|
-
def fit(x, y)
|
|
75
|
-
x = check_convert_sample_array(x)
|
|
76
|
-
y = check_convert_label_array(y)
|
|
77
|
-
check_sample_label_size(x, y)
|
|
78
|
-
# Initialize some variables.
|
|
79
|
-
n_features = x.shape[1]
|
|
80
|
-
@params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
|
|
81
|
-
@params[:max_features] = [[1, @params[:max_features]].max, n_features].min
|
|
82
|
-
@classes = Numo::Int32.asarray(y.to_a.uniq.sort)
|
|
83
|
-
sub_rng = @rng.dup
|
|
84
|
-
# Construct trees.
|
|
85
|
-
rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(Rumale::Values.int_max) }
|
|
86
|
-
@estimators = if enable_parallel?
|
|
87
|
-
parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
|
|
88
|
-
else
|
|
89
|
-
Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
|
|
90
|
-
end
|
|
91
|
-
@feature_importances =
|
|
92
|
-
if enable_parallel?
|
|
93
|
-
parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.reduce(&:+)
|
|
94
|
-
else
|
|
95
|
-
@estimators.map(&:feature_importances).reduce(&:+)
|
|
96
|
-
end
|
|
97
|
-
@feature_importances /= @feature_importances.sum
|
|
98
|
-
self
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
# Predict class labels for samples.
|
|
102
|
-
#
|
|
103
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
|
104
|
-
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
|
|
105
|
-
def predict(x)
|
|
106
|
-
x = check_convert_sample_array(x)
|
|
107
|
-
super
|
|
108
|
-
end
|
|
109
|
-
|
|
110
|
-
# Predict probability for samples.
|
|
111
|
-
#
|
|
112
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
|
|
113
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
|
|
114
|
-
def predict_proba(x)
|
|
115
|
-
x = check_convert_sample_array(x)
|
|
116
|
-
super
|
|
117
|
-
end
|
|
118
|
-
|
|
119
|
-
# Return the index of the leaf that each sample reached.
|
|
120
|
-
#
|
|
121
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
|
122
|
-
# @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
|
|
123
|
-
def apply(x)
|
|
124
|
-
x = check_convert_sample_array(x)
|
|
125
|
-
super
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
private
|
|
129
|
-
|
|
130
|
-
def plant_tree(rnd_seed)
|
|
131
|
-
Tree::ExtraTreeClassifier.new(
|
|
132
|
-
criterion: @params[:criterion], max_depth: @params[:max_depth],
|
|
133
|
-
max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
|
|
134
|
-
max_features: @params[:max_features], random_seed: rnd_seed
|
|
135
|
-
)
|
|
136
|
-
end
|
|
137
|
-
end
|
|
138
|
-
end
|
|
139
|
-
end
|
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/tree/extra_tree_regressor'
|
|
4
|
-
require 'rumale/ensemble/random_forest_regressor'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
module Ensemble
|
|
8
|
-
# ExtraTreesRegressor is a class that implements extremely randomized trees for regression.
|
|
9
|
-
# The algorithm of extremely randomized trees is similar to random forest.
|
|
10
|
-
# The features of the algorithm of extremely randomized trees are
|
|
11
|
-
# not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
|
|
12
|
-
#
|
|
13
|
-
# @example
|
|
14
|
-
# estimator =
|
|
15
|
-
# Rumale::Ensemble::ExtraTreesRegressor.new(
|
|
16
|
-
# n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
|
|
17
|
-
# estimator.fit(training_samples, training_values)
|
|
18
|
-
# results = estimator.predict(testing_samples)
|
|
19
|
-
#
|
|
20
|
-
# *Reference*
|
|
21
|
-
# - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
|
|
22
|
-
class ExtraTreesRegressor < RandomForestRegressor
|
|
23
|
-
# Return the set of estimators.
|
|
24
|
-
# @return [Array<ExtraTreeRegressor>]
|
|
25
|
-
attr_reader :estimators
|
|
26
|
-
|
|
27
|
-
# Return the importance for each feature.
|
|
28
|
-
# @return [Numo::DFloat] (size: n_features)
|
|
29
|
-
attr_reader :feature_importances
|
|
30
|
-
|
|
31
|
-
# Return the random generator for random selection of feature index.
|
|
32
|
-
# @return [Random]
|
|
33
|
-
attr_reader :rng
|
|
34
|
-
|
|
35
|
-
# Create a new regressor with extremely randomized trees.
|
|
36
|
-
#
|
|
37
|
-
# @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
|
|
38
|
-
# @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mse' and 'mae'.
|
|
39
|
-
# @param max_depth [Integer] The maximum depth of the tree.
|
|
40
|
-
# If nil is given, extra tree grows without concern for depth.
|
|
41
|
-
# @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
|
|
42
|
-
# If nil is given, number of leaves is not limited.
|
|
43
|
-
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
|
44
|
-
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
|
45
|
-
# If nil is given, split process considers 'Math.sqrt(n_features)' features.
|
|
46
|
-
# @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
|
|
47
|
-
# If nil is given, the methods do not execute in parallel.
|
|
48
|
-
# If zero or less is given, it becomes equal to the number of processors.
|
|
49
|
-
# This parameter is ignored if the Parallel gem is not loaded.
|
|
50
|
-
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
|
51
|
-
# It is used to randomly determine the order of features when deciding splitting point.
|
|
52
|
-
def initialize(n_estimators: 10,
|
|
53
|
-
criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
|
|
54
|
-
max_features: nil, n_jobs: nil, random_seed: nil)
|
|
55
|
-
check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
56
|
-
max_features: max_features, n_jobs: n_jobs, random_seed: random_seed)
|
|
57
|
-
check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
|
|
58
|
-
check_params_string(criterion: criterion)
|
|
59
|
-
check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
|
|
60
|
-
max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
|
|
61
|
-
max_features: max_features)
|
|
62
|
-
super
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
# Fit the model with given training data.
|
|
66
|
-
#
|
|
67
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
68
|
-
# @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
|
|
69
|
-
# @return [ExtraTreesRegressor] The learned regressor itself.
|
|
70
|
-
def fit(x, y)
|
|
71
|
-
x = check_convert_sample_array(x)
|
|
72
|
-
y = check_convert_tvalue_array(y)
|
|
73
|
-
check_sample_tvalue_size(x, y)
|
|
74
|
-
# Initialize some variables.
|
|
75
|
-
n_features = x.shape[1]
|
|
76
|
-
@params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
|
|
77
|
-
@params[:max_features] = [[1, @params[:max_features]].max, n_features].min
|
|
78
|
-
sub_rng = @rng.dup
|
|
79
|
-
# Construct forest.
|
|
80
|
-
rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(Rumale::Values.int_max) }
|
|
81
|
-
@estimators = if enable_parallel?
|
|
82
|
-
parallel_map(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
|
|
83
|
-
else
|
|
84
|
-
Array.new(@params[:n_estimators]) { |n| plant_tree(rng_seeds[n]).fit(x, y) }
|
|
85
|
-
end
|
|
86
|
-
@feature_importances =
|
|
87
|
-
if enable_parallel?
|
|
88
|
-
parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.reduce(&:+)
|
|
89
|
-
else
|
|
90
|
-
@estimators.map(&:feature_importances).reduce(&:+)
|
|
91
|
-
end
|
|
92
|
-
@feature_importances /= @feature_importances.sum
|
|
93
|
-
self
|
|
94
|
-
end
|
|
95
|
-
|
|
96
|
-
# Predict values for samples.
|
|
97
|
-
#
|
|
98
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
|
|
99
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
|
|
100
|
-
def predict(x)
|
|
101
|
-
x = check_convert_sample_array(x)
|
|
102
|
-
super
|
|
103
|
-
end
|
|
104
|
-
|
|
105
|
-
# Return the index of the leaf that each sample reached.
|
|
106
|
-
#
|
|
107
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
|
|
108
|
-
# @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
|
|
109
|
-
def apply(x)
|
|
110
|
-
x = check_convert_sample_array(x)
|
|
111
|
-
super
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
private
|
|
115
|
-
|
|
116
|
-
def plant_tree(rnd_seed)
|
|
117
|
-
Tree::ExtraTreeRegressor.new(
|
|
118
|
-
criterion: @params[:criterion], max_depth: @params[:max_depth],
|
|
119
|
-
max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
|
|
120
|
-
max_features: @params[:max_features], random_seed: rnd_seed
|
|
121
|
-
)
|
|
122
|
-
end
|
|
123
|
-
end
|
|
124
|
-
end
|
|
125
|
-
end
|
|
@@ -1,306 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/values'
|
|
4
|
-
require 'rumale/base/base_estimator'
|
|
5
|
-
require 'rumale/base/classifier'
|
|
6
|
-
require 'rumale/tree/gradient_tree_regressor'
|
|
7
|
-
|
|
8
|
-
module Rumale
|
|
9
|
-
module Ensemble
|
|
10
|
-
# GradientBoostingClassifier is a class that implements gradient tree boosting for classification.
|
|
11
|
-
# The class uses negative binomial log-likelihood for the loss function.
|
|
12
|
-
# For multiclass classification problem, it uses one-vs-the-rest strategy.
|
|
13
|
-
#
|
|
14
|
-
# @example
|
|
15
|
-
# estimator =
|
|
16
|
-
# Rumale::Ensemble::GradientBoostingClassifier.new(
|
|
17
|
-
# n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
|
|
18
|
-
# estimator.fit(training_samples, training_labels)
|
|
19
|
-
# results = estimator.predict(testing_samples)
|
|
20
|
-
#
|
|
21
|
-
# *Reference*
|
|
22
|
-
# - Friedman, J H., "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
|
|
23
|
-
# - Friedman, J H., "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
|
|
24
|
-
# - Chen, T., and Guestrin, C., "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
|
|
25
|
-
#
|
|
26
|
-
class GradientBoostingClassifier
|
|
27
|
-
include Base::BaseEstimator
|
|
28
|
-
include Base::Classifier
|
|
29
|
-
|
|
30
|
-
# Return the set of estimators.
|
|
31
|
-
# @return [Array<GradientTreeRegressor>] or [Array<Array<GradientTreeRegressor>>]
|
|
32
|
-
attr_reader :estimators
|
|
33
|
-
|
|
34
|
-
# Return the class labels.
|
|
35
|
-
# @return [Numo::Int32] (size: n_classes)
|
|
36
|
-
attr_reader :classes
|
|
37
|
-
|
|
38
|
-
# Return the importance for each feature.
|
|
39
|
-
# The feature importances are calculated based on the numbers of times the feature is used for splitting.
|
|
40
|
-
# @return [Numo::DFloat] (size: n_features)
|
|
41
|
-
attr_reader :feature_importances
|
|
42
|
-
|
|
43
|
-
# Return the random generator for random selection of feature index.
|
|
44
|
-
# @return [Random]
|
|
45
|
-
attr_reader :rng
|
|
46
|
-
|
|
47
|
-
# Create a new classifier with gradient tree boosting.
|
|
48
|
-
#
|
|
49
|
-
# @param n_estimators [Integer] The number of trees for constructing classifier.
|
|
50
|
-
# @param learning_rate [Float] The boosting learning rate.
|
|
51
|
-
# @param reg_lambda [Float] The L2 regularization term on weight.
|
|
52
|
-
# @param subsample [Float] The subsampling ratio of the training samples.
|
|
53
|
-
# @param max_depth [Integer] The maximum depth of the tree.
|
|
54
|
-
# If nil is given, decision tree grows without concern for depth.
|
|
55
|
-
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
|
|
56
|
-
# If nil is given, number of leaves is not limited.
|
|
57
|
-
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
|
58
|
-
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
|
59
|
-
# If nil is given, split process considers all features.
|
|
60
|
-
# @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
|
|
61
|
-
# If nil is given, the methods do not execute in parallel.
|
|
62
|
-
# If zero or less is given, it becomes equal to the number of processors.
|
|
63
|
-
# This parameter is ignored if the Parallel gem is not loaded.
|
|
64
|
-
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
|
65
|
-
# It is used to randomly determine the order of features when deciding splitting point.
|
|
66
|
-
def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
|
|
67
|
-
max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
|
|
68
|
-
max_features: nil, n_jobs: nil, random_seed: nil)
|
|
69
|
-
check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
70
|
-
max_features: max_features, n_jobs: n_jobs, random_seed: random_seed)
|
|
71
|
-
check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf,
|
|
72
|
-
learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
|
|
73
|
-
check_params_positive(n_estimators: n_estimators, learning_rate: learning_rate, reg_lambda: reg_lambda,
|
|
74
|
-
subsample: subsample, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
75
|
-
min_samples_leaf: min_samples_leaf, max_features: max_features)
|
|
76
|
-
@params = {}
|
|
77
|
-
@params[:n_estimators] = n_estimators
|
|
78
|
-
@params[:learning_rate] = learning_rate
|
|
79
|
-
@params[:reg_lambda] = reg_lambda
|
|
80
|
-
@params[:subsample] = subsample
|
|
81
|
-
@params[:max_depth] = max_depth
|
|
82
|
-
@params[:max_leaf_nodes] = max_leaf_nodes
|
|
83
|
-
@params[:min_samples_leaf] = min_samples_leaf
|
|
84
|
-
@params[:max_features] = max_features
|
|
85
|
-
@params[:n_jobs] = n_jobs
|
|
86
|
-
@params[:random_seed] = random_seed
|
|
87
|
-
@params[:random_seed] ||= srand
|
|
88
|
-
@estimators = nil
|
|
89
|
-
@classes = nil
|
|
90
|
-
@base_predictions = nil
|
|
91
|
-
@feature_importances = nil
|
|
92
|
-
@rng = Random.new(@params[:random_seed])
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
# Fit the model with given training data.
|
|
96
|
-
#
|
|
97
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
98
|
-
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
|
99
|
-
# @return [GradientBoostingClassifier] The learned classifier itself.
|
|
100
|
-
def fit(x, y)
|
|
101
|
-
x = check_convert_sample_array(x)
|
|
102
|
-
y = check_convert_label_array(y)
|
|
103
|
-
check_sample_label_size(x, y)
|
|
104
|
-
# initialize some variables.
|
|
105
|
-
n_features = x.shape[1]
|
|
106
|
-
@params[:max_features] = n_features if @params[:max_features].nil?
|
|
107
|
-
@params[:max_features] = [[1, @params[:max_features]].max, n_features].min
|
|
108
|
-
@classes = Numo::Int32[*y.to_a.uniq.sort]
|
|
109
|
-
n_classes = @classes.size
|
|
110
|
-
# train estimator.
|
|
111
|
-
if n_classes > 2
|
|
112
|
-
@base_predictions = multiclass_base_predictions(y)
|
|
113
|
-
@estimators = multiclass_estimators(x, y)
|
|
114
|
-
else
|
|
115
|
-
negative_label = y.to_a.uniq.min
|
|
116
|
-
bin_y = Numo::DFloat.cast(y.ne(negative_label)) * 2 - 1
|
|
117
|
-
y_mean = bin_y.mean
|
|
118
|
-
@base_predictions = 0.5 * Numo::NMath.log((1.0 + y_mean) / (1.0 - y_mean))
|
|
119
|
-
@estimators = partial_fit(x, bin_y, @base_predictions)
|
|
120
|
-
end
|
|
121
|
-
# calculate feature importances.
|
|
122
|
-
@feature_importances = if n_classes > 2
|
|
123
|
-
multiclass_feature_importances
|
|
124
|
-
else
|
|
125
|
-
@estimators.map(&:feature_importances).reduce(&:+)
|
|
126
|
-
end
|
|
127
|
-
self
|
|
128
|
-
end
|
|
129
|
-
|
|
130
|
-
# Calculate confidence scores for samples.
|
|
131
|
-
#
|
|
132
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
|
|
133
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
|
|
134
|
-
def decision_function(x)
|
|
135
|
-
x = check_convert_sample_array(x)
|
|
136
|
-
n_classes = @classes.size
|
|
137
|
-
if n_classes > 2
|
|
138
|
-
multiclass_scores(x)
|
|
139
|
-
else
|
|
140
|
-
@estimators.map { |tree| tree.predict(x) }.reduce(&:+) + @base_predictions
|
|
141
|
-
end
|
|
142
|
-
end
|
|
143
|
-
|
|
144
|
-
# Predict class labels for samples.
|
|
145
|
-
#
|
|
146
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
|
147
|
-
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
|
|
148
|
-
def predict(x)
|
|
149
|
-
x = check_convert_sample_array(x)
|
|
150
|
-
n_samples = x.shape[0]
|
|
151
|
-
probs = predict_proba(x)
|
|
152
|
-
Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
|
|
153
|
-
end
|
|
154
|
-
|
|
155
|
-
# Predict probability for samples.
|
|
156
|
-
#
|
|
157
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
|
|
158
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
|
|
159
|
-
def predict_proba(x)
|
|
160
|
-
x = check_convert_sample_array(x)
|
|
161
|
-
|
|
162
|
-
proba = 1.0 / (Numo::NMath.exp(-decision_function(x)) + 1.0)
|
|
163
|
-
|
|
164
|
-
return (proba.transpose / proba.sum(axis: 1)).transpose.dup if @classes.size > 2
|
|
165
|
-
|
|
166
|
-
n_samples, = x.shape
|
|
167
|
-
probs = Numo::DFloat.zeros(n_samples, 2)
|
|
168
|
-
probs[true, 1] = proba
|
|
169
|
-
probs[true, 0] = 1.0 - proba
|
|
170
|
-
probs
|
|
171
|
-
end
|
|
172
|
-
|
|
173
|
-
# Return the index of the leaf that each sample reached.
|
|
174
|
-
#
|
|
175
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
|
176
|
-
# @return [Numo::Int32] (shape: [n_samples, n_estimators, n_classes]) Leaf index for sample.
|
|
177
|
-
def apply(x)
|
|
178
|
-
x = check_convert_sample_array(x)
|
|
179
|
-
n_classes = @classes.size
|
|
180
|
-
leaf_ids = if n_classes > 2
|
|
181
|
-
Array.new(n_classes) { |n| @estimators[n].map { |tree| tree.apply(x) } }
|
|
182
|
-
else
|
|
183
|
-
@estimators.map { |tree| tree.apply(x) }
|
|
184
|
-
end
|
|
185
|
-
Numo::Int32[*leaf_ids].transpose.dup
|
|
186
|
-
end
|
|
187
|
-
|
|
188
|
-
private
|
|
189
|
-
|
|
190
|
-
def partial_fit(x, y, init_pred)
|
|
191
|
-
# initialize some variables.
|
|
192
|
-
estimators = []
|
|
193
|
-
n_samples = x.shape[0]
|
|
194
|
-
n_sub_samples = [n_samples, [(n_samples * @params[:subsample]).to_i, 1].max].min
|
|
195
|
-
whole_ids = Array.new(n_samples) { |v| v }
|
|
196
|
-
y_pred = Numo::DFloat.ones(n_samples) * init_pred
|
|
197
|
-
sub_rng = @rng.dup
|
|
198
|
-
# grow trees.
|
|
199
|
-
@params[:n_estimators].times do |_t|
|
|
200
|
-
# subsampling
|
|
201
|
-
ids = whole_ids.sample(n_sub_samples, random: sub_rng)
|
|
202
|
-
x_sub = x[ids, true]
|
|
203
|
-
y_sub = y[ids]
|
|
204
|
-
y_pred_sub = y_pred[ids]
|
|
205
|
-
# train tree
|
|
206
|
-
g = gradient(y_sub, y_pred_sub)
|
|
207
|
-
h = hessian(y_sub, y_pred_sub)
|
|
208
|
-
tree = plant_tree(sub_rng)
|
|
209
|
-
tree.fit(x_sub, y_sub, g, h)
|
|
210
|
-
estimators.push(tree)
|
|
211
|
-
# update
|
|
212
|
-
y_pred += tree.predict(x)
|
|
213
|
-
end
|
|
214
|
-
estimators
|
|
215
|
-
end
|
|
216
|
-
|
|
217
|
-
# for debug
|
|
218
|
-
#
|
|
219
|
-
# def loss(y_true, y_pred)
|
|
220
|
-
# # y_true in {-1, 1}
|
|
221
|
-
# Numo::NMath.log(1.0 + Numo::NMath.exp(-2.0 * y_true * y_pred)).mean
|
|
222
|
-
# end
|
|
223
|
-
|
|
224
|
-
def gradient(y_true, y_pred)
|
|
225
|
-
# y in {-1, 1}
|
|
226
|
-
-2.0 * y_true / (1.0 + Numo::NMath.exp(2.0 * y_true * y_pred))
|
|
227
|
-
end
|
|
228
|
-
|
|
229
|
-
def hessian(y_true, y_pred)
|
|
230
|
-
abs_response = gradient(y_true, y_pred).abs
|
|
231
|
-
abs_response * (2.0 - abs_response)
|
|
232
|
-
end
|
|
233
|
-
|
|
234
|
-
def plant_tree(sub_rng)
|
|
235
|
-
Rumale::Tree::GradientTreeRegressor.new(
|
|
236
|
-
reg_lambda: @params[:reg_lambda], shrinkage_rate: @params[:learning_rate],
|
|
237
|
-
max_depth: @params[:max_depth],
|
|
238
|
-
max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
|
|
239
|
-
max_features: @params[:max_features], random_seed: sub_rng.rand(Rumale::Values.int_max)
|
|
240
|
-
)
|
|
241
|
-
end
|
|
242
|
-
|
|
243
|
-
def multiclass_base_predictions(y)
|
|
244
|
-
n_classes = @classes.size
|
|
245
|
-
b = if enable_parallel?
|
|
246
|
-
# :nocov:
|
|
247
|
-
parallel_map(n_classes) do |n|
|
|
248
|
-
bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
|
|
249
|
-
y_mean = bin_y.mean
|
|
250
|
-
0.5 * Math.log((1.0 + y_mean) / (1.0 - y_mean))
|
|
251
|
-
end
|
|
252
|
-
# :nocov:
|
|
253
|
-
else
|
|
254
|
-
Array.new(n_classes) do |n|
|
|
255
|
-
bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
|
|
256
|
-
y_mean = bin_y.mean
|
|
257
|
-
0.5 * Math.log((1.0 + y_mean) / (1.0 - y_mean))
|
|
258
|
-
end
|
|
259
|
-
end
|
|
260
|
-
Numo::DFloat.asarray(b)
|
|
261
|
-
end
|
|
262
|
-
|
|
263
|
-
def multiclass_estimators(x, y)
|
|
264
|
-
n_classes = @classes.size
|
|
265
|
-
if enable_parallel?
|
|
266
|
-
# :nocov:
|
|
267
|
-
parallel_map(n_classes) do |n|
|
|
268
|
-
bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
|
|
269
|
-
partial_fit(x, bin_y, @base_predictions[n])
|
|
270
|
-
end
|
|
271
|
-
# :nocov:
|
|
272
|
-
else
|
|
273
|
-
Array.new(n_classes) do |n|
|
|
274
|
-
bin_y = Numo::DFloat.cast(y.eq(@classes[n])) * 2 - 1
|
|
275
|
-
partial_fit(x, bin_y, @base_predictions[n])
|
|
276
|
-
end
|
|
277
|
-
end
|
|
278
|
-
end
|
|
279
|
-
|
|
280
|
-
def multiclass_feature_importances
|
|
281
|
-
n_classes = @classes.size
|
|
282
|
-
if enable_parallel?
|
|
283
|
-
parallel_map(n_classes) { |n| @estimators[n].map(&:feature_importances).reduce(&:+) }.reduce(&:+)
|
|
284
|
-
else
|
|
285
|
-
Array.new(n_classes) { |n| @estimators[n].map(&:feature_importances).reduce(&:+) }.reduce(&:+)
|
|
286
|
-
end
|
|
287
|
-
end
|
|
288
|
-
|
|
289
|
-
def multiclass_scores(x)
|
|
290
|
-
n_classes = @classes.size
|
|
291
|
-
s = if enable_parallel?
|
|
292
|
-
# :nocov:
|
|
293
|
-
parallel_map(n_classes) do |n|
|
|
294
|
-
@estimators[n].map { |tree| tree.predict(x) }.reduce(&:+)
|
|
295
|
-
end
|
|
296
|
-
# :nocov:
|
|
297
|
-
else
|
|
298
|
-
Array.new(n_classes) do |n|
|
|
299
|
-
@estimators[n].map { |tree| tree.predict(x) }.reduce(&:+)
|
|
300
|
-
end
|
|
301
|
-
end
|
|
302
|
-
Numo::DFloat.asarray(s).transpose + @base_predictions
|
|
303
|
-
end
|
|
304
|
-
end
|
|
305
|
-
end
|
|
306
|
-
end
|