rumale 0.23.3 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.txt +5 -1
- data/README.md +3 -288
- data/lib/rumale/version.rb +1 -1
- data/lib/rumale.rb +20 -131
- metadata +252 -150
- data/CHANGELOG.md +0 -643
- data/CODE_OF_CONDUCT.md +0 -74
- data/ext/rumale/extconf.rb +0 -37
- data/ext/rumale/rumaleext.c +0 -545
- data/ext/rumale/rumaleext.h +0 -12
- data/lib/rumale/base/base_estimator.rb +0 -49
- data/lib/rumale/base/classifier.rb +0 -36
- data/lib/rumale/base/cluster_analyzer.rb +0 -31
- data/lib/rumale/base/evaluator.rb +0 -17
- data/lib/rumale/base/regressor.rb +0 -36
- data/lib/rumale/base/splitter.rb +0 -21
- data/lib/rumale/base/transformer.rb +0 -22
- data/lib/rumale/clustering/dbscan.rb +0 -123
- data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
- data/lib/rumale/clustering/hdbscan.rb +0 -291
- data/lib/rumale/clustering/k_means.rb +0 -122
- data/lib/rumale/clustering/k_medoids.rb +0 -141
- data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
- data/lib/rumale/clustering/power_iteration.rb +0 -127
- data/lib/rumale/clustering/single_linkage.rb +0 -203
- data/lib/rumale/clustering/snn.rb +0 -76
- data/lib/rumale/clustering/spectral_clustering.rb +0 -115
- data/lib/rumale/dataset.rb +0 -246
- data/lib/rumale/decomposition/factor_analysis.rb +0 -150
- data/lib/rumale/decomposition/fast_ica.rb +0 -188
- data/lib/rumale/decomposition/nmf.rb +0 -124
- data/lib/rumale/decomposition/pca.rb +0 -159
- data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
- data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
- data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
- data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
- data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
- data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
- data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
- data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
- data/lib/rumale/ensemble/voting_classifier.rb +0 -126
- data/lib/rumale/ensemble/voting_regressor.rb +0 -82
- data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
- data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
- data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
- data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
- data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
- data/lib/rumale/evaluation_measure/f_score.rb +0 -50
- data/lib/rumale/evaluation_measure/function.rb +0 -147
- data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
- data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
- data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
- data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
- data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
- data/lib/rumale/evaluation_measure/precision.rb +0 -50
- data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
- data/lib/rumale/evaluation_measure/purity.rb +0 -40
- data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
- data/lib/rumale/evaluation_measure/recall.rb +0 -50
- data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
- data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
- data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
- data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
- data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
- data/lib/rumale/kernel_approximation/rbf.rb +0 -102
- data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
- data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
- data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
- data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
- data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
- data/lib/rumale/linear_model/base_sgd.rb +0 -285
- data/lib/rumale/linear_model/elastic_net.rb +0 -119
- data/lib/rumale/linear_model/lasso.rb +0 -115
- data/lib/rumale/linear_model/linear_regression.rb +0 -201
- data/lib/rumale/linear_model/logistic_regression.rb +0 -275
- data/lib/rumale/linear_model/nnls.rb +0 -137
- data/lib/rumale/linear_model/ridge.rb +0 -209
- data/lib/rumale/linear_model/svc.rb +0 -213
- data/lib/rumale/linear_model/svr.rb +0 -132
- data/lib/rumale/manifold/mds.rb +0 -155
- data/lib/rumale/manifold/tsne.rb +0 -222
- data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
- data/lib/rumale/metric_learning/mlkr.rb +0 -161
- data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
- data/lib/rumale/model_selection/cross_validation.rb +0 -125
- data/lib/rumale/model_selection/function.rb +0 -42
- data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
- data/lib/rumale/model_selection/group_k_fold.rb +0 -93
- data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
- data/lib/rumale/model_selection/k_fold.rb +0 -81
- data/lib/rumale/model_selection/shuffle_split.rb +0 -90
- data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
- data/lib/rumale/model_selection/time_series_split.rb +0 -91
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
- data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
- data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
- data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
- data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
- data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
- data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
- data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
- data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
- data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
- data/lib/rumale/neural_network/adam.rb +0 -56
- data/lib/rumale/neural_network/base_mlp.rb +0 -248
- data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
- data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
- data/lib/rumale/pairwise_metric.rb +0 -152
- data/lib/rumale/pipeline/feature_union.rb +0 -69
- data/lib/rumale/pipeline/pipeline.rb +0 -175
- data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
- data/lib/rumale/preprocessing/binarizer.rb +0 -60
- data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
- data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
- data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
- data/lib/rumale/preprocessing/label_encoder.rb +0 -79
- data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
- data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
- data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
- data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
- data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
- data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
- data/lib/rumale/probabilistic_output.rb +0 -114
- data/lib/rumale/tree/base_decision_tree.rb +0 -150
- data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
- data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
- data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
- data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
- data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
- data/lib/rumale/tree/node.rb +0 -39
- data/lib/rumale/utils.rb +0 -42
- data/lib/rumale/validation.rb +0 -128
- data/lib/rumale/values.rb +0 -13
|
@@ -1,150 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/tree/base_decision_tree'
|
|
4
|
-
require 'rumale/base/classifier'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
module Tree
|
|
8
|
-
# DecisionTreeClassifier is a class that implements decision tree for classification.
|
|
9
|
-
#
|
|
10
|
-
# @example
|
|
11
|
-
# estimator =
|
|
12
|
-
# Rumale::Tree::DecisionTreeClassifier.new(
|
|
13
|
-
# criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
|
|
14
|
-
# estimator.fit(training_samples, training_labels)
|
|
15
|
-
# results = estimator.predict(testing_samples)
|
|
16
|
-
#
|
|
17
|
-
class DecisionTreeClassifier < BaseDecisionTree
|
|
18
|
-
include Base::Classifier
|
|
19
|
-
include ExtDecisionTreeClassifier
|
|
20
|
-
|
|
21
|
-
# Return the class labels.
|
|
22
|
-
# @return [Numo::Int32] (size: n_classes)
|
|
23
|
-
attr_reader :classes
|
|
24
|
-
|
|
25
|
-
# Return the importance for each feature.
|
|
26
|
-
# @return [Numo::DFloat] (size: n_features)
|
|
27
|
-
attr_reader :feature_importances
|
|
28
|
-
|
|
29
|
-
# Return the learned tree.
|
|
30
|
-
# @return [Node]
|
|
31
|
-
attr_reader :tree
|
|
32
|
-
|
|
33
|
-
# Return the random generator for random selection of feature index.
|
|
34
|
-
# @return [Random]
|
|
35
|
-
attr_reader :rng
|
|
36
|
-
|
|
37
|
-
# Return the labels assigned to each leaf.
|
|
38
|
-
# @return [Numo::Int32] (size: n_leafs)
|
|
39
|
-
attr_reader :leaf_labels
|
|
40
|
-
|
|
41
|
-
# Create a new classifier with decision tree algorithm.
|
|
42
|
-
#
|
|
43
|
-
# @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
|
|
44
|
-
# @param max_depth [Integer] The maximum depth of the tree.
|
|
45
|
-
# If nil is given, decision tree grows without concern for depth.
|
|
46
|
-
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
|
|
47
|
-
# If nil is given, number of leaves is not limited.
|
|
48
|
-
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
|
49
|
-
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
|
50
|
-
# If nil is given, split process considers all features.
|
|
51
|
-
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
|
52
|
-
# It is used to randomly determine the order of features when deciding the splitting point.
|
|
53
|
-
def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
|
|
54
|
-
random_seed: nil)
|
|
55
|
-
check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
56
|
-
max_features: max_features, random_seed: random_seed)
|
|
57
|
-
check_params_numeric(min_samples_leaf: min_samples_leaf)
|
|
58
|
-
check_params_string(criterion: criterion)
|
|
59
|
-
check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
60
|
-
min_samples_leaf: min_samples_leaf, max_features: max_features)
|
|
61
|
-
super
|
|
62
|
-
@leaf_labels = nil
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
# Fit the model with given training data.
|
|
66
|
-
#
|
|
67
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
68
|
-
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
|
69
|
-
# @return [DecisionTreeClassifier] The learned classifier itself.
|
|
70
|
-
def fit(x, y)
|
|
71
|
-
x = check_convert_sample_array(x)
|
|
72
|
-
y = check_convert_label_array(y)
|
|
73
|
-
check_sample_label_size(x, y)
|
|
74
|
-
n_samples, n_features = x.shape
|
|
75
|
-
@params[:max_features] = n_features if @params[:max_features].nil?
|
|
76
|
-
@params[:max_features] = [@params[:max_features], n_features].min
|
|
77
|
-
uniq_y = y.to_a.uniq.sort
|
|
78
|
-
@classes = Numo::Int32.asarray(uniq_y)
|
|
79
|
-
@n_leaves = 0
|
|
80
|
-
@leaf_labels = []
|
|
81
|
-
@sub_rng = @rng.dup
|
|
82
|
-
build_tree(x, y.map { |v| uniq_y.index(v) })
|
|
83
|
-
eval_importance(n_samples, n_features)
|
|
84
|
-
@leaf_labels = Numo::Int32[*@leaf_labels]
|
|
85
|
-
self
|
|
86
|
-
end
|
|
87
|
-
|
|
88
|
-
# Predict class labels for samples.
|
|
89
|
-
#
|
|
90
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
|
91
|
-
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
|
|
92
|
-
def predict(x)
|
|
93
|
-
x = check_convert_sample_array(x)
|
|
94
|
-
@leaf_labels[apply(x)].dup
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
# Predict probability for samples.
|
|
98
|
-
#
|
|
99
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
|
|
100
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
|
|
101
|
-
def predict_proba(x)
|
|
102
|
-
x = check_convert_sample_array(x)
|
|
103
|
-
Numo::DFloat[*(Array.new(x.shape[0]) { |n| partial_predict_proba(@tree, x[n, true]) })]
|
|
104
|
-
end
|
|
105
|
-
|
|
106
|
-
private
|
|
107
|
-
|
|
108
|
-
def partial_predict_proba(tree, sample)
|
|
109
|
-
node = tree
|
|
110
|
-
until node.leaf
|
|
111
|
-
# :nocov:
|
|
112
|
-
node = if node.right.nil?
|
|
113
|
-
node.left
|
|
114
|
-
elsif node.left.nil?
|
|
115
|
-
node.right
|
|
116
|
-
# :nocov:
|
|
117
|
-
else
|
|
118
|
-
sample[node.feature_id] <= node.threshold ? node.left : node.right
|
|
119
|
-
end
|
|
120
|
-
end
|
|
121
|
-
node.probs
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
def stop_growing?(y)
|
|
125
|
-
y[true, 0].to_a.uniq.size == 1
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
def put_leaf(node, y)
|
|
129
|
-
node.probs = y.flatten.bincount(minlength: @classes.size) / node.n_samples.to_f
|
|
130
|
-
node.leaf = true
|
|
131
|
-
node.leaf_id = @n_leaves
|
|
132
|
-
@n_leaves += 1
|
|
133
|
-
@leaf_labels.push(@classes[node.probs.max_index])
|
|
134
|
-
node
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
def best_split(features, y, whole_impurity)
|
|
138
|
-
order = features.sort_index
|
|
139
|
-
n_classes = @classes.size
|
|
140
|
-
find_split_params(@params[:criterion], whole_impurity, order, features, y[true, 0], n_classes)
|
|
141
|
-
end
|
|
142
|
-
|
|
143
|
-
def impurity(y)
|
|
144
|
-
n_elements = y.shape[0]
|
|
145
|
-
n_classes = @classes.size
|
|
146
|
-
node_impurity(@params[:criterion], y[true, 0].dup, n_elements, n_classes)
|
|
147
|
-
end
|
|
148
|
-
end
|
|
149
|
-
end
|
|
150
|
-
end
|
|
@@ -1,116 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/tree/base_decision_tree'
|
|
4
|
-
require 'rumale/base/regressor'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
module Tree
|
|
8
|
-
# DecisionTreeRegressor is a class that implements decision tree for regression.
|
|
9
|
-
#
|
|
10
|
-
# @example
|
|
11
|
-
# estimator =
|
|
12
|
-
# Rumale::Tree::DecisionTreeRegressor.new(
|
|
13
|
-
# max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
|
|
14
|
-
# estimator.fit(training_samples, training_values)
|
|
15
|
-
# results = estimator.predict(testing_samples)
|
|
16
|
-
#
|
|
17
|
-
class DecisionTreeRegressor < BaseDecisionTree
|
|
18
|
-
include Base::Regressor
|
|
19
|
-
include ExtDecisionTreeRegressor
|
|
20
|
-
|
|
21
|
-
# Return the importance for each feature.
|
|
22
|
-
# @return [Numo::DFloat] (size: n_features)
|
|
23
|
-
attr_reader :feature_importances
|
|
24
|
-
|
|
25
|
-
# Return the learned tree.
|
|
26
|
-
# @return [Node]
|
|
27
|
-
attr_reader :tree
|
|
28
|
-
|
|
29
|
-
# Return the random generator for random selection of feature index.
|
|
30
|
-
# @return [Random]
|
|
31
|
-
attr_reader :rng
|
|
32
|
-
|
|
33
|
-
# Return the values assigned to each leaf.
|
|
34
|
-
# @return [Numo::DFloat] (shape: [n_leafs, n_outputs])
|
|
35
|
-
attr_reader :leaf_values
|
|
36
|
-
|
|
37
|
-
# Create a new regressor with decision tree algorithm.
|
|
38
|
-
#
|
|
39
|
-
# @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
|
|
40
|
-
# @param max_depth [Integer] The maximum depth of the tree.
|
|
41
|
-
# If nil is given, decision tree grows without concern for depth.
|
|
42
|
-
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
|
|
43
|
-
# If nil is given, number of leaves is not limited.
|
|
44
|
-
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
|
45
|
-
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
|
46
|
-
# If nil is given, split process considers all features.
|
|
47
|
-
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
|
48
|
-
# It is used to randomly determine the order of features when deciding the splitting point.
|
|
49
|
-
def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
|
|
50
|
-
random_seed: nil)
|
|
51
|
-
check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
52
|
-
max_features: max_features, random_seed: random_seed)
|
|
53
|
-
check_params_numeric(min_samples_leaf: min_samples_leaf)
|
|
54
|
-
check_params_string(criterion: criterion)
|
|
55
|
-
check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
56
|
-
min_samples_leaf: min_samples_leaf, max_features: max_features)
|
|
57
|
-
super
|
|
58
|
-
@leaf_values = nil
|
|
59
|
-
end
|
|
60
|
-
|
|
61
|
-
# Fit the model with given training data.
|
|
62
|
-
#
|
|
63
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
64
|
-
# @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
|
|
65
|
-
# @return [DecisionTreeRegressor] The learned regressor itself.
|
|
66
|
-
def fit(x, y)
|
|
67
|
-
x = check_convert_sample_array(x)
|
|
68
|
-
y = check_convert_tvalue_array(y)
|
|
69
|
-
check_sample_tvalue_size(x, y)
|
|
70
|
-
n_samples, n_features = x.shape
|
|
71
|
-
@params[:max_features] = n_features if @params[:max_features].nil?
|
|
72
|
-
@params[:max_features] = [@params[:max_features], n_features].min
|
|
73
|
-
@n_leaves = 0
|
|
74
|
-
@leaf_values = []
|
|
75
|
-
@sub_rng = @rng.dup
|
|
76
|
-
build_tree(x, y)
|
|
77
|
-
eval_importance(n_samples, n_features)
|
|
78
|
-
@leaf_values = Numo::DFloat.cast(@leaf_values)
|
|
79
|
-
@leaf_values = @leaf_values.flatten.dup if @leaf_values.shape[1] == 1
|
|
80
|
-
self
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
# Predict values for samples.
|
|
84
|
-
#
|
|
85
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
|
|
86
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
|
|
87
|
-
def predict(x)
|
|
88
|
-
x = check_convert_sample_array(x)
|
|
89
|
-
@leaf_values.shape[1].nil? ? @leaf_values[apply(x)].dup : @leaf_values[apply(x), true].dup
|
|
90
|
-
end
|
|
91
|
-
|
|
92
|
-
private
|
|
93
|
-
|
|
94
|
-
def stop_growing?(y)
|
|
95
|
-
y.to_a.uniq.size == 1
|
|
96
|
-
end
|
|
97
|
-
|
|
98
|
-
def put_leaf(node, y)
|
|
99
|
-
node.probs = nil
|
|
100
|
-
node.leaf = true
|
|
101
|
-
node.leaf_id = @n_leaves
|
|
102
|
-
@n_leaves += 1
|
|
103
|
-
@leaf_values.push(y.mean(0))
|
|
104
|
-
node
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
def best_split(f, y, impurity)
|
|
108
|
-
find_split_params(@params[:criterion], impurity, f.sort_index, f, y)
|
|
109
|
-
end
|
|
110
|
-
|
|
111
|
-
def impurity(y)
|
|
112
|
-
node_impurity(@params[:criterion], y.to_a)
|
|
113
|
-
end
|
|
114
|
-
end
|
|
115
|
-
end
|
|
116
|
-
end
|
|
@@ -1,107 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/tree/decision_tree_classifier'
|
|
4
|
-
|
|
5
|
-
module Rumale
|
|
6
|
-
module Tree
|
|
7
|
-
# ExtraTreeClassifier is a class that implements extra randomized tree for classification.
|
|
8
|
-
#
|
|
9
|
-
# @example
|
|
10
|
-
# estimator =
|
|
11
|
-
# Rumale::Tree::ExtraTreeClassifier.new(
|
|
12
|
-
# criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
|
|
13
|
-
# estimator.fit(training_samples, training_labels)
|
|
14
|
-
# results = estimator.predict(testing_samples)
|
|
15
|
-
#
|
|
16
|
-
# *Reference*
|
|
17
|
-
# - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
|
|
18
|
-
class ExtraTreeClassifier < DecisionTreeClassifier
|
|
19
|
-
# Return the class labels.
|
|
20
|
-
# @return [Numo::Int32] (size: n_classes)
|
|
21
|
-
attr_reader :classes
|
|
22
|
-
|
|
23
|
-
# Return the importance for each feature.
|
|
24
|
-
# @return [Numo::DFloat] (size: n_features)
|
|
25
|
-
attr_reader :feature_importances
|
|
26
|
-
|
|
27
|
-
# Return the learned tree.
|
|
28
|
-
# @return [Node]
|
|
29
|
-
attr_reader :tree
|
|
30
|
-
|
|
31
|
-
# Return the random generator for random selection of feature index.
|
|
32
|
-
# @return [Random]
|
|
33
|
-
attr_reader :rng
|
|
34
|
-
|
|
35
|
-
# Return the labels assigned to each leaf.
|
|
36
|
-
# @return [Numo::Int32] (size: n_leafs)
|
|
37
|
-
attr_reader :leaf_labels
|
|
38
|
-
|
|
39
|
-
# Create a new classifier with extra randomized tree algorithm.
|
|
40
|
-
#
|
|
41
|
-
# @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
|
|
42
|
-
# @param max_depth [Integer] The maximum depth of the tree.
|
|
43
|
-
# If nil is given, extra tree grows without concern for depth.
|
|
44
|
-
# @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
|
|
45
|
-
# If nil is given, number of leaves is not limited.
|
|
46
|
-
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
|
47
|
-
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
|
48
|
-
# If nil is given, split process considers all features.
|
|
49
|
-
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
|
50
|
-
# It is used to randomly determine the order of features when deciding the splitting point.
|
|
51
|
-
def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
|
|
52
|
-
random_seed: nil)
|
|
53
|
-
check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
54
|
-
max_features: max_features, random_seed: random_seed)
|
|
55
|
-
check_params_numeric(min_samples_leaf: min_samples_leaf)
|
|
56
|
-
check_params_string(criterion: criterion)
|
|
57
|
-
check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
58
|
-
min_samples_leaf: min_samples_leaf, max_features: max_features)
|
|
59
|
-
super
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
# Fit the model with given training data.
|
|
63
|
-
#
|
|
64
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
65
|
-
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
|
66
|
-
# @return [ExtraTreeClassifier] The learned classifier itself.
|
|
67
|
-
def fit(x, y)
|
|
68
|
-
x = check_convert_sample_array(x)
|
|
69
|
-
y = check_convert_label_array(y)
|
|
70
|
-
check_sample_label_size(x, y)
|
|
71
|
-
super
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
# Predict class labels for samples.
|
|
75
|
-
#
|
|
76
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
|
77
|
-
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
|
|
78
|
-
def predict(x)
|
|
79
|
-
x = check_convert_sample_array(x)
|
|
80
|
-
super
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
# Predict probability for samples.
|
|
84
|
-
#
|
|
85
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
|
|
86
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
|
|
87
|
-
def predict_proba(x)
|
|
88
|
-
x = check_convert_sample_array(x)
|
|
89
|
-
super
|
|
90
|
-
end
|
|
91
|
-
|
|
92
|
-
private
|
|
93
|
-
|
|
94
|
-
def best_split(features, y, whole_impurity)
|
|
95
|
-
threshold = @sub_rng.rand(features.min..features.max)
|
|
96
|
-
l_ids = features.le(threshold).where
|
|
97
|
-
r_ids = features.gt(threshold).where
|
|
98
|
-
l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
|
|
99
|
-
r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids, true])
|
|
100
|
-
gain = whole_impurity -
|
|
101
|
-
l_impurity * l_ids.size.fdiv(y.shape[0]) -
|
|
102
|
-
r_impurity * r_ids.size.fdiv(y.shape[0])
|
|
103
|
-
[l_impurity, r_impurity, threshold, gain]
|
|
104
|
-
end
|
|
105
|
-
end
|
|
106
|
-
end
|
|
107
|
-
end
|
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/tree/decision_tree_regressor'
|
|
4
|
-
|
|
5
|
-
module Rumale
|
|
6
|
-
module Tree
|
|
7
|
-
# ExtraTreeRegressor is a class that implements extra randomized tree for regression.
|
|
8
|
-
#
|
|
9
|
-
# @example
|
|
10
|
-
# estimator =
|
|
11
|
-
# Rumale::Tree::ExtraTreeRegressor.new(
|
|
12
|
-
# max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
|
|
13
|
-
# estimator.fit(training_samples, training_values)
|
|
14
|
-
# results = estimator.predict(testing_samples)
|
|
15
|
-
#
|
|
16
|
-
# *Reference*
|
|
17
|
-
# - Geurts, P., Ernst, D., and Wehenkel, L., "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
|
|
18
|
-
class ExtraTreeRegressor < DecisionTreeRegressor
|
|
19
|
-
# Return the importance for each feature.
|
|
20
|
-
# @return [Numo::DFloat] (size: n_features)
|
|
21
|
-
attr_reader :feature_importances
|
|
22
|
-
|
|
23
|
-
# Return the learned tree.
|
|
24
|
-
# @return [Node]
|
|
25
|
-
attr_reader :tree
|
|
26
|
-
|
|
27
|
-
# Return the random generator for random selection of feature index.
|
|
28
|
-
# @return [Random]
|
|
29
|
-
attr_reader :rng
|
|
30
|
-
|
|
31
|
-
# Return the values assigned to each leaf.
|
|
32
|
-
# @return [Numo::DFloat] (shape: [n_leafs, n_outputs])
|
|
33
|
-
attr_reader :leaf_values
|
|
34
|
-
|
|
35
|
-
# Create a new regressor with extra randomized tree algorithm.
|
|
36
|
-
#
|
|
37
|
-
# @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
|
|
38
|
-
# @param max_depth [Integer] The maximum depth of the tree.
|
|
39
|
-
# If nil is given, extra tree grows without concern for depth.
|
|
40
|
-
# @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
|
|
41
|
-
# If nil is given, number of leaves is not limited.
|
|
42
|
-
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
|
43
|
-
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
|
44
|
-
# If nil is given, split process considers all features.
|
|
45
|
-
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
|
46
|
-
# It is used to randomly determine the order of features when deciding the splitting point.
|
|
47
|
-
def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
|
|
48
|
-
random_seed: nil)
|
|
49
|
-
check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
50
|
-
max_features: max_features, random_seed: random_seed)
|
|
51
|
-
check_params_numeric(min_samples_leaf: min_samples_leaf)
|
|
52
|
-
check_params_string(criterion: criterion)
|
|
53
|
-
check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
|
|
54
|
-
min_samples_leaf: min_samples_leaf, max_features: max_features)
|
|
55
|
-
super
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
# Fit the model with given training data.
|
|
59
|
-
#
|
|
60
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
61
|
-
# @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
|
|
62
|
-
# @return [ExtraTreeRegressor] The learned regressor itself.
|
|
63
|
-
def fit(x, y)
|
|
64
|
-
x = check_convert_sample_array(x)
|
|
65
|
-
y = check_convert_tvalue_array(y)
|
|
66
|
-
check_sample_tvalue_size(x, y)
|
|
67
|
-
super
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
# Predict values for samples.
|
|
71
|
-
#
|
|
72
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
|
|
73
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
|
|
74
|
-
def predict(x)
|
|
75
|
-
x = check_convert_sample_array(x)
|
|
76
|
-
super
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
private
|
|
80
|
-
|
|
81
|
-
def best_split(features, y, whole_impurity)
|
|
82
|
-
threshold = @sub_rng.rand(features.min..features.max)
|
|
83
|
-
l_ids = features.le(threshold).where
|
|
84
|
-
r_ids = features.gt(threshold).where
|
|
85
|
-
l_impurity = l_ids.empty? ? 0.0 : impurity(y[l_ids, true])
|
|
86
|
-
r_impurity = r_ids.empty? ? 0.0 : impurity(y[r_ids, true])
|
|
87
|
-
gain = whole_impurity -
|
|
88
|
-
l_impurity * l_ids.size.fdiv(y.shape[0]) -
|
|
89
|
-
r_impurity * r_ids.size.fdiv(y.shape[0])
|
|
90
|
-
[l_impurity, r_impurity, threshold, gain]
|
|
91
|
-
end
|
|
92
|
-
end
|
|
93
|
-
end
|
|
94
|
-
end
|
|
@@ -1,202 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/base_estimator'
|
|
4
|
-
require 'rumale/base/regressor'
|
|
5
|
-
require 'rumale/rumaleext'
|
|
6
|
-
require 'rumale/tree/node'
|
|
7
|
-
|
|
8
|
-
module Rumale
|
|
9
|
-
module Tree
|
|
10
|
-
# GradientTreeRegressor is a class that implements a decision tree for regression with the exact greedy algorithm.
|
|
11
|
-
# This class is used internally for estimators with gradient tree boosting.
|
|
12
|
-
#
|
|
13
|
-
# *Reference*
|
|
14
|
-
# - Friedman, J H., "Greedy Function Approximation: A Gradient Boosting Machine," Annals of Statistics, 29 (5), pp. 1189--1232, 2001.
|
|
15
|
-
# - Friedman, J H., "Stochastic Gradient Boosting," Computational Statistics and Data Analysis, 38 (4), pp. 367--378, 2002.
|
|
16
|
-
# - Chen, T., and Guestrin, C., "XGBoost: A Scalable Tree Boosting System," Proc. KDD'16, pp. 785--794, 2016.
|
|
17
|
-
class GradientTreeRegressor
|
|
18
|
-
include Base::BaseEstimator
|
|
19
|
-
include Base::Regressor
|
|
20
|
-
include ExtGradientTreeRegressor
|
|
21
|
-
|
|
22
|
-
# Return the importance for each feature.
|
|
23
|
-
# The feature importances are calculated based on the numbers of times the feature is used for splitting.
|
|
24
|
-
# @return [Numo::DFloat] (shape: [n_features])
|
|
25
|
-
attr_reader :feature_importances
|
|
26
|
-
|
|
27
|
-
# Return the learned tree.
|
|
28
|
-
# @return [Node]
|
|
29
|
-
attr_reader :tree
|
|
30
|
-
|
|
31
|
-
# Return the random generator for random selection of feature index.
|
|
32
|
-
# @return [Random]
|
|
33
|
-
attr_reader :rng
|
|
34
|
-
|
|
35
|
-
# Return the values assigned to each leaf.
|
|
36
|
-
# @return [Numo::DFloat] (shape: [n_leaves])
|
|
37
|
-
attr_reader :leaf_weights
|
|
38
|
-
|
|
39
|
-
# Initialize a gradient tree regressor
|
|
40
|
-
#
|
|
41
|
-
# @param reg_lambda [Float] The L2 regularization term on weight.
|
|
42
|
-
# @param shrinkage_rate [Float] The shrinkage rate for weight.
|
|
43
|
-
# @param max_depth [Integer] The maximum depth of the tree.
|
|
44
|
-
# If nil is given, decision tree grows without concern for depth.
|
|
45
|
-
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
|
|
46
|
-
# If nil is given, number of leaves is not limited.
|
|
47
|
-
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
|
48
|
-
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
|
49
|
-
# If nil is given, split process considers all features.
|
|
50
|
-
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
|
51
|
-
# It is used to randomly determine the order of features when deciding the splitting point.
|
|
52
|
-
def initialize(reg_lambda: 0.0, shrinkage_rate: 1.0,
               max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil, random_seed: nil)
  # Run the shared parameter validators before storing anything.
  check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                              max_features: max_features, random_seed: random_seed)
  check_params_numeric(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate, min_samples_leaf: min_samples_leaf)
  check_params_positive(reg_lambda: reg_lambda, shrinkage_rate: shrinkage_rate,
                        max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                        min_samples_leaf: min_samples_leaf, max_features: max_features)

  # Hyperparameters are kept in a single hash. When no seed is supplied, the
  # value returned by Kernel#srand is stored so the generator below is seeded
  # with a known, recorded integer.
  @params = {
    reg_lambda: reg_lambda,
    shrinkage_rate: shrinkage_rate,
    max_depth: max_depth,
    max_leaf_nodes: max_leaf_nodes,
    min_samples_leaf: min_samples_leaf,
    max_features: max_features,
    random_seed: random_seed || srand
  }
  @rng = Random.new(@params[:random_seed])

  # Learned state; populated by #fit.
  @tree = nil
  @n_leaves = nil
  @leaf_weights = nil
  @feature_importances = nil
end
|
|
75
|
-
|
|
76
|
-
# Fit the model with given training data.
|
|
77
|
-
#
|
|
78
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
79
|
-
# @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
|
|
80
|
-
# @param g [Numo::DFloat] (shape: [n_samples]) The gradient of loss function.
|
|
81
|
-
# @param h [Numo::DFloat] (shape: [n_samples]) The hessian of loss function.
|
|
82
|
-
# @return [GradientTreeRegressor] The learned regressor itself.
|
|
83
|
-
def fit(x, y, g, h)
  # Convert the inputs, in the order x, y, g, h, then check x against y.
  x = check_convert_sample_array(x)
  y, g, h = [y, g, h].map { |values| check_convert_tvalue_array(values) }
  check_sample_tvalue_size(x, y)

  # Reset everything a previous call to fit may have left behind.
  feature_count = x.shape[1]
  @params[:max_features] ||= feature_count
  @feature_importances = Numo::DFloat.zeros(feature_count)
  @leaf_weights = []
  @n_leaves = 0
  # Work on a copy of the generator so @rng itself is not advanced by fitting.
  @sub_rng = @rng.dup

  # Grow the tree; leaf weights are collected in a plain Array while growing
  # and packed into a Numo::DFloat afterwards.
  build_tree(x, y, g, h)
  @leaf_weights = Numo::DFloat[*@leaf_weights]
  self
end
|
|
101
|
-
|
|
102
|
-
# Predict values for samples.
|
|
103
|
-
#
|
|
104
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
|
|
105
|
-
# @return [Numo::DFloat] (size: n_samples) Predicted values per sample.
|
|
106
|
-
def predict(x)
  samples = check_convert_sample_array(x)
  # Look up the weight of the leaf each sample lands in; the result is
  # duplicated so the caller never receives the lookup result itself.
  reached_leaves = apply(samples)
  @leaf_weights[reached_leaves].dup
end
|
|
110
|
-
|
|
111
|
-
# Return the index of the leaf that each sample reached.
|
|
112
|
-
#
|
|
113
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples for which to find the leaf indices.
|
|
114
|
-
# @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
|
|
115
|
-
def apply(x)
  samples = check_convert_sample_array(x)
  # Walk the tree once per row and collect the id of the leaf that is reached.
  leaf_ids = samples.shape[0].times.map { |row| partial_apply(@tree, samples[row, true]) }
  Numo::Int32[*leaf_ids]
end
|
|
119
|
-
|
|
120
|
-
private
|
|
121
|
-
|
|
122
|
-
# Walk from +tree+ down to a leaf for a single sample and return that leaf's id.
#
# @param tree [Node] The node at which the walk starts.
# @param sample [#[]] Feature values of one sample, indexed by feature id.
# @return [Integer] The leaf_id of the leaf that was reached.
def partial_apply(tree, sample)
  current = tree
  loop do
    break if current.leaf

    left_child = current.left
    right_child = current.right
    # :nocov:
    # A node with a single child forwards every sample to that child.
    if right_child.nil?
      current = left_child
      next
    end
    if left_child.nil?
      current = right_child
      next
    end
    # :nocov:
    # Values less than or equal to the threshold go left, larger ones go right.
    goes_left = sample[current.feature_id] <= current.threshold
    current = goes_left ? left_child : right_child
  end
  current.leaf_id
end
|
|
137
|
-
|
|
138
|
-
# Grow the tree from the root and store it in @tree.
# @feature_ids holds the column indices 0...n_features only while the tree is
# being grown; it is cleared again afterwards. Always returns nil.
def build_tree(x, y, g, h)
  n_features = x.shape[1]
  @feature_ids = (0...n_features).to_a
  @tree = grow_node(0, x, y, g, h)
  @feature_ids = nil
  nil
end
|
|
144
|
-
|
|
145
|
-
# Recursively grow the subtree for the samples that reached this position.
# Returns the new node, or nil when no node may be created here (the leaf
# budget is used up, or there are fewer samples than :min_samples_leaf).
def grow_node(depth, x, y, g, h) # rubocop:disable Metrics/AbcSize
  grad_total = g.sum
  hess_total = h.sum
  n_samples = x.shape[0]
  node = Node.new(depth: depth, n_samples: n_samples)

  # Cases in which no node is produced at all.
  leaf_cap = @params[:max_leaf_nodes]
  min_leaf = @params[:min_samples_leaf]
  return nil if (!leaf_cap.nil? && @n_leaves >= leaf_cap) || n_samples < min_leaf

  # Cases in which the node is closed as a leaf without searching for a split.
  depth_cap = @params[:max_depth]
  at_sample_floor = n_samples == min_leaf
  at_depth_cap = !depth_cap.nil? && depth == depth_cap
  return put_leaf(node, grad_total, hess_total) if at_sample_floor || at_depth_cap || stop_growing?(y)

  # Score each randomly drawn feature and keep the candidate whose last
  # element (the gain) is largest.
  scored = rand_ids.map do |candidate|
    [candidate, *best_split(x[true, candidate], g, h, grad_total, hess_total)]
  end
  feature_id, threshold, gain = scored.max_by(&:last)
  return put_leaf(node, grad_total, hess_total) if gain.nil? || gain.zero?

  # Partition the samples on the chosen threshold and grow both children.
  column = x[true, feature_id]
  left_ids = column.le(threshold).where
  right_ids = column.gt(threshold).where
  node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids], g[left_ids], h[left_ids])
  node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids], g[right_ids], h[right_ids])

  # If neither child could be created, this node becomes a leaf itself.
  return put_leaf(node, grad_total, hess_total) if node.left.nil? && node.right.nil?

  # Importance counts how many times a feature was used for a split.
  @feature_importances[feature_id] += 1.0
  node.feature_id = feature_id
  node.threshold = threshold
  node.leaf = false
  node
end
|
|
178
|
-
|
|
179
|
-
# Tell whether splitting is pointless because +y+ holds exactly one distinct value.
#
# @param y [#to_a] Target values of the samples in the current node.
# @return [Boolean]
def stop_growing?(y)
  distinct_targets = y.to_a.uniq
  distinct_targets.length == 1
end
|
|
182
|
-
|
|
183
|
-
# Turn +node+ into a leaf, give it the next free leaf id and record its weight.
#
# The weight is -shrinkage_rate * sum_g / (sum_h + reg_lambda). When the
# denominator is zero (for example, hessians summing to 0 with reg_lambda 0.0)
# that quotient is NaN or infinite, which would contaminate every prediction
# routed to this leaf; such a leaf gets weight 0.0 instead.
#
# @param node [Node] The node to finalize as a leaf.
# @param sum_g [Numeric] Sum of the gradients of the samples in the node.
# @param sum_h [Numeric] Sum of the hessians of the samples in the node.
# @return [Node] The same node, now marked as a leaf.
def put_leaf(node, sum_g, sum_h)
  node.probs = nil
  node.leaf = true
  node.leaf_id = @n_leaves
  denominator = sum_h + @params[:reg_lambda]
  weight = denominator.zero? ? 0.0 : -@params[:shrinkage_rate] * sum_g / denominator
  # The position in @leaf_weights equals the leaf id assigned above.
  @leaf_weights.push(weight)
  @n_leaves += 1
  node
end
|
|
192
|
-
|
|
193
|
-
# Search for a split along one feature column.
# Passes the column's sort order, the column itself, the gradients and hessians
# with their sums, and the :reg_lambda parameter to find_split_params, and
# returns whatever that helper returns.
def best_split(f, g, h, sum_g, sum_h)
  sorted_order = f.sort_index
  find_split_params(sorted_order, f, g, h, sum_g, sum_h, @params[:reg_lambda])
end
|
|
196
|
-
|
|
197
|
-
# Draw the feature indices to examine for the next split: :max_features of
# them, sampled from @feature_ids with the generator in @sub_rng.
def rand_ids
  subset_size = @params[:max_features]
  @feature_ids.sample(subset_size, random: @sub_rng)
end
|
|
200
|
-
end
|
|
201
|
-
end
|
|
202
|
-
end
|