rumale-ensemble 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +27 -0
- data/README.md +34 -0
- data/lib/rumale/ensemble/ada_boost_classifier.rb +176 -0
- data/lib/rumale/ensemble/ada_boost_regressor.rb +167 -0
- data/lib/rumale/ensemble/extra_trees_classifier.rb +140 -0
- data/lib/rumale/ensemble/extra_trees_regressor.rb +125 -0
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +296 -0
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +223 -0
- data/lib/rumale/ensemble/random_forest_classifier.rb +184 -0
- data/lib/rumale/ensemble/random_forest_regressor.rb +146 -0
- data/lib/rumale/ensemble/stacking_classifier.rb +224 -0
- data/lib/rumale/ensemble/stacking_regressor.rb +168 -0
- data/lib/rumale/ensemble/value.rb +13 -0
- data/lib/rumale/ensemble/version.rb +10 -0
- data/lib/rumale/ensemble/voting_classifier.rb +129 -0
- data/lib/rumale/ensemble/voting_regressor.rb +84 -0
- data/lib/rumale/ensemble.rb +20 -0
- metadata +152 -0
@@ -0,0 +1,184 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/validation'
|
4
|
+
require 'rumale/base/estimator'
|
5
|
+
require 'rumale/base/classifier'
|
6
|
+
require 'rumale/tree/decision_tree_classifier'
|
7
|
+
require 'rumale/ensemble/value'
|
8
|
+
|
9
|
+
module Rumale
|
10
|
+
# This module consists of the classes that implement ensemble-based methods.
|
11
|
+
module Ensemble
|
12
|
+
# RandomForestClassifier is a class that implements random forest for classification.
|
13
|
+
#
|
14
|
+
# @example
|
15
|
+
# require 'rumale/ensemble/random_forest_classifier'
|
16
|
+
#
|
17
|
+
# estimator =
|
18
|
+
# Rumale::Ensemble::RandomForestClassifier.new(
|
19
|
+
# n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
|
20
|
+
# estimator.fit(training_samples, training_labels)
|
21
|
+
# results = estimator.predict(testing_samples)
|
22
|
+
#
|
23
|
+
class RandomForestClassifier < ::Rumale::Base::Estimator
|
24
|
+
include ::Rumale::Base::Classifier
|
25
|
+
|
26
|
+
# Return the set of estimators.
|
27
|
+
# @return [Array<DecisionTreeClassifier>]
|
28
|
+
attr_reader :estimators
|
29
|
+
|
30
|
+
# Return the class labels.
|
31
|
+
# @return [Numo::Int32] (size: n_classes)
|
32
|
+
attr_reader :classes
|
33
|
+
|
34
|
+
# Return the importance for each feature.
|
35
|
+
# @return [Numo::DFloat] (size: n_features)
|
36
|
+
attr_reader :feature_importances
|
37
|
+
|
38
|
+
# Return the random generator for random selection of feature index.
|
39
|
+
# @return [Random]
|
40
|
+
attr_reader :rng
|
41
|
+
|
42
|
+
# Create a new classifier with random forest.
|
43
|
+
#
|
44
|
+
# @param n_estimators [Integer] The number of decision trees for constructing random forest.
|
45
|
+
# @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
|
46
|
+
# @param max_depth [Integer] The maximum depth of the tree.
|
47
|
+
# If nil is given, decision tree grows without concern for depth.
|
48
|
+
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
|
49
|
+
# If nil is given, number of leaves is not limited.
|
50
|
+
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
51
|
+
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
52
|
+
# If nil is given, split process considers 'Math.sqrt(n_features)' features.
|
53
|
+
# @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
|
54
|
+
# If nil is given, the method does not execute in parallel.
|
55
|
+
# If zero or less is given, it becomes equal to the number of processors.
|
56
|
+
# This parameter is ignored if the Parallel gem is not loaded.
|
57
|
+
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
58
|
+
# It is used to randomly determine the order of features when deciding splitting point.
|
59
|
+
def initialize(n_estimators: 10,
               criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
               max_features: nil, n_jobs: nil, random_seed: nil)
  super()
  # Without an explicit seed, use the value returned by Kernel#srand.
  seed = random_seed || srand
  @params = {
    n_estimators: n_estimators,
    criterion: criterion,
    max_depth: max_depth,
    max_leaf_nodes: max_leaf_nodes,
    min_samples_leaf: min_samples_leaf,
    max_features: max_features,
    n_jobs: n_jobs,
    random_seed: seed
  }
  # Generator from which the per-tree seeds are derived in #fit.
  @rng = Random.new(seed)
end
|
75
|
+
|
76
|
+
# Fit the model with given training data.
|
77
|
+
#
|
78
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
79
|
+
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
80
|
+
# @return [RandomForestClassifier] The learned classifier itself.
|
81
|
+
def fit(x, y)
  x = ::Rumale::Validation.check_convert_sample_array(x)
  y = ::Rumale::Validation.check_convert_label_array(y)
  ::Rumale::Validation.check_sample_size(x, y)

  n_samples, n_features = x.shape
  # Default to sqrt(n_features) candidate features, then clamp into 1..n_features.
  @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
  @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
  @classes = Numo::Int32.asarray(y.to_a.uniq.sort)

  # Per-tree generators are seeded from a copy of @rng, so @rng itself is left untouched.
  sub_rng = @rng.dup
  rngs = Array.new(@params[:n_estimators]) { Random.new(sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)) }

  # Grow the n-th tree on a bootstrap sample (n_samples indices drawn with replacement).
  grow_tree = lambda do |n|
    bootstrap_ids = Array.new(n_samples) { rngs[n].rand(0...n_samples) }
    plant_tree(rngs[n].seed).fit(x[bootstrap_ids, true], y[bootstrap_ids])
  end

  # Construct forest.
  @estimators =
    if enable_parallel?
      parallel_map(@params[:n_estimators]) { |n| grow_tree.call(n) }
    else
      Array.new(@params[:n_estimators]) { |n| grow_tree.call(n) }
    end

  @feature_importances =
    if enable_parallel?
      parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.sum
    else
      @estimators.sum(&:feature_importances)
    end
  # Normalise to sum to one. When no tree produced any importance the sum is zero;
  # dividing would fill the vector with NaN, so the zeros are kept as they are.
  importance_sum = @feature_importances.sum
  @feature_importances /= importance_sum if importance_sum.positive?
  self
end
|
115
|
+
|
116
|
+
# Predict class labels for samples.
|
117
|
+
#
|
118
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
119
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
|
120
|
+
def predict(x)
  x = ::Rumale::Validation.check_convert_sample_array(x)

  n_samples = x.shape[0]
  n_estimators = @estimators.size
  # Majority vote over the labels the trees gave one sample.
  majority = ->(votes) { votes.group_by { |v| v }.max_by { |_label, members| members.size }.first }
  predicted =
    if enable_parallel?
      # Rows become samples and columns become trees after the transpose.
      votes_per_sample = parallel_map(n_estimators) { |n| @estimators[n].predict(x).to_a }.transpose
      parallel_map(n_samples) { |n| majority.call(votes_per_sample[n]) }
    else
      votes_per_sample = @estimators.map { |tree| tree.predict(x).to_a }.transpose
      Array.new(n_samples) { |n| majority.call(votes_per_sample[n]) }
    end
  Numo::Int32.asarray(predicted)
end
|
134
|
+
|
135
|
+
# Predict probability for samples.
|
136
|
+
#
|
137
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
|
138
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
|
139
|
+
def predict_proba(x)
  x = ::Rumale::Validation.check_convert_sample_array(x)

  n_estimators = @estimators.size
  # Sum the per-tree probability matrices, then average over the trees.
  summed =
    if enable_parallel?
      parallel_map(n_estimators) { |n| predict_proba_tree(@estimators[n], x) }.sum
    else
      @estimators.sum { |tree| predict_proba_tree(tree, x) }
    end
  summed / n_estimators
end
|
149
|
+
|
150
|
+
# Return the index of the leaf that each sample reached.
|
151
|
+
#
|
152
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
153
|
+
# @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
|
154
|
+
def apply(x)
  x = ::Rumale::Validation.check_convert_sample_array(x)

  # Iterate over the trees that were actually fitted (as #predict does) rather than
  # the configured count, so a changed @params[:n_estimators] cannot index past the forest.
  leaf_ids = @estimators.map { |tree| tree.apply(x) }
  # One row per tree is collected; transposing puts the samples on the rows.
  Numo::Int32[*leaf_ids].transpose.dup
end
|
159
|
+
|
160
|
+
private
|
161
|
+
|
162
|
+
def plant_tree(rnd_seed)
  # Every tree shares the forest's hyper-parameters and differs only by its seed.
  tree_options = {
    criterion: @params[:criterion],
    max_depth: @params[:max_depth],
    max_leaf_nodes: @params[:max_leaf_nodes],
    min_samples_leaf: @params[:min_samples_leaf],
    max_features: @params[:max_features],
    random_seed: rnd_seed
  }
  ::Rumale::Tree::DecisionTreeClassifier.new(**tree_options)
end
|
169
|
+
|
170
|
+
def predict_proba_tree(tree, x)
  forest_labels = @classes.to_a
  # Position of each of the tree's labels within the forest's label list.
  column_of = tree.classes.map { |label| forest_labels.index(label) }
  # Columns for labels that are absent from tree.classes stay zero.
  probs = Numo::DFloat.zeros(x.shape[0], forest_labels.size)
  tree_probs = tree.predict_proba(x)
  column_of.each_with_index do |forest_col, tree_col|
    probs[true, forest_col] = tree_probs[true, tree_col]
  end
  probs
end
|
182
|
+
end
|
183
|
+
end
|
184
|
+
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/validation'
|
4
|
+
require 'rumale/base/estimator'
|
5
|
+
require 'rumale/base/regressor'
|
6
|
+
require 'rumale/tree/decision_tree_regressor'
|
7
|
+
require 'rumale/ensemble/value'
|
8
|
+
|
9
|
+
module Rumale
|
10
|
+
module Ensemble
|
11
|
+
# RandomForestRegressor is a class that implements random forest for regression
|
12
|
+
#
|
13
|
+
# @example
|
14
|
+
# require 'rumale/ensemble/random_forest_regressor'
|
15
|
+
#
|
16
|
+
# estimator =
|
17
|
+
# Rumale::Ensemble::RandomForestRegressor.new(
|
18
|
+
# n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
|
19
|
+
# estimator.fit(training_samples, training_values)
|
20
|
+
# results = estimator.predict(testing_samples)
|
21
|
+
#
|
22
|
+
class RandomForestRegressor < ::Rumale::Base::Estimator
|
23
|
+
include ::Rumale::Base::Regressor
|
24
|
+
|
25
|
+
# Return the set of estimators.
|
26
|
+
# @return [Array<DecisionTreeRegressor>]
|
27
|
+
attr_reader :estimators
|
28
|
+
|
29
|
+
# Return the importance for each feature.
|
30
|
+
# @return [Numo::DFloat] (size: n_features)
|
31
|
+
attr_reader :feature_importances
|
32
|
+
|
33
|
+
# Return the random generator for random selection of feature index.
|
34
|
+
# @return [Random]
|
35
|
+
attr_reader :rng
|
36
|
+
|
37
|
+
# Create a new regressor with random forest.
|
38
|
+
#
|
39
|
+
# @param n_estimators [Integer] The number of decision trees for constructing random forest.
|
40
|
+
# @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mse' and 'mae'.
|
41
|
+
# @param max_depth [Integer] The maximum depth of the tree.
|
42
|
+
# If nil is given, decision tree grows without concern for depth.
|
43
|
+
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
|
44
|
+
# If nil is given, number of leaves is not limited.
|
45
|
+
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
46
|
+
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
47
|
+
# If nil is given, split process considers 'Math.sqrt(n_features)' features.
|
48
|
+
# @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
|
49
|
+
# If nil is given, the methods do not execute in parallel.
|
50
|
+
# If zero or less is given, it becomes equal to the number of processors.
|
51
|
+
# This parameter is ignored if the Parallel gem is not loaded.
|
52
|
+
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
53
|
+
# It is used to randomly determine the order of features when deciding splitting point.
|
54
|
+
def initialize(n_estimators: 10,
               criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
               max_features: nil, n_jobs: nil, random_seed: nil)
  super()
  # Without an explicit seed, use the value returned by Kernel#srand.
  seed = random_seed || srand
  @params = {
    n_estimators: n_estimators,
    criterion: criterion,
    max_depth: max_depth,
    max_leaf_nodes: max_leaf_nodes,
    min_samples_leaf: min_samples_leaf,
    max_features: max_features,
    n_jobs: n_jobs,
    random_seed: seed
  }
  # Generator from which the per-tree seeds are derived in #fit.
  @rng = Random.new(seed)
end
|
70
|
+
|
71
|
+
# Fit the model with given training data.
|
72
|
+
#
|
73
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
74
|
+
# @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
|
75
|
+
# @return [RandomForestRegressor] The learned regressor itself.
|
76
|
+
def fit(x, y)
  x = ::Rumale::Validation.check_convert_sample_array(x)
  y = ::Rumale::Validation.check_convert_target_value_array(y)
  ::Rumale::Validation.check_sample_size(x, y)

  n_samples, n_features = x.shape
  # Default to sqrt(n_features) candidate features, then clamp into 1..n_features.
  @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
  @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
  # A target array without a second axis is indexed with a single subscript.
  single_target = y.shape[1].nil?

  # Per-tree generators are seeded from a copy of @rng, so @rng itself is left untouched.
  sub_rng = @rng.dup
  rngs = Array.new(@params[:n_estimators]) { Random.new(sub_rng.rand(::Rumale::Ensemble::Value::SEED_BASE)) }

  # Grow the n-th tree on a bootstrap sample (n_samples indices drawn with replacement).
  grow_tree = lambda do |n|
    bootstrap_ids = Array.new(n_samples) { rngs[n].rand(0...n_samples) }
    y_boot = single_target ? y[bootstrap_ids] : y[bootstrap_ids, true]
    plant_tree(rngs[n].seed).fit(x[bootstrap_ids, true], y_boot)
  end

  # Construct forest.
  @estimators =
    if enable_parallel?
      parallel_map(@params[:n_estimators]) { |n| grow_tree.call(n) }
    else
      Array.new(@params[:n_estimators]) { |n| grow_tree.call(n) }
    end

  @feature_importances =
    if enable_parallel?
      parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.sum
    else
      @estimators.sum(&:feature_importances)
    end
  # Normalise to sum to one. When no tree produced any importance the sum is zero;
  # dividing would fill the vector with NaN, so the zeros are kept as they are.
  importance_sum = @feature_importances.sum
  @feature_importances /= importance_sum if importance_sum.positive?
  self
end
|
110
|
+
|
111
|
+
# Predict values for samples.
|
112
|
+
#
|
113
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
|
114
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
|
115
|
+
def predict(x)
  x = ::Rumale::Validation.check_convert_sample_array(x)

  # Average over the trees that were actually fitted; dividing by the configured
  # @params[:n_estimators] would give a wrong mean if it differs from the fitted forest.
  n_trees = @estimators.size
  summed =
    if enable_parallel?
      parallel_map(n_trees) { |n| @estimators[n].predict(x) }.sum
    else
      @estimators.sum { |tree| tree.predict(x) }
    end
  summed / n_trees
end
|
124
|
+
|
125
|
+
# Return the index of the leaf that each sample reached.
|
126
|
+
#
|
127
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
|
128
|
+
# @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
|
129
|
+
def apply(x)
  x = ::Rumale::Validation.check_convert_sample_array(x)

  # Iterate over the trees that were actually fitted rather than the configured
  # count, so a changed @params[:n_estimators] cannot index past the forest.
  leaf_ids = @estimators.map { |tree| tree.apply(x) }
  # One row per tree is collected; transposing puts the samples on the rows.
  Numo::Int32[*leaf_ids].transpose.dup
end
|
134
|
+
|
135
|
+
private
|
136
|
+
|
137
|
+
def plant_tree(rnd_seed)
  # Every tree shares the forest's hyper-parameters and differs only by its seed.
  tree_options = {
    criterion: @params[:criterion],
    max_depth: @params[:max_depth],
    max_leaf_nodes: @params[:max_leaf_nodes],
    min_samples_leaf: @params[:min_samples_leaf],
    max_features: @params[:max_features],
    random_seed: rnd_seed
  }
  ::Rumale::Tree::DecisionTreeRegressor.new(**tree_options)
end
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
@@ -0,0 +1,224 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/validation'
|
4
|
+
require 'rumale/base/estimator'
|
5
|
+
require 'rumale/base/classifier'
|
6
|
+
require 'rumale/linear_model/logistic_regression'
|
7
|
+
require 'rumale/model_selection/stratified_k_fold'
|
8
|
+
require 'rumale/preprocessing/label_encoder'
|
9
|
+
|
10
|
+
module Rumale
|
11
|
+
module Ensemble
|
12
|
+
# StackingClassifier is a class that implements classifier with stacking method.
|
13
|
+
#
|
14
|
+
# @example
|
15
|
+
# require 'rumale/ensemble/stacking_classifier'
|
16
|
+
#
|
17
|
+
# estimators = {
|
18
|
+
# lgr: Rumale::LinearModel::LogisticRegression.new(reg_param: 1e-2, random_seed: 1),
|
19
|
+
# mlp: Rumale::NeuralNetwork::MLPClassifier.new(hidden_units: [256], random_seed: 1),
|
20
|
+
# rnd: Rumale::Ensemble::RandomForestClassifier.new(random_seed: 1)
|
21
|
+
# }
|
22
|
+
# meta_estimator = Rumale::LinearModel::LogisticRegression.new(random_seed: 1)
|
23
|
+
# classifier = Rumale::Ensemble::StackingClassifier.new(
|
24
|
+
# estimators: estimators, meta_estimator: meta_estimator, random_seed: 1
|
25
|
+
# )
|
26
|
+
# classifier.fit(training_samples, training_labels)
|
27
|
+
# results = classifier.predict(testing_samples)
|
28
|
+
#
|
29
|
+
# *Reference*
|
30
|
+
# - Zhou, Z-H., "Ensemble Methods - Foundations and Algorithms," CRC Press Taylor and Francis Group, Chapman and Hall/CRC, 2012.
|
31
|
+
class StackingClassifier < ::Rumale::Base::Estimator
|
32
|
+
include ::Rumale::Base::Classifier
|
33
|
+
|
34
|
+
# Return the base classifiers.
|
35
|
+
# @return [Hash<Symbol,Classifier>]
|
36
|
+
attr_reader :estimators
|
37
|
+
|
38
|
+
# Return the meta classifier.
|
39
|
+
# @return [Classifier]
|
40
|
+
attr_reader :meta_estimator
|
41
|
+
|
42
|
+
# Return the class labels.
|
43
|
+
# @return [Numo::Int32] (size: n_classes)
|
44
|
+
attr_reader :classes
|
45
|
+
|
46
|
+
# Return the method used by each base classifier.
|
47
|
+
# @return [Hash<Symbol,Symbol>]
|
48
|
+
attr_reader :stack_method
|
49
|
+
|
50
|
+
# Create a new classifier with stacking method.
|
51
|
+
#
|
52
|
+
# @param estimators [Hash<Symbol,Classifier>] The base classifiers for extracting meta features.
|
53
|
+
# @param meta_estimator [Classifier/Nil] The meta classifier that predicts class label.
|
54
|
+
# If nil is given, LogisticRegression is used.
|
55
|
+
# @param n_splits [Integer] The number of folds for cross validation with stratified k-fold on meta feature extraction in training phase.
|
56
|
+
# @param shuffle [Boolean] The flag indicating whether to shuffle the dataset on cross validation.
|
57
|
+
# @param stack_method [String] The method name of base classifier for using meta feature extraction.
|
58
|
+
# If 'auto' is given, it searches the callable method in the order 'predict_proba', 'decision_function', and 'predict'
|
59
|
+
# on each classifier.
|
60
|
+
# @param passthrough [Boolean] The flag indicating whether to concatenate the original features and meta features when training the meta classifier.
|
61
|
+
# @param random_seed [Integer/Nil] The seed value using to initialize the random generator on cross validation.
|
62
|
+
def initialize(estimators:, meta_estimator: nil, n_splits: 5, shuffle: true, stack_method: 'auto', passthrough: false,
               random_seed: nil)
  super()
  @estimators = estimators
  # LogisticRegression is the default meta classifier.
  @meta_estimator = meta_estimator
  @meta_estimator ||= ::Rumale::LinearModel::LogisticRegression.new
  # Without an explicit seed, use the value returned by Kernel#srand.
  seed = random_seed || srand
  @params = {
    n_splits: n_splits,
    shuffle: shuffle,
    stack_method: stack_method,
    passthrough: passthrough,
    random_seed: seed
  }
end
|
75
|
+
|
76
|
+
# Fit the model with given training data.
|
77
|
+
#
|
78
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
79
|
+
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
80
|
+
# @return [StackingClassifier] The learned classifier itself.
|
81
|
+
def fit(x, y)
  x = ::Rumale::Validation.check_convert_sample_array(x)
  y = ::Rumale::Validation.check_convert_label_array(y)
  ::Rumale::Validation.check_sample_size(x, y)

  n_samples, n_features = x.shape

  # All estimators are trained on encoded labels; #predict decodes them again.
  @encoder = ::Rumale::Preprocessing::LabelEncoder.new
  y_encoded = @encoder.fit_transform(y)
  @classes = Numo::NArray[*@encoder.classes]

  # The base classifiers kept on the instance are trained with all training data.
  @estimators.each_value { |estimator| estimator.fit(x, y_encoded) }

  # Decide which method each base classifier contributes and how many columns it emits.
  @stack_method = detect_stack_method
  @output_size = detect_output_size(n_features)

  # Meta features are filled fold by fold: each validation fold is scored by copies
  # of the base classifiers refitted on the remaining folds.
  z = Numo::DFloat.zeros(n_samples, @output_size.values.sum)
  splitter = ::Rumale::ModelSelection::StratifiedKFold.new(
    n_splits: @params[:n_splits], shuffle: @params[:shuffle], random_seed: @params[:random_seed]
  )
  splitter.split(x, y_encoded).each do |train_ids, valid_ids|
    x_train = x[train_ids, true]
    y_train = y_encoded[train_ids]
    x_valid = x[valid_ids, true]
    col = 0
    @estimators.each do |name, estimator|
      width = @output_size[name]
      # A single column is addressed by index, several columns by range.
      columns = width == 1 ? col : col...(col + width)
      # Marshal round-trip: work on a copy so the instance's estimator is not refitted.
      fold_estimator = Marshal.load(Marshal.dump(estimator))
      z[valid_ids, columns] = fold_estimator.fit(x_train, y_train).public_send(@stack_method[name], x_valid)
      col += width
    end
  end

  # Optionally append the original features to the meta features.
  z = Numo::NArray.hstack([z, x]) if @params[:passthrough]

  @meta_estimator.fit(z, y_encoded)

  self
end
|
129
|
+
|
130
|
+
# Calculate confidence scores for samples.
|
131
|
+
#
|
132
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
|
133
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) The confidence score per sample.
|
134
|
+
def decision_function(x)
  x = ::Rumale::Validation.check_convert_sample_array(x)

  # Score the meta features extracted by the base classifiers.
  @meta_estimator.decision_function(transform(x))
end
|
140
|
+
|
141
|
+
# Predict class labels for samples.
|
142
|
+
#
|
143
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
144
|
+
# @return [Numo::Int32] (shape: [n_samples]) The predicted class label per sample.
|
145
|
+
def predict(x)
  x = ::Rumale::Validation.check_convert_sample_array(x)

  # The meta classifier predicts encoded labels; map them back through the encoder.
  encoded_labels = @meta_estimator.predict(transform(x))
  Numo::Int32.cast(@encoder.inverse_transform(encoded_labels))
end
|
151
|
+
|
152
|
+
# Predict probability for samples.
|
153
|
+
#
|
154
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
|
155
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) The predicted probability of each class per sample.
|
156
|
+
def predict_proba(x)
  x = ::Rumale::Validation.check_convert_sample_array(x)

  # Probabilities come from the meta classifier applied to the meta features.
  @meta_estimator.predict_proba(transform(x))
end
|
162
|
+
|
163
|
+
# Transform the given data with the learned model.
|
164
|
+
#
|
165
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed with the learned model.
|
166
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The meta features for samples.
|
167
|
+
def transform(x)
  x = ::Rumale::Validation.check_convert_sample_array(x)

  z = Numo::DFloat.zeros(x.shape[0], @output_size.values.sum)
  col = 0
  # Each base classifier writes its output into its own run of columns.
  @estimators.each do |name, estimator|
    width = @output_size[name]
    # A single column is addressed by index, several columns by range.
    columns = width == 1 ? col : col...(col + width)
    z[true, columns] = estimator.public_send(@stack_method[name], x)
    col += width
  end
  # Optionally append the original features to the meta features.
  @params[:passthrough] ? Numo::NArray.hstack([z, x]) : z
end
|
183
|
+
|
184
|
+
# Fit the model with training data, and then transform them with the learned model.
|
185
|
+
#
|
186
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
187
|
+
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
188
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The meta features for training data.
|
189
|
+
def fit_transform(x, y)
  x = ::Rumale::Validation.check_convert_sample_array(x)
  y = ::Rumale::Validation.check_convert_label_array(y)
  ::Rumale::Validation.check_sample_size(x, y)

  # Train first, then extract the meta features of the same samples.
  fit(x, y)
  transform(x)
end
|
196
|
+
|
197
|
+
private
|
198
|
+
|
199
|
+
STACK_METHODS = %i[predict_proba decision_function predict].freeze
|
200
|
+
|
201
|
+
private_constant :STACK_METHODS
|
202
|
+
|
203
|
+
def detect_stack_method
  requested = @params[:stack_method]
  @estimators.each_with_object({}) do |(name, estimator), methods|
    methods[name] =
      if requested == 'auto'
        # First method in STACK_METHODS order that the classifier responds to (nil if none).
        STACK_METHODS.find { |candidate| estimator.respond_to?(candidate) }
      else
        # An explicit method name applies to every base classifier.
        requested.to_sym
      end
  end
end
|
214
|
+
|
215
|
+
def detect_output_size(n_features)
  # Probe each base classifier with two random samples to see how many columns its output has.
  probe = Numo::DFloat.new(2, n_features).rand
  @estimators.each_with_object({}) do |(name, estimator), sizes|
    output = estimator.public_send(@stack_method[name], probe)
    # A one-dimensional output counts as a single column.
    sizes[name] = output.ndim == 1 ? 1 : output.shape[1]
  end
end
|
222
|
+
end
|
223
|
+
end
|
224
|
+
end
|