svmkit 0.7.3 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -9
- data/.rspec +1 -0
- data/.travis.yml +4 -12
- data/LICENSE.txt +1 -1
- data/README.md +11 -13
- data/lib/svmkit.rb +3 -66
- data/svmkit.gemspec +12 -7
- metadata +16 -81
- data/.coveralls.yml +0 -1
- data/.rubocop.yml +0 -47
- data/.rubocop_todo.yml +0 -58
- data/HISTORY.md +0 -168
- data/lib/svmkit/base/base_estimator.rb +0 -13
- data/lib/svmkit/base/classifier.rb +0 -34
- data/lib/svmkit/base/cluster_analyzer.rb +0 -29
- data/lib/svmkit/base/evaluator.rb +0 -13
- data/lib/svmkit/base/regressor.rb +0 -34
- data/lib/svmkit/base/splitter.rb +0 -17
- data/lib/svmkit/base/transformer.rb +0 -18
- data/lib/svmkit/clustering/dbscan.rb +0 -127
- data/lib/svmkit/clustering/k_means.rb +0 -140
- data/lib/svmkit/dataset.rb +0 -109
- data/lib/svmkit/decomposition/nmf.rb +0 -147
- data/lib/svmkit/decomposition/pca.rb +0 -150
- data/lib/svmkit/ensemble/ada_boost_classifier.rb +0 -198
- data/lib/svmkit/ensemble/ada_boost_regressor.rb +0 -180
- data/lib/svmkit/ensemble/random_forest_classifier.rb +0 -182
- data/lib/svmkit/ensemble/random_forest_regressor.rb +0 -143
- data/lib/svmkit/evaluation_measure/accuracy.rb +0 -30
- data/lib/svmkit/evaluation_measure/f_score.rb +0 -51
- data/lib/svmkit/evaluation_measure/log_loss.rb +0 -46
- data/lib/svmkit/evaluation_measure/mean_absolute_error.rb +0 -30
- data/lib/svmkit/evaluation_measure/mean_squared_error.rb +0 -30
- data/lib/svmkit/evaluation_measure/normalized_mutual_information.rb +0 -63
- data/lib/svmkit/evaluation_measure/precision.rb +0 -51
- data/lib/svmkit/evaluation_measure/precision_recall.rb +0 -91
- data/lib/svmkit/evaluation_measure/purity.rb +0 -41
- data/lib/svmkit/evaluation_measure/r2_score.rb +0 -44
- data/lib/svmkit/evaluation_measure/recall.rb +0 -51
- data/lib/svmkit/kernel_approximation/rbf.rb +0 -136
- data/lib/svmkit/kernel_machine/kernel_svc.rb +0 -194
- data/lib/svmkit/linear_model/lasso.rb +0 -138
- data/lib/svmkit/linear_model/linear_regression.rb +0 -112
- data/lib/svmkit/linear_model/logistic_regression.rb +0 -161
- data/lib/svmkit/linear_model/ridge.rb +0 -112
- data/lib/svmkit/linear_model/sgd_linear_estimator.rb +0 -89
- data/lib/svmkit/linear_model/svc.rb +0 -184
- data/lib/svmkit/linear_model/svr.rb +0 -123
- data/lib/svmkit/model_selection/cross_validation.rb +0 -121
- data/lib/svmkit/model_selection/grid_search_cv.rb +0 -247
- data/lib/svmkit/model_selection/k_fold.rb +0 -77
- data/lib/svmkit/model_selection/stratified_k_fold.rb +0 -95
- data/lib/svmkit/multiclass/one_vs_rest_classifier.rb +0 -101
- data/lib/svmkit/naive_bayes/naive_bayes.rb +0 -316
- data/lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb +0 -112
- data/lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb +0 -94
- data/lib/svmkit/optimizer/nadam.rb +0 -90
- data/lib/svmkit/optimizer/rmsprop.rb +0 -69
- data/lib/svmkit/optimizer/sgd.rb +0 -65
- data/lib/svmkit/optimizer/yellow_fin.rb +0 -144
- data/lib/svmkit/pairwise_metric.rb +0 -91
- data/lib/svmkit/pipeline/pipeline.rb +0 -197
- data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +0 -262
- data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb +0 -194
- data/lib/svmkit/preprocessing/l2_normalizer.rb +0 -63
- data/lib/svmkit/preprocessing/label_encoder.rb +0 -95
- data/lib/svmkit/preprocessing/min_max_scaler.rb +0 -93
- data/lib/svmkit/preprocessing/one_hot_encoder.rb +0 -99
- data/lib/svmkit/preprocessing/standard_scaler.rb +0 -87
- data/lib/svmkit/probabilistic_output.rb +0 -112
- data/lib/svmkit/tree/decision_tree_classifier.rb +0 -276
- data/lib/svmkit/tree/decision_tree_regressor.rb +0 -251
- data/lib/svmkit/tree/node.rb +0 -70
- data/lib/svmkit/utils.rb +0 -22
- data/lib/svmkit/validation.rb +0 -79
- data/lib/svmkit/values.rb +0 -13
- data/lib/svmkit/version.rb +0 -7
@@ -1,198 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'svmkit/validation'
|
4
|
-
require 'svmkit/values'
|
5
|
-
require 'svmkit/utils'
|
6
|
-
require 'svmkit/base/base_estimator'
|
7
|
-
require 'svmkit/base/classifier'
|
8
|
-
require 'svmkit/tree/decision_tree_classifier'
|
9
|
-
|
10
|
-
module SVMKit
  module Ensemble
    # AdaBoostClassifier is a class that implements AdaBoost (SAMME.R) for classification.
    # This class uses decision tree for a weak learner.
    #
    # @example
    #   estimator =
    #     SVMKit::Ensemble::AdaBoostClassifier.new(
    #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_labels)
    #   results = estimator.predict(testing_samples)
    #
    # *Reference*
    # - J. Zhu, S. Rosset, H. Zou, and T. Hastie, "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
    class AdaBoostClassifier
      include Base::BaseEstimator
      include Base::Classifier
      include Validation

      # Return the set of estimators.
      # @return [Array<DecisionTreeClassifier>]
      attr_reader :estimators

      # Return the class labels.
      # @return [Numo::Int32] (size: n_classes)
      attr_reader :classes

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the random generator used for resampling the training data and for seeding each weak learner.
      # @return [Random]
      attr_reader :rng

      # Create a new classifier with AdaBoost.
      #
      # @param n_estimators [Integer] The maximum number of decision trees used as weak learners.
      #   Boosting may stop earlier (see #fit).
      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, decision tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
      #   If nil is given, number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching optimal split point.
      #   If nil is given, split process considers all features.
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      #   If nil is given, the value returned by Kernel#srand is used.
      def initialize(n_estimators: 50,
                     criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
                     max_features: nil, random_seed: nil)
        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                                          max_features: max_features, random_seed: random_seed)
        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
        check_params_string(criterion: criterion)
        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
                              max_features: max_features)
        @params = {}
        @params[:n_estimators] = n_estimators
        @params[:criterion] = criterion
        @params[:max_depth] = max_depth
        @params[:max_leaf_nodes] = max_leaf_nodes
        @params[:min_samples_leaf] = min_samples_leaf
        @params[:max_features] = max_features
        # srand is only called when no seed was supplied.
        @params[:random_seed] = random_seed || srand
        @estimators = nil
        @classes = nil
        @feature_importances = nil
        @rng = Random.new(@params[:random_seed])
      end

      # Fit the model with given training data.
      #
      # Boosting stops early when a resampled training set does not contain every class,
      # when a weak learner classifies all samples correctly, or when the observation weights vanish.
      # Note that this method overwrites the :max_features parameter with the value clipped to [1, n_features].
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
      # @return [AdaBoostClassifier] The learned classifier itself.
      def fit(x, y) # rubocop:disable Metrics/AbcSize
        check_sample_array(x)
        check_label_array(y)
        check_sample_label_size(x, y)
        # Initialize some variables.
        n_samples, n_features = x.shape
        @estimators = []
        @feature_importances = Numo::DFloat.zeros(n_features)
        @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
        @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
        n_classes = @classes.shape[0]
        # Boosting.
        classes_arr = @classes.to_a
        # Label coding of SAMME.R: 1 for the true class and -1 / (n_classes - 1) for the others.
        y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
        n_samples.times { |n| y_codes[n, classes_arr.index(y[n])] = 1.0 }
        observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
        @params[:n_estimators].times do
          # Fit classifier on samples drawn according to the current observation weights.
          ids = SVMKit::Utils.choice_ids(n_samples, observation_weights, @rng)
          break if y[ids].to_a.uniq.size != n_classes
          tree = Tree::DecisionTreeClassifier.new(
            criterion: @params[:criterion], max_depth: @params[:max_depth],
            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
          )
          tree.fit(x[ids, true], y[ids])
          # Calculate estimator error. Probabilities are clipped so that their logarithm stays finite.
          proba = tree.predict_proba(x).clip(1.0e-15, nil)
          predicted = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
          misclassified = predicted.ne(y)
          error = (observation_weights * misclassified).sum / observation_weights.sum
          # Store model.
          @estimators.push(tree)
          @feature_importances += tree.feature_importances
          break if error.zero?
          # Update observation weights.
          log_proba = Numo::NMath.log(proba)
          observation_weights *= Numo::NMath.exp(-1.0 * (n_classes - 1).fdiv(n_classes) * (y_codes * log_proba).sum(1))
          observation_weights = observation_weights.clip(1.0e-15, nil)
          sum_observation_weights = observation_weights.sum
          break if sum_observation_weights.zero?
          observation_weights /= sum_observation_weights
        end
        # Normalize only when some importance was accumulated; dividing by a zero sum
        # (no tree stored, or no tree made a split) would turn every entry into NaN.
        total_importance = @feature_importances.sum
        @feature_importances /= total_importance if total_importance.positive?
        self
      end

      # Calculate confidence scores for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
      #   All scores are zero when fitting stored no weak learner.
      def decision_function(x)
        check_sample_array(x)
        n_samples, = x.shape
        n_classes = @classes.size
        sum_probs = Numo::DFloat.zeros(n_samples, n_classes)
        # Avoid 0 / 0 (NaN scores) when boosting stopped before any tree was stored.
        return sum_probs if @estimators.empty?
        @estimators.each do |tree|
          log_proba = Numo::NMath.log(tree.predict_proba(x).clip(1.0e-15, nil))
          sum_probs += (n_classes - 1) * (log_proba - 1.fdiv(n_classes) * Numo::DFloat[log_proba.sum(1)].transpose)
        end
        sum_probs / @estimators.size
      end

      # Predict class labels for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
      def predict(x)
        check_sample_array(x)
        n_samples, = x.shape
        scores = decision_function(x)
        Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[scores[n, true].max_index] })
      end

      # Predict probability for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
      def predict_proba(x)
        check_sample_array(x)
        n_classes = @classes.size
        # Exponentiate the scaled scores and normalize each row so that it sums to one.
        probs = Numo::NMath.exp(1.fdiv(n_classes - 1) * decision_function(x))
        sum_probs = probs.sum(1)
        probs /= Numo::DFloat[sum_probs].transpose
        probs
      end

      # Dump marshal data.
      # @return [Hash] The marshal data about AdaBoostClassifier.
      def marshal_dump
        { params: @params,
          estimators: @estimators,
          classes: @classes,
          feature_importances: @feature_importances,
          rng: @rng }
      end

      # Load marshal data.
      # @param obj [Hash] The hash produced by #marshal_dump.
      # @return [nil]
      def marshal_load(obj)
        @params = obj[:params]
        @estimators = obj[:estimators]
        @classes = obj[:classes]
        @feature_importances = obj[:feature_importances]
        @rng = obj[:rng]
        nil
      end
    end
  end
end
|
@@ -1,180 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'svmkit/validation'
|
4
|
-
require 'svmkit/values'
|
5
|
-
require 'svmkit/base/base_estimator'
|
6
|
-
require 'svmkit/base/regressor'
|
7
|
-
require 'svmkit/tree/decision_tree_regressor'
|
8
|
-
|
9
|
-
module SVMKit
  module Ensemble
    # AdaBoostRegressor is a class that implements AdaBoost.RT for regression.
    # This class uses decision tree for a weak learner.
    #
    # @example
    #   estimator =
    #     SVMKit::Ensemble::AdaBoostRegressor.new(
    #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_values)
    #   results = estimator.predict(testing_samples)
    #
    # *Reference*
    # - D. L. Shrestha and D. P. Solomatine, "Experiments with AdaBoost.RT, an Improved Boosting Scheme for Regression," Neural Computation 18 (7), pp. 1678--1710, 2006.
    #
    class AdaBoostRegressor
      include Base::BaseEstimator
      include Base::Regressor
      include Validation

      # Return the set of estimators.
      # @return [Array<DecisionTreeRegressor>]
      attr_reader :estimators

      # Return the weight for each weak learner.
      # @return [Numo::DFloat] (size: n_estimators)
      attr_reader :estimator_weights

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the random generator used for resampling the training data and for seeding each weak learner.
      # @return [Random]
      attr_reader :rng

      # Create a new regressor with AdaBoost.RT.
      #
      # @param n_estimators [Integer] The maximum number of decision trees used as weak learners.
      #   Boosting may stop earlier (see #fit).
      # @param threshold [Float] The threshold on the relative error for delimiting correct and incorrect predictions.
      #   That is constrained to [0, 1]
      # @param exponent [Float] The exponent for the weight of each weak learner.
      # @param criterion [String] The function to evaluate splitting point.
      #   The value is passed to Tree::DecisionTreeRegressor as is; see that class for the supported criteria.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, decision tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
      #   If nil is given, number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching optimal split point.
      #   If nil is given, split process considers all features.
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      #   If nil is given, the value returned by Kernel#srand is used.
      def initialize(n_estimators: 10, threshold: 0.2, exponent: 1.0,
                     criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
                     max_features: nil, random_seed: nil)
        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                                          max_features: max_features, random_seed: random_seed)
        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
        check_params_float(threshold: threshold, exponent: exponent)
        check_params_string(criterion: criterion)
        check_params_positive(n_estimators: n_estimators, threshold: threshold, exponent: exponent,
                              max_depth: max_depth,
                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
                              max_features: max_features)
        @params = {}
        @params[:n_estimators] = n_estimators
        @params[:threshold] = threshold
        @params[:exponent] = exponent
        @params[:criterion] = criterion
        @params[:max_depth] = max_depth
        @params[:max_leaf_nodes] = max_leaf_nodes
        @params[:min_samples_leaf] = min_samples_leaf
        @params[:max_features] = max_features
        # srand is only called when no seed was supplied.
        @params[:random_seed] = random_seed || srand
        @estimators = nil
        @estimator_weights = nil
        @feature_importances = nil
        @rng = Random.new(@params[:random_seed])
      end

      # Fit the model with given training data.
      #
      # The error of a weak learner is the total observation weight of the samples whose relative error
      # |(prediction - target) / target| exceeds the threshold. Because of the division by the target,
      # target values equal to zero produce infinite or NaN relative errors.
      # Note that this method overwrites the :max_features parameter with the value clipped to [1, n_features].
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
      # @return [AdaBoostRegressor] The learned regressor itself.
      def fit(x, y) # rubocop:disable Metrics/AbcSize
        check_sample_array(x)
        check_tvalue_array(y)
        check_sample_tvalue_size(x, y)
        # Check target values
        raise ArgumentError, 'Expect target value vector to be 1-D arrray' unless y.shape.size == 1
        # Initialize some variables.
        n_samples, n_features = x.shape
        @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
        observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
        @estimators = []
        @estimator_weights = []
        @feature_importances = Numo::DFloat.zeros(n_features)
        # Boosting.
        @params[:n_estimators].times do
          # Fit weak learner on samples drawn according to the current observation weights.
          ids = SVMKit::Utils.choice_ids(n_samples, observation_weights, @rng)
          tree = Tree::DecisionTreeRegressor.new(
            criterion: @params[:criterion], max_depth: @params[:max_depth],
            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
          )
          tree.fit(x[ids, true], y[ids])
          predicted = tree.predict(x)
          # Calculate errors.
          abs_err = ((predicted - y) / y).abs
          err = observation_weights[abs_err.gt(@params[:threshold])].sum
          if err <= 0.0
            # A learner without any error above the threshold would get an infinite weight (log(1 / 0)),
            # so boosting stops here. When it is the very first learner, keep it with unit weight;
            # otherwise the model would hold no estimator at all and #predict would return NaN.
            if @estimators.empty?
              @estimators.push(tree)
              @estimator_weights.push(1.0)
              @feature_importances += tree.feature_importances
            end
            break
          end
          # Calculate weight.
          beta = err**@params[:exponent]
          weight = Math.log(1.fdiv(beta))
          # Store model.
          @estimators.push(tree)
          @estimator_weights.push(weight)
          @feature_importances += weight * tree.feature_importances
          # Update observation weights: shrink the weight of samples predicted within the threshold.
          update = Numo::DFloat.ones(n_samples)
          update[abs_err.le(@params[:threshold])] = beta
          observation_weights *= update
          observation_weights = observation_weights.clip(1.0e-15, nil)
          sum_observation_weights = observation_weights.sum
          break if sum_observation_weights.zero?
          observation_weights /= sum_observation_weights
        end
        @estimator_weights = Numo::DFloat.asarray(@estimator_weights)
        # Normalize only by a positive total weight; dividing by zero would turn every entry into NaN.
        total_weight = @estimator_weights.sum
        @feature_importances /= total_weight if total_weight.positive?
        self
      end

      # Predict values for samples.
      #
      # The result is the average of the weak learners' predictions weighted by the estimator weights.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
      # @return [Numo::DFloat] (shape: [n_samples]) Predicted value per sample.
      def predict(x)
        check_sample_array(x)
        n_samples, = x.shape
        predictions = Numo::DFloat.zeros(n_samples)
        @estimators.each_with_index do |tree, t|
          predictions += @estimator_weights[t] * tree.predict(x)
        end
        sum_weight = @estimator_weights.sum
        predictions / sum_weight
      end

      # Dump marshal data.
      # @return [Hash] The marshal data about AdaBoostRegressor.
      def marshal_dump
        { params: @params,
          estimators: @estimators,
          estimator_weights: @estimator_weights,
          feature_importances: @feature_importances,
          rng: @rng }
      end

      # Load marshal data.
      # @param obj [Hash] The hash produced by #marshal_dump.
      # @return [nil]
      def marshal_load(obj)
        @params = obj[:params]
        @estimators = obj[:estimators]
        @estimator_weights = obj[:estimator_weights]
        @feature_importances = obj[:feature_importances]
        @rng = obj[:rng]
        nil
      end
    end
  end
end
|
@@ -1,182 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'svmkit/validation'
|
4
|
-
require 'svmkit/values'
|
5
|
-
require 'svmkit/base/base_estimator'
|
6
|
-
require 'svmkit/base/classifier'
|
7
|
-
require 'svmkit/tree/decision_tree_classifier'
|
8
|
-
|
9
|
-
module SVMKit
  # This module consists of the classes that implement ensemble-based methods.
  module Ensemble
    # RandomForestClassifier is a class that implements random forest for classification.
    #
    # @example
    #   estimator =
    #     SVMKit::Ensemble::RandomForestClassifier.new(
    #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_labels)
    #   results = estimator.predict(testing_samples)
    #
    class RandomForestClassifier
      include Base::BaseEstimator
      include Base::Classifier
      include Validation

      # Return the set of estimators.
      # @return [Array<DecisionTreeClassifier>]
      attr_reader :estimators

      # Return the class labels.
      # @return [Numo::Int32] (size: n_classes)
      attr_reader :classes

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the random generator used for bootstrap sampling and for seeding each tree.
      # @return [Random]
      attr_reader :rng

      # Create a new classifier with random forest.
      #
      # @param n_estimators [Integer] The number of decision trees for constructing random forest.
      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, decision tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
      #   If nil is given, number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching optimal split point.
      #   If nil is given, split process considers the square root of the number of features (see #fit).
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      #   If nil is given, the value returned by Kernel#srand is used.
      def initialize(n_estimators: 10,
                     criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
                     max_features: nil, random_seed: nil)
        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                                          max_features: max_features, random_seed: random_seed)
        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
        check_params_string(criterion: criterion)
        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
                              max_features: max_features)
        @params = {}
        @params[:n_estimators] = n_estimators
        @params[:criterion] = criterion
        @params[:max_depth] = max_depth
        @params[:max_leaf_nodes] = max_leaf_nodes
        @params[:min_samples_leaf] = min_samples_leaf
        @params[:max_features] = max_features
        # srand is only called when no seed was supplied.
        @params[:random_seed] = random_seed || srand
        @estimators = nil
        @classes = nil
        @feature_importances = nil
        @rng = Random.new(@params[:random_seed])
      end

      # Fit the model with given training data.
      #
      # Each tree is trained on a bootstrap sample (n_samples indices drawn with replacement).
      # Note that this method overwrites the :max_features parameter: a non-Integer value is replaced by
      # the integer part of sqrt(n_features), and the result is clipped to [1, n_features].
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
      # @return [RandomForestClassifier] The learned classifier itself.
      def fit(x, y)
        check_sample_array(x)
        check_label_array(y)
        check_sample_label_size(x, y)
        # Initialize some variables.
        n_samples, n_features = x.shape
        @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
        @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
        @feature_importances = Numo::DFloat.zeros(n_features)
        # Construct forest.
        @estimators = Array.new(@params[:n_estimators]) do
          tree = Tree::DecisionTreeClassifier.new(
            criterion: @params[:criterion], max_depth: @params[:max_depth],
            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
          )
          bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
          tree.fit(x[bootstrap_ids, true], y[bootstrap_ids])
          @feature_importances += tree.feature_importances
          tree
        end
        # Normalize only when some importance was accumulated; dividing by a zero sum
        # (no tree made a split) would turn every entry into NaN.
        total_importance = @feature_importances.sum
        @feature_importances /= total_importance if total_importance.positive?
        self
      end

      # Predict class labels for samples.
      #
      # Every tree casts one vote per sample and the class with the most votes is returned.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
      def predict(x)
        check_sample_array(x)
        n_samples, = x.shape
        n_classes = @classes.size
        classes_arr = @classes.to_a
        ballot_box = Numo::DFloat.zeros(n_samples, n_classes)
        @estimators.each do |tree|
          predicted = tree.predict(x)
          n_samples.times do |n|
            class_id = classes_arr.index(predicted[n])
            # Votes for a label unknown to the forest are ignored.
            ballot_box[n, class_id] += 1.0 unless class_id.nil?
          end
        end
        # asarray takes the array as a single argument, so no n_samples-long argument list is built.
        Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[ballot_box[n, true].max_index] })
      end

      # Predict probability for samples.
      #
      # The class probabilities of all trees are summed up and each row is normalized to sum to one.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
      def predict_proba(x)
        check_sample_array(x)
        n_samples, = x.shape
        n_classes = @classes.size
        classes_arr = @classes.to_a
        ballot_box = Numo::DFloat.zeros(n_samples, n_classes)
        @estimators.each do |tree|
          probs = tree.predict_proba(x)
          # A tree may have seen only a subset of the classes, so map its columns onto the forest's columns.
          tree.classes.size.times do |n|
            class_id = classes_arr.index(tree.classes[n])
            ballot_box[true, class_id] += probs[true, n] unless class_id.nil?
          end
        end
        (ballot_box.transpose / ballot_box.sum(axis: 1)).transpose
      end

      # Return the index of the leaf that each sample reached.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
      def apply(x)
        check_sample_array(x)
        Numo::Int32[*@estimators.map { |tree| tree.apply(x) }].transpose
      end

      # Dump marshal data.
      # @return [Hash] The marshal data about RandomForestClassifier.
      def marshal_dump
        { params: @params,
          estimators: @estimators,
          classes: @classes,
          feature_importances: @feature_importances,
          rng: @rng }
      end

      # Load marshal data.
      # @param obj [Hash] The hash produced by #marshal_dump.
      # @return [nil]
      def marshal_load(obj)
        @params = obj[:params]
        @estimators = obj[:estimators]
        @classes = obj[:classes]
        @feature_importances = obj[:feature_importances]
        @rng = obj[:rng]
        nil
      end
    end
  end
end
|