svmkit 0.7.3 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -9
  3. data/.rspec +1 -0
  4. data/.travis.yml +4 -12
  5. data/LICENSE.txt +1 -1
  6. data/README.md +11 -13
  7. data/lib/svmkit.rb +3 -66
  8. data/svmkit.gemspec +12 -7
  9. metadata +16 -81
  10. data/.coveralls.yml +0 -1
  11. data/.rubocop.yml +0 -47
  12. data/.rubocop_todo.yml +0 -58
  13. data/HISTORY.md +0 -168
  14. data/lib/svmkit/base/base_estimator.rb +0 -13
  15. data/lib/svmkit/base/classifier.rb +0 -34
  16. data/lib/svmkit/base/cluster_analyzer.rb +0 -29
  17. data/lib/svmkit/base/evaluator.rb +0 -13
  18. data/lib/svmkit/base/regressor.rb +0 -34
  19. data/lib/svmkit/base/splitter.rb +0 -17
  20. data/lib/svmkit/base/transformer.rb +0 -18
  21. data/lib/svmkit/clustering/dbscan.rb +0 -127
  22. data/lib/svmkit/clustering/k_means.rb +0 -140
  23. data/lib/svmkit/dataset.rb +0 -109
  24. data/lib/svmkit/decomposition/nmf.rb +0 -147
  25. data/lib/svmkit/decomposition/pca.rb +0 -150
  26. data/lib/svmkit/ensemble/ada_boost_classifier.rb +0 -198
  27. data/lib/svmkit/ensemble/ada_boost_regressor.rb +0 -180
  28. data/lib/svmkit/ensemble/random_forest_classifier.rb +0 -182
  29. data/lib/svmkit/ensemble/random_forest_regressor.rb +0 -143
  30. data/lib/svmkit/evaluation_measure/accuracy.rb +0 -30
  31. data/lib/svmkit/evaluation_measure/f_score.rb +0 -51
  32. data/lib/svmkit/evaluation_measure/log_loss.rb +0 -46
  33. data/lib/svmkit/evaluation_measure/mean_absolute_error.rb +0 -30
  34. data/lib/svmkit/evaluation_measure/mean_squared_error.rb +0 -30
  35. data/lib/svmkit/evaluation_measure/normalized_mutual_information.rb +0 -63
  36. data/lib/svmkit/evaluation_measure/precision.rb +0 -51
  37. data/lib/svmkit/evaluation_measure/precision_recall.rb +0 -91
  38. data/lib/svmkit/evaluation_measure/purity.rb +0 -41
  39. data/lib/svmkit/evaluation_measure/r2_score.rb +0 -44
  40. data/lib/svmkit/evaluation_measure/recall.rb +0 -51
  41. data/lib/svmkit/kernel_approximation/rbf.rb +0 -136
  42. data/lib/svmkit/kernel_machine/kernel_svc.rb +0 -194
  43. data/lib/svmkit/linear_model/lasso.rb +0 -138
  44. data/lib/svmkit/linear_model/linear_regression.rb +0 -112
  45. data/lib/svmkit/linear_model/logistic_regression.rb +0 -161
  46. data/lib/svmkit/linear_model/ridge.rb +0 -112
  47. data/lib/svmkit/linear_model/sgd_linear_estimator.rb +0 -89
  48. data/lib/svmkit/linear_model/svc.rb +0 -184
  49. data/lib/svmkit/linear_model/svr.rb +0 -123
  50. data/lib/svmkit/model_selection/cross_validation.rb +0 -121
  51. data/lib/svmkit/model_selection/grid_search_cv.rb +0 -247
  52. data/lib/svmkit/model_selection/k_fold.rb +0 -77
  53. data/lib/svmkit/model_selection/stratified_k_fold.rb +0 -95
  54. data/lib/svmkit/multiclass/one_vs_rest_classifier.rb +0 -101
  55. data/lib/svmkit/naive_bayes/naive_bayes.rb +0 -316
  56. data/lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb +0 -112
  57. data/lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb +0 -94
  58. data/lib/svmkit/optimizer/nadam.rb +0 -90
  59. data/lib/svmkit/optimizer/rmsprop.rb +0 -69
  60. data/lib/svmkit/optimizer/sgd.rb +0 -65
  61. data/lib/svmkit/optimizer/yellow_fin.rb +0 -144
  62. data/lib/svmkit/pairwise_metric.rb +0 -91
  63. data/lib/svmkit/pipeline/pipeline.rb +0 -197
  64. data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +0 -262
  65. data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb +0 -194
  66. data/lib/svmkit/preprocessing/l2_normalizer.rb +0 -63
  67. data/lib/svmkit/preprocessing/label_encoder.rb +0 -95
  68. data/lib/svmkit/preprocessing/min_max_scaler.rb +0 -93
  69. data/lib/svmkit/preprocessing/one_hot_encoder.rb +0 -99
  70. data/lib/svmkit/preprocessing/standard_scaler.rb +0 -87
  71. data/lib/svmkit/probabilistic_output.rb +0 -112
  72. data/lib/svmkit/tree/decision_tree_classifier.rb +0 -276
  73. data/lib/svmkit/tree/decision_tree_regressor.rb +0 -251
  74. data/lib/svmkit/tree/node.rb +0 -70
  75. data/lib/svmkit/utils.rb +0 -22
  76. data/lib/svmkit/validation.rb +0 -79
  77. data/lib/svmkit/values.rb +0 -13
  78. data/lib/svmkit/version.rb +0 -7
@@ -1,198 +0,0 @@
- # frozen_string_literal: true
-
- require 'svmkit/validation'
- require 'svmkit/values'
- require 'svmkit/utils'
- require 'svmkit/base/base_estimator'
- require 'svmkit/base/classifier'
- require 'svmkit/tree/decision_tree_classifier'
-
- module SVMKit
-   module Ensemble
-     # AdaBoostClassifier is a class that implements AdaBoost (SAMME.R) for classification.
-     # This class uses a decision tree as the weak learner.
-     #
-     # @example
-     #   estimator =
-     #     SVMKit::Ensemble::AdaBoostClassifier.new(
-     #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
-     #   estimator.fit(training_samples, training_labels)
-     #   results = estimator.predict(testing_samples)
-     #
-     # *Reference*
-     # - J. Zhu, S. Rosset, H. Zou, and T. Hastie, "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
-     class AdaBoostClassifier
-       include Base::BaseEstimator
-       include Base::Classifier
-       include Validation
-
-       # Return the set of estimators.
-       # @return [Array<DecisionTreeClassifier>]
-       attr_reader :estimators
-
-       # Return the class labels.
-       # @return [Numo::Int32] (size: n_classes)
-       attr_reader :classes
-
-       # Return the importance for each feature.
-       # @return [Numo::DFloat] (size: n_features)
-       attr_reader :feature_importances
-
-       # Return the random generator for random selection of feature index.
-       # @return [Random]
-       attr_reader :rng
-
-       # Create a new classifier with AdaBoost.
-       #
-       # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost classifier.
-       # @param criterion [String] The function to evaluate a splitting point. Supported criteria are 'gini' and 'entropy'.
-       # @param max_depth [Integer] The maximum depth of the tree.
-       #   If nil is given, the decision tree grows without concern for depth.
-       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
-       #   If nil is given, the number of leaves is not limited.
-       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
-       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
-       #   If nil is given, the split process considers all features.
-       # @param random_seed [Integer] The seed value used to initialize the random generator.
-       #   It is used to randomly determine the order of features when deciding a splitting point.
-       def initialize(n_estimators: 50,
-                      criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
-                      max_features: nil, random_seed: nil)
-         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                                           max_features: max_features, random_seed: random_seed)
-         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
-         check_params_string(criterion: criterion)
-         check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
-                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
-                               max_features: max_features)
-         @params = {}
-         @params[:n_estimators] = n_estimators
-         @params[:criterion] = criterion
-         @params[:max_depth] = max_depth
-         @params[:max_leaf_nodes] = max_leaf_nodes
-         @params[:min_samples_leaf] = min_samples_leaf
-         @params[:max_features] = max_features
-         @params[:random_seed] = random_seed
-         @params[:random_seed] ||= srand
-         @estimators = nil
-         @classes = nil
-         @feature_importances = nil
-         @rng = Random.new(@params[:random_seed])
-       end
-
-       # Fit the model with given training data.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
-       # @return [AdaBoostClassifier] The learned classifier itself.
-       def fit(x, y) # rubocop:disable Metrics/AbcSize
-         check_sample_array(x)
-         check_label_array(y)
-         check_sample_label_size(x, y)
-         ## Initialize some variables.
-         n_samples, n_features = x.shape
-         @estimators = []
-         @feature_importances = Numo::DFloat.zeros(n_features)
-         @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
-         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
-         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
-         n_classes = @classes.shape[0]
-         ## Boosting.
-         classes_arr = @classes.to_a
-         y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
-         n_samples.times { |n| y_codes[n, classes_arr.index(y[n])] = 1.0 }
-         observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
-         @params[:n_estimators].times do |_t|
-           # Fit classifier.
-           ids = SVMKit::Utils.choice_ids(n_samples, observation_weights, @rng)
-           break if y[ids].to_a.uniq.size != n_classes
-           tree = Tree::DecisionTreeClassifier.new(
-             criterion: @params[:criterion], max_depth: @params[:max_depth],
-             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
-             max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
-           )
-           tree.fit(x[ids, true], y[ids])
-           # Calculate estimator error.
-           proba = tree.predict_proba(x).clip(1.0e-15, nil)
-           p = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
-           inds = p.ne(y)
-           error = (observation_weights * inds).sum / observation_weights.sum
-           # Store model.
-           @estimators.push(tree)
-           @feature_importances += tree.feature_importances
-           break if error.zero?
-           # Update observation weights.
-           log_proba = Numo::NMath.log(proba)
-           observation_weights *= Numo::NMath.exp(-1.0 * (n_classes - 1).fdiv(n_classes) * (y_codes * log_proba).sum(1))
-           observation_weights = observation_weights.clip(1.0e-15, nil)
-           sum_observation_weights = observation_weights.sum
-           break if sum_observation_weights.zero?
-           observation_weights /= sum_observation_weights
-         end
-         @feature_importances /= @feature_importances.sum
-         self
-       end
-
-       # Calculate confidence scores for samples.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
-       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
-       def decision_function(x)
-         check_sample_array(x)
-         n_samples, = x.shape
-         n_classes = @classes.size
-         sum_probs = Numo::DFloat.zeros(n_samples, n_classes)
-         @estimators.each do |tree|
-           log_proba = Numo::NMath.log(tree.predict_proba(x).clip(1.0e-15, nil))
-           sum_probs += (n_classes - 1) * (log_proba - 1.fdiv(n_classes) * Numo::DFloat[log_proba.sum(1)].transpose)
-         end
-         sum_probs /= @estimators.size
-       end
-
-       # Predict class labels for samples.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
-       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
-       def predict(x)
-         check_sample_array(x)
-         n_samples, = x.shape
-         probs = decision_function(x)
-         Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
-       end
-
-       # Predict probability for samples.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
-       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
-       def predict_proba(x)
-         check_sample_array(x)
-         n_classes = @classes.size
-         probs = Numo::NMath.exp(1.fdiv(n_classes - 1) * decision_function(x))
-         sum_probs = probs.sum(1)
-         probs /= Numo::DFloat[sum_probs].transpose
-         probs
-       end
-
-       # Dump marshal data.
-       # @return [Hash] The marshal data about AdaBoostClassifier.
-       def marshal_dump
-         { params: @params,
-           estimators: @estimators,
-           classes: @classes,
-           feature_importances: @feature_importances,
-           rng: @rng }
-       end
-
-       # Load marshal data.
-       # @return [nil]
-       def marshal_load(obj)
-         @params = obj[:params]
-         @estimators = obj[:estimators]
-         @classes = obj[:classes]
-         @feature_importances = obj[:feature_importances]
-         @rng = obj[:rng]
-         nil
-       end
-     end
-   end
- end
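For orientation, the removed class implements the SAMME.R variant of AdaBoost: each round draws a weighted bootstrap sample, fits a depth-limited decision tree, and reweights observations using the exponentiated class-coded log-probabilities. A minimal usage sketch against the 0.7.x API; the toy Numo arrays below are illustrative, not from the package:

    require 'svmkit'

    # Toy two-class problem (hypothetical data).
    x = Numo::DFloat[[0.0, 0.1], [0.2, 0.0], [1.0, 1.1], [0.9, 1.2]]
    y = Numo::Int32[0, 0, 1, 1]

    estimator = SVMKit::Ensemble::AdaBoostClassifier.new(
      n_estimators: 10, criterion: 'gini', max_depth: 3, random_seed: 1)
    estimator.fit(x, y)
    estimator.predict(x)        # => Numo::Int32, shape [n_samples]
    estimator.predict_proba(x)  # => Numo::DFloat, shape [n_samples, n_classes]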
@@ -1,180 +0,0 @@
- # frozen_string_literal: true
-
- require 'svmkit/validation'
- require 'svmkit/values'
- require 'svmkit/base/base_estimator'
- require 'svmkit/base/regressor'
- require 'svmkit/tree/decision_tree_regressor'
-
- module SVMKit
-   module Ensemble
-     # AdaBoostRegressor is a class that implements AdaBoost.RT for regression.
-     # This class uses a decision tree as the weak learner.
-     #
-     # @example
-     #   estimator =
-     #     SVMKit::Ensemble::AdaBoostRegressor.new(
-     #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
-     #   estimator.fit(training_samples, training_values)
-     #   results = estimator.predict(testing_samples)
-     #
-     # *Reference*
-     # - D. L. Shrestha and D. P. Solomatine, "Experiments with AdaBoost.RT, an Improved Boosting Scheme for Regression," Neural Computation 18 (7), pp. 1678--1710, 2006.
-     #
-     class AdaBoostRegressor
-       include Base::BaseEstimator
-       include Base::Regressor
-       include Validation
-
-       # Return the set of estimators.
-       # @return [Array<DecisionTreeRegressor>]
-       attr_reader :estimators
-
-       # Return the weight for each weak learner.
-       # @return [Numo::DFloat] (size: n_estimators)
-       attr_reader :estimator_weights
-
-       # Return the importance for each feature.
-       # @return [Numo::DFloat] (size: n_features)
-       attr_reader :feature_importances
-
-       # Return the random generator for random selection of feature index.
-       # @return [Random]
-       attr_reader :rng
-
-       # Create a new regressor with AdaBoost.
-       #
-       # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost regressor.
-       # @param threshold [Float] The threshold for delimiting correct and incorrect predictions. It is constrained to the interval [0, 1].
-       # @param exponent [Float] The exponent for the weight of each weak learner.
-       # @param criterion [String] The function to evaluate a splitting point. Supported criteria are 'mse' and 'mae'.
-       # @param max_depth [Integer] The maximum depth of the tree.
-       #   If nil is given, the decision tree grows without concern for depth.
-       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
-       #   If nil is given, the number of leaves is not limited.
-       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
-       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
-       #   If nil is given, the split process considers all features.
-       # @param random_seed [Integer] The seed value used to initialize the random generator.
-       #   It is used to randomly determine the order of features when deciding a splitting point.
-       def initialize(n_estimators: 10, threshold: 0.2, exponent: 1.0,
-                      criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
-                      max_features: nil, random_seed: nil)
-         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                                           max_features: max_features, random_seed: random_seed)
-         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
-         check_params_float(threshold: threshold, exponent: exponent)
-         check_params_string(criterion: criterion)
-         check_params_positive(n_estimators: n_estimators, threshold: threshold, exponent: exponent,
-                               max_depth: max_depth,
-                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
-                               max_features: max_features)
-         @params = {}
-         @params[:n_estimators] = n_estimators
-         @params[:threshold] = threshold
-         @params[:exponent] = exponent
-         @params[:criterion] = criterion
-         @params[:max_depth] = max_depth
-         @params[:max_leaf_nodes] = max_leaf_nodes
-         @params[:min_samples_leaf] = min_samples_leaf
-         @params[:max_features] = max_features
-         @params[:random_seed] = random_seed
-         @params[:random_seed] ||= srand
-         @estimators = nil
-         @feature_importances = nil
-         @rng = Random.new(@params[:random_seed])
-       end
-
-       # Fit the model with given training data.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-       # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
-       # @return [AdaBoostRegressor] The learned regressor itself.
-       def fit(x, y) # rubocop:disable Metrics/AbcSize
-         check_sample_array(x)
-         check_tvalue_array(y)
-         check_sample_tvalue_size(x, y)
-         # Check target values
-         raise ArgumentError, 'Expect target value vector to be 1-D array' unless y.shape.size == 1
-         # Initialize some variables.
-         n_samples, n_features = x.shape
-         @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
-         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
-         observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
-         @estimators = []
-         @estimator_weights = []
-         @feature_importances = Numo::DFloat.zeros(n_features)
-         # Construct boosted ensemble.
-         @params[:n_estimators].times do |_t|
-           # Fit weak learner.
-           ids = SVMKit::Utils.choice_ids(n_samples, observation_weights, @rng)
-           tree = Tree::DecisionTreeRegressor.new(
-             criterion: @params[:criterion], max_depth: @params[:max_depth],
-             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
-             max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
-           )
-           tree.fit(x[ids, true], y[ids])
-           p = tree.predict(x)
-           # Calculate errors.
-           abs_err = ((p - y) / y).abs
-           err = observation_weights[abs_err.gt(@params[:threshold])].sum
-           break if err <= 0.0
-           # Calculate weight.
-           beta = err**@params[:exponent]
-           weight = Math.log(1.fdiv(beta))
-           # Store model.
-           @estimators.push(tree)
-           @estimator_weights.push(weight)
-           @feature_importances += weight * tree.feature_importances
-           # Update observation weights.
-           update = Numo::DFloat.ones(n_samples)
-           update[abs_err.le(@params[:threshold])] = beta
-           observation_weights *= update
-           observation_weights = observation_weights.clip(1.0e-15, nil)
-           sum_observation_weights = observation_weights.sum
-           break if sum_observation_weights.zero?
-           observation_weights /= sum_observation_weights
-         end
-         @estimator_weights = Numo::DFloat.asarray(@estimator_weights)
-         @feature_importances /= @estimator_weights.sum
-         self
-       end
-
-       # Predict values for samples.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
-       # @return [Numo::DFloat] (shape: [n_samples]) Predicted value per sample.
-       def predict(x)
-         check_sample_array(x)
-         n_samples, = x.shape
-         predictions = Numo::DFloat.zeros(n_samples)
-         @estimators.size.times do |t|
-           predictions += @estimator_weights[t] * @estimators[t].predict(x)
-         end
-         sum_weight = @estimator_weights.sum
-         predictions / sum_weight
-       end
-
-       # Dump marshal data.
-       # @return [Hash] The marshal data about AdaBoostRegressor.
-       def marshal_dump
-         { params: @params,
-           estimators: @estimators,
-           estimator_weights: @estimator_weights,
-           feature_importances: @feature_importances,
-           rng: @rng }
-       end
-
-       # Load marshal data.
-       # @return [nil]
-       def marshal_load(obj)
-         @params = obj[:params]
-         @estimators = obj[:estimators]
-         @estimator_weights = obj[:estimator_weights]
-         @feature_importances = obj[:feature_importances]
-         @rng = obj[:rng]
-         nil
-       end
-     end
-   end
- end
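The removed regressor follows AdaBoost.RT: predictions whose relative error |(p - y) / y| exceeds the threshold count as incorrect, so target vectors containing zeros would make the relative error undefined. A minimal sketch against the 0.7.x API with hypothetical data; the Marshal round-trip relies on the marshal_dump/marshal_load pair defined above:

    require 'svmkit'

    # Hypothetical 1-D regression targets, kept non-zero (see relative-error note above).
    x = Numo::DFloat.new(20, 2).rand
    y = x.sum(1) + 1.0

    estimator = SVMKit::Ensemble::AdaBoostRegressor.new(
      n_estimators: 10, threshold: 0.2, exponent: 1.0, max_depth: 3, random_seed: 1)
    estimator.fit(x, y)
    estimator.predict(x)                              # => Numo::DFloat, shape [n_samples]
    restored = Marshal.load(Marshal.dump(estimator))  # persistence via marshal_dump/_load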
@@ -1,182 +0,0 @@
- # frozen_string_literal: true
-
- require 'svmkit/validation'
- require 'svmkit/values'
- require 'svmkit/base/base_estimator'
- require 'svmkit/base/classifier'
- require 'svmkit/tree/decision_tree_classifier'
-
- module SVMKit
-   # This module consists of the classes that implement ensemble-based methods.
-   module Ensemble
-     # RandomForestClassifier is a class that implements random forest for classification.
-     #
-     # @example
-     #   estimator =
-     #     SVMKit::Ensemble::RandomForestClassifier.new(
-     #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
-     #   estimator.fit(training_samples, training_labels)
-     #   results = estimator.predict(testing_samples)
-     #
-     class RandomForestClassifier
-       include Base::BaseEstimator
-       include Base::Classifier
-       include Validation
-
-       # Return the set of estimators.
-       # @return [Array<DecisionTreeClassifier>]
-       attr_reader :estimators
-
-       # Return the class labels.
-       # @return [Numo::Int32] (size: n_classes)
-       attr_reader :classes
-
-       # Return the importance for each feature.
-       # @return [Numo::DFloat] (size: n_features)
-       attr_reader :feature_importances
-
-       # Return the random generator for random selection of feature index.
-       # @return [Random]
-       attr_reader :rng
-
-       # Create a new classifier with random forest.
-       #
-       # @param n_estimators [Integer] The number of decision trees for constructing the random forest.
-       # @param criterion [String] The function to evaluate a splitting point. Supported criteria are 'gini' and 'entropy'.
-       # @param max_depth [Integer] The maximum depth of the tree.
-       #   If nil is given, the decision tree grows without concern for depth.
-       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
-       #   If nil is given, the number of leaves is not limited.
-       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
-       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
-       #   If nil is given, the split process considers all features.
-       # @param random_seed [Integer] The seed value used to initialize the random generator.
-       #   It is used to randomly determine the order of features when deciding a splitting point.
-       def initialize(n_estimators: 10,
-                      criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
-                      max_features: nil, random_seed: nil)
-         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                                           max_features: max_features, random_seed: random_seed)
-         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
-         check_params_string(criterion: criterion)
-         check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
-                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
-                               max_features: max_features)
-         @params = {}
-         @params[:n_estimators] = n_estimators
-         @params[:criterion] = criterion
-         @params[:max_depth] = max_depth
-         @params[:max_leaf_nodes] = max_leaf_nodes
-         @params[:min_samples_leaf] = min_samples_leaf
-         @params[:max_features] = max_features
-         @params[:random_seed] = random_seed
-         @params[:random_seed] ||= srand
-         @estimators = nil
-         @classes = nil
-         @feature_importances = nil
-         @rng = Random.new(@params[:random_seed])
-       end
-
-       # Fit the model with given training data.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
-       # @return [RandomForestClassifier] The learned classifier itself.
-       def fit(x, y)
-         check_sample_array(x)
-         check_label_array(y)
-         check_sample_label_size(x, y)
-         # Initialize some variables.
-         n_samples, n_features = x.shape
-         @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
-         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
-         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
-         @feature_importances = Numo::DFloat.zeros(n_features)
-         # Construct forest.
-         @estimators = Array.new(@params[:n_estimators]) do
-           tree = Tree::DecisionTreeClassifier.new(
-             criterion: @params[:criterion], max_depth: @params[:max_depth],
-             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
-             max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
-           )
-           bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
-           tree.fit(x[bootstrap_ids, true], y[bootstrap_ids])
-           @feature_importances += tree.feature_importances
-           tree
-         end
-         @feature_importances /= @feature_importances.sum
-         self
-       end
-
-       # Predict class labels for samples.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
-       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
-       def predict(x)
-         check_sample_array(x)
-         n_samples, = x.shape
-         n_classes = @classes.size
-         classes_arr = @classes.to_a
-         ballot_box = Numo::DFloat.zeros(n_samples, n_classes)
-         @estimators.each do |tree|
-           predicted = tree.predict(x)
-           n_samples.times do |n|
-             class_id = classes_arr.index(predicted[n])
-             ballot_box[n, class_id] += 1.0 unless class_id.nil?
-           end
-         end
-         Numo::Int32[*Array.new(n_samples) { |n| @classes[ballot_box[n, true].max_index] }]
-       end
-
-       # Predict probability for samples.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
-       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
-       def predict_proba(x)
-         check_sample_array(x)
-         n_samples, = x.shape
-         n_classes = @classes.size
-         classes_arr = @classes.to_a
-         ballot_box = Numo::DFloat.zeros(n_samples, n_classes)
-         @estimators.each do |tree|
-           probs = tree.predict_proba(x)
-           tree.classes.size.times do |n|
-             class_id = classes_arr.index(tree.classes[n])
-             ballot_box[true, class_id] += probs[true, n] unless class_id.nil?
-           end
-         end
-         (ballot_box.transpose / ballot_box.sum(axis: 1)).transpose
-       end
-
-       # Return the index of the leaf that each sample reached.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
-       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) The leaf index for each sample.
-       def apply(x)
-         check_sample_array(x)
-         Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose
-       end
-
-       # Dump marshal data.
-       # @return [Hash] The marshal data about RandomForestClassifier.
-       def marshal_dump
-         { params: @params,
-           estimators: @estimators,
-           classes: @classes,
-           feature_importances: @feature_importances,
-           rng: @rng }
-       end
-
-       # Load marshal data.
-       # @return [nil]
-       def marshal_load(obj)
-         @params = obj[:params]
-         @estimators = obj[:estimators]
-         @classes = obj[:classes]
-         @feature_importances = obj[:feature_importances]
-         @rng = obj[:rng]
-         nil
-       end
-     end
-   end
- end
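As with the other removed ensembles, a minimal sketch of this class under the 0.7.x API (the toy data is hypothetical). After fitting, feature_importances is normalized to sum to 1, and apply exposes the per-tree leaf index of each sample:

    require 'svmkit'

    x = Numo::DFloat[[0.0, 0.1], [0.2, 0.0], [1.0, 1.1], [0.9, 1.2]]
    y = Numo::Int32[0, 0, 1, 1]

    forest = SVMKit::Ensemble::RandomForestClassifier.new(
      n_estimators: 10, max_depth: 3, random_seed: 1)
    forest.fit(x, y)
    forest.predict(x)            # majority vote over the trees
    forest.feature_importances   # => Numo::DFloat, sums to 1.0
    forest.apply(x)              # => Numo::Int32, shape [n_samples, n_estimators]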