rumale 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +1 -0
  3. data/.gitignore +20 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +47 -0
  6. data/.rubocop_todo.yml +58 -0
  7. data/.travis.yml +13 -0
  8. data/CHANGELOG.md +2 -0
  9. data/CODE_OF_CONDUCT.md +74 -0
  10. data/Gemfile +4 -0
  11. data/LICENSE.txt +23 -0
  12. data/README.md +175 -0
  13. data/Rakefile +6 -0
  14. data/bin/console +14 -0
  15. data/bin/setup +8 -0
  16. data/lib/rumale.rb +70 -0
  17. data/lib/rumale/base/base_estimator.rb +13 -0
  18. data/lib/rumale/base/classifier.rb +36 -0
  19. data/lib/rumale/base/cluster_analyzer.rb +31 -0
  20. data/lib/rumale/base/evaluator.rb +17 -0
  21. data/lib/rumale/base/regressor.rb +36 -0
  22. data/lib/rumale/base/splitter.rb +21 -0
  23. data/lib/rumale/base/transformer.rb +22 -0
  24. data/lib/rumale/clustering/dbscan.rb +125 -0
  25. data/lib/rumale/clustering/k_means.rb +138 -0
  26. data/lib/rumale/dataset.rb +110 -0
  27. data/lib/rumale/decomposition/nmf.rb +141 -0
  28. data/lib/rumale/decomposition/pca.rb +148 -0
  29. data/lib/rumale/ensemble/ada_boost_classifier.rb +196 -0
  30. data/lib/rumale/ensemble/ada_boost_regressor.rb +178 -0
  31. data/lib/rumale/ensemble/random_forest_classifier.rb +180 -0
  32. data/lib/rumale/ensemble/random_forest_regressor.rb +141 -0
  33. data/lib/rumale/evaluation_measure/accuracy.rb +29 -0
  34. data/lib/rumale/evaluation_measure/f_score.rb +50 -0
  35. data/lib/rumale/evaluation_measure/log_loss.rb +45 -0
  36. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +29 -0
  37. data/lib/rumale/evaluation_measure/mean_squared_error.rb +29 -0
  38. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +62 -0
  39. data/lib/rumale/evaluation_measure/precision.rb +50 -0
  40. data/lib/rumale/evaluation_measure/precision_recall.rb +91 -0
  41. data/lib/rumale/evaluation_measure/purity.rb +40 -0
  42. data/lib/rumale/evaluation_measure/r2_score.rb +43 -0
  43. data/lib/rumale/evaluation_measure/recall.rb +50 -0
  44. data/lib/rumale/kernel_approximation/rbf.rb +121 -0
  45. data/lib/rumale/kernel_machine/kernel_svc.rb +193 -0
  46. data/lib/rumale/linear_model/base_linear_model.rb +89 -0
  47. data/lib/rumale/linear_model/lasso.rb +136 -0
  48. data/lib/rumale/linear_model/linear_regression.rb +110 -0
  49. data/lib/rumale/linear_model/logistic_regression.rb +159 -0
  50. data/lib/rumale/linear_model/ridge.rb +110 -0
  51. data/lib/rumale/linear_model/svc.rb +183 -0
  52. data/lib/rumale/linear_model/svr.rb +122 -0
  53. data/lib/rumale/model_selection/cross_validation.rb +123 -0
  54. data/lib/rumale/model_selection/grid_search_cv.rb +247 -0
  55. data/lib/rumale/model_selection/k_fold.rb +76 -0
  56. data/lib/rumale/model_selection/stratified_k_fold.rb +94 -0
  57. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +100 -0
  58. data/lib/rumale/naive_bayes/naive_bayes.rb +315 -0
  59. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +111 -0
  60. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +93 -0
  61. data/lib/rumale/optimizer/nadam.rb +90 -0
  62. data/lib/rumale/optimizer/rmsprop.rb +69 -0
  63. data/lib/rumale/optimizer/sgd.rb +65 -0
  64. data/lib/rumale/optimizer/yellow_fin.rb +144 -0
  65. data/lib/rumale/pairwise_metric.rb +91 -0
  66. data/lib/rumale/pipeline/pipeline.rb +197 -0
  67. data/lib/rumale/polynomial_model/base_factorization_machine.rb +99 -0
  68. data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +197 -0
  69. data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +131 -0
  70. data/lib/rumale/preprocessing/l2_normalizer.rb +62 -0
  71. data/lib/rumale/preprocessing/label_encoder.rb +94 -0
  72. data/lib/rumale/preprocessing/min_max_scaler.rb +92 -0
  73. data/lib/rumale/preprocessing/one_hot_encoder.rb +98 -0
  74. data/lib/rumale/preprocessing/standard_scaler.rb +86 -0
  75. data/lib/rumale/probabilistic_output.rb +112 -0
  76. data/lib/rumale/tree/base_decision_tree.rb +153 -0
  77. data/lib/rumale/tree/decision_tree_classifier.rb +163 -0
  78. data/lib/rumale/tree/decision_tree_regressor.rb +135 -0
  79. data/lib/rumale/tree/node.rb +70 -0
  80. data/lib/rumale/utils.rb +37 -0
  81. data/lib/rumale/validation.rb +79 -0
  82. data/lib/rumale/values.rb +13 -0
  83. data/lib/rumale/version.rb +6 -0
  84. data/rumale.gemspec +41 -0
  85. metadata +204 -0
@@ -0,0 +1,178 @@
+ # frozen_string_literal: true
+
+ require 'rumale/values'
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/regressor'
+ require 'rumale/tree/decision_tree_regressor'
+
+ module Rumale
+   module Ensemble
+     # AdaBoostRegressor is a class that implements AdaBoost.RT for regression.
+     # This class uses a decision tree as the weak learner.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Ensemble::AdaBoostRegressor.new(
+     #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - D. L. Shrestha and D. P. Solomatine, "Experiments with AdaBoost.RT, an Improved Boosting Scheme for Regression," Neural Computation 18 (7), pp. 1678--1710, 2006.
+     #
+     class AdaBoostRegressor
+       include Base::BaseEstimator
+       include Base::Regressor
+
+       # Return the set of estimators.
+       # @return [Array<DecisionTreeRegressor>]
+       attr_reader :estimators
+
+       # Return the weight for each weak learner.
+       # @return [Numo::DFloat] (size: n_estimators)
+       attr_reader :estimator_weights
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new regressor with AdaBoost.
+       #
+       # @param n_estimators [Integer] The number of decision trees for constructing the AdaBoost regressor.
+       # @param threshold [Float] The threshold for delimiting correct and incorrect predictions. The value is constrained to the interval [0, 1].
+       # @param exponent [Float] The exponent for the weight of each weak learner.
+       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mse' and 'mae'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the decision tree grows without a depth limit.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10, threshold: 0.2, exponent: 1.0,
+                      criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                           max_features: max_features, random_seed: random_seed)
+         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+         check_params_float(threshold: threshold, exponent: exponent)
+         check_params_string(criterion: criterion)
+         check_params_positive(n_estimators: n_estimators, threshold: threshold, exponent: exponent,
+                               max_depth: max_depth,
+                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                               max_features: max_features)
+         @params = {}
+         @params[:n_estimators] = n_estimators
+         @params[:threshold] = threshold
+         @params[:exponent] = exponent
+         @params[:criterion] = criterion
+         @params[:max_depth] = max_depth
+         @params[:max_leaf_nodes] = max_leaf_nodes
+         @params[:min_samples_leaf] = min_samples_leaf
+         @params[:max_features] = max_features
+         @params[:random_seed] = random_seed
+         @params[:random_seed] ||= srand
+         @estimators = nil
+         @feature_importances = nil
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
+       # @return [AdaBoostRegressor] The learned regressor itself.
+       def fit(x, y) # rubocop:disable Metrics/AbcSize
+         check_sample_array(x)
+         check_tvalue_array(y)
+         check_sample_tvalue_size(x, y)
+         # Check target values.
+         raise ArgumentError, 'Expect target value vector to be 1-D array' unless y.shape.size == 1
+         # Initialize some variables.
+         n_samples, n_features = x.shape
+         @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
+         @estimators = []
+         @estimator_weights = []
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         # Construct the boosted ensemble.
+         @params[:n_estimators].times do |_t|
+           # Fit a weak learner on a weighted resample of the training set.
+           ids = Rumale::Utils.choice_ids(n_samples, observation_weights, @rng)
+           tree = Tree::DecisionTreeRegressor.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+           )
+           tree.fit(x[ids, true], y[ids])
+           p = tree.predict(x)
+           # Calculate the relative errors.
+           abs_err = ((p - y) / y).abs
+           err = observation_weights[abs_err.gt(@params[:threshold])].sum
+           break if err <= 0.0
+           # Calculate the weight of the weak learner.
+           beta = err**@params[:exponent]
+           weight = Math.log(1.fdiv(beta))
+           # Store the model.
+           @estimators.push(tree)
+           @estimator_weights.push(weight)
+           @feature_importances += weight * tree.feature_importances
+           # Update the observation weights.
+           update = Numo::DFloat.ones(n_samples)
+           update[abs_err.le(@params[:threshold])] = beta
+           observation_weights *= update
+           observation_weights = observation_weights.clip(1.0e-15, nil)
+           sum_observation_weights = observation_weights.sum
+           break if sum_observation_weights.zero?
+           observation_weights /= sum_observation_weights
+         end
+         @estimator_weights = Numo::DFloat.asarray(@estimator_weights)
+         @feature_importances /= @estimator_weights.sum
+         self
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples]) Predicted value per sample.
+       def predict(x)
+         check_sample_array(x)
+         n_samples, = x.shape
+         predictions = Numo::DFloat.zeros(n_samples)
+         @estimators.size.times do |t|
+           predictions += @estimator_weights[t] * @estimators[t].predict(x)
+         end
+         sum_weight = @estimator_weights.sum
+         predictions / sum_weight
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about AdaBoostRegressor.
+       def marshal_dump
+         { params: @params,
+           estimators: @estimators,
+           estimator_weights: @estimator_weights,
+           feature_importances: @feature_importances,
+           rng: @rng }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @estimators = obj[:estimators]
+         @estimator_weights = obj[:estimator_weights]
+         @feature_importances = obj[:feature_importances]
+         @rng = obj[:rng]
+         nil
+       end
+     end
+   end
+ end
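
Since the class above fits a weighted ensemble of trees and serializes through marshal_dump/marshal_load, a short end-to-end check makes the API concrete. The following is a minimal sketch, assuming rumale 0.8.0 with numo-narray; the sine-wave data and all variable names are illustrative, not part of the package.

# Minimal usage sketch for AdaBoostRegressor (assumes rumale 0.8.0).
require 'rumale'

# Toy 1-D regression data: y = sin(x) plus noise (illustrative only).
x = Numo::DFloat.new(200, 1).rand * 6.0
y = Numo::NMath.sin(x[true, 0]) + Numo::DFloat.new(200).rand_norm(0.0, 0.1)

estimator = Rumale::Ensemble::AdaBoostRegressor.new(
  n_estimators: 10, criterion: 'mse', max_depth: 3, random_seed: 1
)
estimator.fit(x, y)

# Base::Regressor supplies #score (coefficient of determination).
puts "R^2: #{estimator.score(x, y)}"
# Each weak learner's weight is log(1 / beta), as computed in #fit.
puts "weights: #{estimator.estimator_weights.to_a.inspect}"

# Round-trip through Marshal using the marshal_dump/marshal_load hooks.
restored = Marshal.load(Marshal.dump(estimator))
puts "max prediction difference: #{(restored.predict(x) - estimator.predict(x)).abs.max}"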
@@ -0,0 +1,180 @@
+ # frozen_string_literal: true
+
+ require 'rumale/values'
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/classifier'
+ require 'rumale/tree/decision_tree_classifier'
+
+ module Rumale
+   # This module consists of the classes that implement ensemble-based methods.
+   module Ensemble
+     # RandomForestClassifier is a class that implements random forest for classification.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Ensemble::RandomForestClassifier.new(
+     #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     class RandomForestClassifier
+       include Base::BaseEstimator
+       include Base::Classifier
+
+       # Return the set of estimators.
+       # @return [Array<DecisionTreeClassifier>]
+       attr_reader :estimators
+
+       # Return the class labels.
+       # @return [Numo::Int32] (size: n_classes)
+       attr_reader :classes
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new classifier with random forest.
+       #
+       # @param n_estimators [Integer] The number of decision trees for constructing the random forest.
+       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the decision tree grows without a depth limit.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                           max_features: max_features, random_seed: random_seed)
+         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                               max_features: max_features)
+         @params = {}
+         @params[:n_estimators] = n_estimators
+         @params[:criterion] = criterion
+         @params[:max_depth] = max_depth
+         @params[:max_leaf_nodes] = max_leaf_nodes
+         @params[:min_samples_leaf] = min_samples_leaf
+         @params[:max_features] = max_features
+         @params[:random_seed] = random_seed
+         @params[:random_seed] ||= srand
+         @estimators = nil
+         @classes = nil
+         @feature_importances = nil
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [RandomForestClassifier] The learned classifier itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_label_array(y)
+         check_sample_label_size(x, y)
+         # Initialize some variables.
+         n_samples, n_features = x.shape
+         @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         # Construct the forest.
+         @estimators = Array.new(@params[:n_estimators]) do
+           tree = Tree::DecisionTreeClassifier.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+           )
+           bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
+           tree.fit(x[bootstrap_ids, true], y[bootstrap_ids])
+           @feature_importances += tree.feature_importances
+           tree
+         end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+       def predict(x)
+         check_sample_array(x)
+         n_samples, = x.shape
+         n_classes = @classes.size
+         classes_arr = @classes.to_a
+         ballot_box = Numo::DFloat.zeros(n_samples, n_classes)
+         @estimators.each do |tree|
+           predicted = tree.predict(x)
+           n_samples.times do |n|
+             class_id = classes_arr.index(predicted[n])
+             ballot_box[n, class_id] += 1.0 unless class_id.nil?
+           end
+         end
+         Numo::Int32[*Array.new(n_samples) { |n| @classes[ballot_box[n, true].max_index] }]
+       end
+
+       # Predict probability for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+       def predict_proba(x)
+         check_sample_array(x)
+         n_samples, = x.shape
+         n_classes = @classes.size
+         classes_arr = @classes.to_a
+         ballot_box = Numo::DFloat.zeros(n_samples, n_classes)
+         @estimators.each do |tree|
+           probs = tree.predict_proba(x)
+           tree.classes.size.times do |n|
+             class_id = classes_arr.index(tree.classes[n])
+             ballot_box[true, class_id] += probs[true, n] unless class_id.nil?
+           end
+         end
+         (ballot_box.transpose / ballot_box.sum(axis: 1)).transpose
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign to leaves.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for each sample.
+       def apply(x)
+         check_sample_array(x)
+         Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about RandomForestClassifier.
+       def marshal_dump
+         { params: @params,
+           estimators: @estimators,
+           classes: @classes,
+           feature_importances: @feature_importances,
+           rng: @rng }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @estimators = obj[:estimators]
+         @classes = obj[:classes]
+         @feature_importances = obj[:feature_importances]
+         @rng = obj[:rng]
+         nil
+       end
+     end
+   end
+ end
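
To make the ballot-box voting in #predict and #predict_proba above concrete, here is a minimal usage sketch, again assuming rumale 0.8.0; the two-blob dataset and all names are illustrative only.

# Minimal usage sketch for RandomForestClassifier (assumes rumale 0.8.0).
require 'rumale'

# Two Gaussian blobs as a toy binary classification problem (illustrative only).
x = Numo::DFloat.vstack([Numo::DFloat.new(50, 2).rand_norm(-2.0, 0.5),
                         Numo::DFloat.new(50, 2).rand_norm(2.0, 0.5)])
y = Numo::Int32.hstack([Numo::Int32.zeros(50), Numo::Int32.ones(50)])

estimator = Rumale::Ensemble::RandomForestClassifier.new(
  n_estimators: 10, criterion: 'gini', max_depth: 3, random_seed: 1
)
estimator.fit(x, y)

puts "accuracy: #{estimator.score(x, y)}"  # Base::Classifier supplies #score (mean accuracy).
puts "proba: #{estimator.predict_proba(x[0...3, true]).to_a.inspect}"
puts "leaf index shape: #{estimator.apply(x).shape.inspect}"  # => [100, 10], one leaf id per tree.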
@@ -0,0 +1,141 @@
+ # frozen_string_literal: true
+
+ require 'rumale/values'
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/regressor'
+ require 'rumale/tree/decision_tree_regressor'
+
+ module Rumale
+   module Ensemble
+     # RandomForestRegressor is a class that implements random forest for regression.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Ensemble::RandomForestRegressor.new(
+     #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     class RandomForestRegressor
+       include Base::BaseEstimator
+       include Base::Regressor
+
+       # Return the set of estimators.
+       # @return [Array<DecisionTreeRegressor>]
+       attr_reader :estimators
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new regressor with random forest.
+       #
+       # @param n_estimators [Integer] The number of decision trees for constructing the random forest.
+       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mse' and 'mae'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the decision tree grows without a depth limit.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                           max_features: max_features, random_seed: random_seed)
+         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                               max_features: max_features)
+         @params = {}
+         @params[:n_estimators] = n_estimators
+         @params[:criterion] = criterion
+         @params[:max_depth] = max_depth
+         @params[:max_leaf_nodes] = max_leaf_nodes
+         @params[:min_samples_leaf] = min_samples_leaf
+         @params[:max_features] = max_features
+         @params[:random_seed] = random_seed
+         @params[:random_seed] ||= srand
+         @estimators = nil
+         @feature_importances = nil
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+       # @return [RandomForestRegressor] The learned regressor itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_tvalue_array(y)
+         check_sample_tvalue_size(x, y)
+         # Initialize some variables.
+         n_samples, n_features = x.shape
+         @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         single_target = y.shape[1].nil?
+         # Construct the forest.
+         @estimators = Array.new(@params[:n_estimators]) do
+           tree = Tree::DecisionTreeRegressor.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+           )
+           bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
+           tree.fit(x[bootstrap_ids, true], single_target ? y[bootstrap_ids] : y[bootstrap_ids, true])
+           @feature_importances += tree.feature_importances
+           tree
+         end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+       def predict(x)
+         check_sample_array(x)
+         @estimators.map { |est| est.predict(x) }.reduce(&:+) / @params[:n_estimators]
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign to leaves.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for each sample.
+       def apply(x)
+         check_sample_array(x)
+         Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about RandomForestRegressor.
+       def marshal_dump
+         { params: @params,
+           estimators: @estimators,
+           feature_importances: @feature_importances,
+           rng: @rng }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @estimators = obj[:estimators]
+         @feature_importances = obj[:feature_importances]
+         @rng = obj[:rng]
+         nil
+       end
+     end
+   end
+ end
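
Finally, a minimal sketch exercising RandomForestRegressor, including the multi-target path selected by the single_target check in #fit. It assumes rumale 0.8.0; the synthetic features, targets, and names are illustrative only.

# Minimal usage sketch for RandomForestRegressor (assumes rumale 0.8.0).
require 'rumale'

x = Numo::DFloat.new(100, 3).rand

# Single-target fit: y is a 1-D array of shape [n_samples].
y = x[true, 0] * 2.0 - x[true, 1]
single = Rumale::Ensemble::RandomForestRegressor.new(n_estimators: 10, random_seed: 1)
single.fit(x, y)
puts "R^2 (single target): #{single.score(x, y)}"

# Multi-target fit: y is [n_samples, n_outputs]; #fit passes 2-D targets to each tree.
y_multi = Numo::DFloat.hstack([y.reshape(100, 1), (x[true, 2] * 3.0).reshape(100, 1)])
multi = Rumale::Ensemble::RandomForestRegressor.new(n_estimators: 10, random_seed: 1)
multi.fit(x, y_multi)
puts "prediction shape: #{multi.predict(x).shape.inspect}"  # => [100, 2]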