rumale 0.8.0

Files changed (85)
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +1 -0
  3. data/.gitignore +20 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +47 -0
  6. data/.rubocop_todo.yml +58 -0
  7. data/.travis.yml +13 -0
  8. data/CHANGELOG.md +2 -0
  9. data/CODE_OF_CONDUCT.md +74 -0
  10. data/Gemfile +4 -0
  11. data/LICENSE.txt +23 -0
  12. data/README.md +175 -0
  13. data/Rakefile +6 -0
  14. data/bin/console +14 -0
  15. data/bin/setup +8 -0
  16. data/lib/rumale.rb +70 -0
  17. data/lib/rumale/base/base_estimator.rb +13 -0
  18. data/lib/rumale/base/classifier.rb +36 -0
  19. data/lib/rumale/base/cluster_analyzer.rb +31 -0
  20. data/lib/rumale/base/evaluator.rb +17 -0
  21. data/lib/rumale/base/regressor.rb +36 -0
  22. data/lib/rumale/base/splitter.rb +21 -0
  23. data/lib/rumale/base/transformer.rb +22 -0
  24. data/lib/rumale/clustering/dbscan.rb +125 -0
  25. data/lib/rumale/clustering/k_means.rb +138 -0
  26. data/lib/rumale/dataset.rb +110 -0
  27. data/lib/rumale/decomposition/nmf.rb +141 -0
  28. data/lib/rumale/decomposition/pca.rb +148 -0
  29. data/lib/rumale/ensemble/ada_boost_classifier.rb +196 -0
  30. data/lib/rumale/ensemble/ada_boost_regressor.rb +178 -0
  31. data/lib/rumale/ensemble/random_forest_classifier.rb +180 -0
  32. data/lib/rumale/ensemble/random_forest_regressor.rb +141 -0
  33. data/lib/rumale/evaluation_measure/accuracy.rb +29 -0
  34. data/lib/rumale/evaluation_measure/f_score.rb +50 -0
  35. data/lib/rumale/evaluation_measure/log_loss.rb +45 -0
  36. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +29 -0
  37. data/lib/rumale/evaluation_measure/mean_squared_error.rb +29 -0
  38. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +62 -0
  39. data/lib/rumale/evaluation_measure/precision.rb +50 -0
  40. data/lib/rumale/evaluation_measure/precision_recall.rb +91 -0
  41. data/lib/rumale/evaluation_measure/purity.rb +40 -0
  42. data/lib/rumale/evaluation_measure/r2_score.rb +43 -0
  43. data/lib/rumale/evaluation_measure/recall.rb +50 -0
  44. data/lib/rumale/kernel_approximation/rbf.rb +121 -0
  45. data/lib/rumale/kernel_machine/kernel_svc.rb +193 -0
  46. data/lib/rumale/linear_model/base_linear_model.rb +89 -0
  47. data/lib/rumale/linear_model/lasso.rb +136 -0
  48. data/lib/rumale/linear_model/linear_regression.rb +110 -0
  49. data/lib/rumale/linear_model/logistic_regression.rb +159 -0
  50. data/lib/rumale/linear_model/ridge.rb +110 -0
  51. data/lib/rumale/linear_model/svc.rb +183 -0
  52. data/lib/rumale/linear_model/svr.rb +122 -0
  53. data/lib/rumale/model_selection/cross_validation.rb +123 -0
  54. data/lib/rumale/model_selection/grid_search_cv.rb +247 -0
  55. data/lib/rumale/model_selection/k_fold.rb +76 -0
  56. data/lib/rumale/model_selection/stratified_k_fold.rb +94 -0
  57. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +100 -0
  58. data/lib/rumale/naive_bayes/naive_bayes.rb +315 -0
  59. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +111 -0
  60. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +93 -0
  61. data/lib/rumale/optimizer/nadam.rb +90 -0
  62. data/lib/rumale/optimizer/rmsprop.rb +69 -0
  63. data/lib/rumale/optimizer/sgd.rb +65 -0
  64. data/lib/rumale/optimizer/yellow_fin.rb +144 -0
  65. data/lib/rumale/pairwise_metric.rb +91 -0
  66. data/lib/rumale/pipeline/pipeline.rb +197 -0
  67. data/lib/rumale/polynomial_model/base_factorization_machine.rb +99 -0
  68. data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +197 -0
  69. data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +131 -0
  70. data/lib/rumale/preprocessing/l2_normalizer.rb +62 -0
  71. data/lib/rumale/preprocessing/label_encoder.rb +94 -0
  72. data/lib/rumale/preprocessing/min_max_scaler.rb +92 -0
  73. data/lib/rumale/preprocessing/one_hot_encoder.rb +98 -0
  74. data/lib/rumale/preprocessing/standard_scaler.rb +86 -0
  75. data/lib/rumale/probabilistic_output.rb +112 -0
  76. data/lib/rumale/tree/base_decision_tree.rb +153 -0
  77. data/lib/rumale/tree/decision_tree_classifier.rb +163 -0
  78. data/lib/rumale/tree/decision_tree_regressor.rb +135 -0
  79. data/lib/rumale/tree/node.rb +70 -0
  80. data/lib/rumale/utils.rb +37 -0
  81. data/lib/rumale/validation.rb +79 -0
  82. data/lib/rumale/values.rb +13 -0
  83. data/lib/rumale/version.rb +6 -0
  84. data/rumale.gemspec +41 -0
  85. metadata +204 -0
data/lib/rumale/ensemble/ada_boost_regressor.rb
@@ -0,0 +1,178 @@
+ # frozen_string_literal: true
+
+ require 'rumale/values'
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/regressor'
+ require 'rumale/tree/decision_tree_regressor'
+
+ module Rumale
+   module Ensemble
+     # AdaBoostRegressor is a class that implements AdaBoost.RT for regression.
+     # This class uses decision trees as weak learners.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Ensemble::AdaBoostRegressor.new(
+     #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - D. L. Shrestha and D. P. Solomatine, "Experiments with AdaBoost.RT, an Improved Boosting Scheme for Regression," Neural Computation 18 (7), pp. 1678--1710, 2006.
+     #
+     class AdaBoostRegressor
+       include Base::BaseEstimator
+       include Base::Regressor
+
+       # Return the set of estimators.
+       # @return [Array<DecisionTreeRegressor>]
+       attr_reader :estimators
+
+       # Return the weight for each weak learner.
+       # @return [Numo::DFloat] (size: n_estimators)
+       attr_reader :estimator_weights
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new regressor with AdaBoost.RT.
+       #
+       # @param n_estimators [Integer] The number of decision trees for constructing the ensemble.
+       # @param threshold [Float] The threshold for delimiting correct and incorrect predictions. It is constrained to [0, 1].
+       # @param exponent [Float] The exponent for the weight of each weak learner.
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mse' and 'mae'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the decision tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10, threshold: 0.2, exponent: 1.0,
+                      criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                           max_features: max_features, random_seed: random_seed)
+         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+         check_params_float(threshold: threshold, exponent: exponent)
+         check_params_string(criterion: criterion)
+         check_params_positive(n_estimators: n_estimators, threshold: threshold, exponent: exponent,
+                               max_depth: max_depth,
+                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                               max_features: max_features)
+         @params = {}
+         @params[:n_estimators] = n_estimators
+         @params[:threshold] = threshold
+         @params[:exponent] = exponent
+         @params[:criterion] = criterion
+         @params[:max_depth] = max_depth
+         @params[:max_leaf_nodes] = max_leaf_nodes
+         @params[:min_samples_leaf] = min_samples_leaf
+         @params[:max_features] = max_features
+         @params[:random_seed] = random_seed
+         @params[:random_seed] ||= srand
+         @estimators = nil
+         @feature_importances = nil
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples]) The target values to be used for fitting the model.
+       # @return [AdaBoostRegressor] The learned regressor itself.
+       def fit(x, y) # rubocop:disable Metrics/AbcSize
+         check_sample_array(x)
+         check_tvalue_array(y)
+         check_sample_tvalue_size(x, y)
+         # Check target values.
+         raise ArgumentError, 'Expect target value vector to be 1-D array' unless y.shape.size == 1
+         # Initialize some variables.
+         n_samples, n_features = x.shape
+         @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
+         @estimators = []
+         @estimator_weights = []
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         # Construct the boosting ensemble.
+         @params[:n_estimators].times do |_t|
+           # Fit weak learner.
+           ids = Rumale::Utils.choice_ids(n_samples, observation_weights, @rng)
+           tree = Tree::DecisionTreeRegressor.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+           )
+           tree.fit(x[ids, true], y[ids])
+           p = tree.predict(x)
+           # Calculate relative errors.
+           abs_err = ((p - y) / y).abs
+           err = observation_weights[abs_err.gt(@params[:threshold])].sum
+           break if err <= 0.0
+           # Calculate weight.
+           beta = err**@params[:exponent]
+           weight = Math.log(1.fdiv(beta))
+           # Store model.
+           @estimators.push(tree)
+           @estimator_weights.push(weight)
+           @feature_importances += weight * tree.feature_importances
+           # Update observation weights.
+           update = Numo::DFloat.ones(n_samples)
+           update[abs_err.le(@params[:threshold])] = beta
+           observation_weights *= update
+           observation_weights = observation_weights.clip(1.0e-15, nil)
+           sum_observation_weights = observation_weights.sum
+           break if sum_observation_weights.zero?
+           observation_weights /= sum_observation_weights
+         end
+         @estimator_weights = Numo::DFloat.asarray(@estimator_weights)
+         @feature_importances /= @estimator_weights.sum
+         self
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples]) Predicted value per sample.
+       def predict(x)
+         check_sample_array(x)
+         n_samples, = x.shape
+         predictions = Numo::DFloat.zeros(n_samples)
+         @estimators.size.times do |t|
+           predictions += @estimator_weights[t] * @estimators[t].predict(x)
+         end
+         sum_weight = @estimator_weights.sum
+         predictions / sum_weight
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about AdaBoostRegressor.
+       def marshal_dump
+         { params: @params,
+           estimators: @estimators,
+           estimator_weights: @estimator_weights,
+           feature_importances: @feature_importances,
+           rng: @rng }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @estimators = obj[:estimators]
+         @estimator_weights = obj[:estimator_weights]
+         @feature_importances = obj[:feature_importances]
+         @rng = obj[:rng]
+         nil
+       end
+     end
+   end
+ end
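
Below is a minimal usage sketch for the AdaBoostRegressor above, assuming the rumale and numo-narray gems are installed; the synthetic data is illustrative, and the score call (R^2) relies on the Base::Regressor mixin included by this class.

  require 'rumale'

  # Synthetic data: 100 samples, 4 features. The targets are offset away from
  # zero because AdaBoost.RT measures relative error, ((p - y) / y).abs.
  x = Numo::DFloat.new(100, 4).rand
  y = x[true, 0] * 2.0 + x[true, 1] + 1.0

  estimator = Rumale::Ensemble::AdaBoostRegressor.new(
    n_estimators: 10, criterion: 'mse', max_depth: 3, random_seed: 1
  )
  estimator.fit(x, y)
  puts estimator.score(x, y) # coefficient of determination on the training data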
data/lib/rumale/ensemble/random_forest_classifier.rb
@@ -0,0 +1,180 @@
+ # frozen_string_literal: true
+
+ require 'rumale/values'
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/classifier'
+ require 'rumale/tree/decision_tree_classifier'
+
+ module Rumale
+   # This module consists of the classes that implement ensemble-based methods.
+   module Ensemble
+     # RandomForestClassifier is a class that implements random forest for classification.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Ensemble::RandomForestClassifier.new(
+     #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     class RandomForestClassifier
+       include Base::BaseEstimator
+       include Base::Classifier
+
+       # Return the set of estimators.
+       # @return [Array<DecisionTreeClassifier>]
+       attr_reader :estimators
+
+       # Return the class labels.
+       # @return [Numo::Int32] (size: n_classes)
+       attr_reader :classes
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new classifier with random forest.
+       #
+       # @param n_estimators [Integer] The number of decision trees for constructing the random forest.
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the decision tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                           max_features: max_features, random_seed: random_seed)
+         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                               max_features: max_features)
+         @params = {}
+         @params[:n_estimators] = n_estimators
+         @params[:criterion] = criterion
+         @params[:max_depth] = max_depth
+         @params[:max_leaf_nodes] = max_leaf_nodes
+         @params[:min_samples_leaf] = min_samples_leaf
+         @params[:max_features] = max_features
+         @params[:random_seed] = random_seed
+         @params[:random_seed] ||= srand
+         @estimators = nil
+         @classes = nil
+         @feature_importances = nil
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [RandomForestClassifier] The learned classifier itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_label_array(y)
+         check_sample_label_size(x, y)
+         # Initialize some variables.
+         n_samples, n_features = x.shape
+         @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         # Construct forest.
+         @estimators = Array.new(@params[:n_estimators]) do
+           tree = Tree::DecisionTreeClassifier.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+           )
+           bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
+           tree.fit(x[bootstrap_ids, true], y[bootstrap_ids])
+           @feature_importances += tree.feature_importances
+           tree
+         end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+       def predict(x)
+         check_sample_array(x)
+         n_samples, = x.shape
+         n_classes = @classes.size
+         classes_arr = @classes.to_a
+         ballot_box = Numo::DFloat.zeros(n_samples, n_classes)
+         @estimators.each do |tree|
+           predicted = tree.predict(x)
+           n_samples.times do |n|
+             class_id = classes_arr.index(predicted[n])
+             ballot_box[n, class_id] += 1.0 unless class_id.nil?
+           end
+         end
+         Numo::Int32[*Array.new(n_samples) { |n| @classes[ballot_box[n, true].max_index] }]
+       end
+
+       # Predict probability for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+       def predict_proba(x)
+         check_sample_array(x)
+         n_samples, = x.shape
+         n_classes = @classes.size
+         classes_arr = @classes.to_a
+         ballot_box = Numo::DFloat.zeros(n_samples, n_classes)
+         @estimators.each do |tree|
+           probs = tree.predict_proba(x)
+           tree.classes.size.times do |n|
+             class_id = classes_arr.index(tree.classes[n])
+             ballot_box[true, class_id] += probs[true, n] unless class_id.nil?
+           end
+         end
+         (ballot_box.transpose / ballot_box.sum(axis: 1)).transpose
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) The leaf index for each sample.
+       def apply(x)
+         check_sample_array(x)
+         Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about RandomForestClassifier.
+       def marshal_dump
+         { params: @params,
+           estimators: @estimators,
+           classes: @classes,
+           feature_importances: @feature_importances,
+           rng: @rng }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @estimators = obj[:estimators]
+         @classes = obj[:classes]
+         @feature_importances = obj[:feature_importances]
+         @rng = obj[:rng]
+         nil
+       end
+     end
+   end
+ end
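
A minimal usage sketch for RandomForestClassifier, under the same assumptions (rumale and numo-narray installed; the data and labels are illustrative only):

  require 'rumale'

  # Synthetic two-class data: label 1 when the first feature exceeds 0.5.
  x = Numo::DFloat.new(100, 4).rand
  y = Numo::Int32[*x[true, 0].gt(0.5).to_a]

  estimator = Rumale::Ensemble::RandomForestClassifier.new(
    n_estimators: 10, criterion: 'gini', max_depth: 3, random_seed: 1
  )
  estimator.fit(x, y)
  probs  = estimator.predict_proba(x) # shape: [100, 2], tree votes normalized per sample
  leaves = estimator.apply(x)         # shape: [100, 10], one leaf index per tree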
data/lib/rumale/ensemble/random_forest_regressor.rb
@@ -0,0 +1,141 @@
+ # frozen_string_literal: true
+
+ require 'rumale/values'
+ require 'rumale/base/base_estimator'
+ require 'rumale/base/regressor'
+ require 'rumale/tree/decision_tree_regressor'
+
+ module Rumale
+   module Ensemble
+     # RandomForestRegressor is a class that implements random forest for regression.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Ensemble::RandomForestRegressor.new(
+     #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     class RandomForestRegressor
+       include Base::BaseEstimator
+       include Base::Regressor
+
+       # Return the set of estimators.
+       # @return [Array<DecisionTreeRegressor>]
+       attr_reader :estimators
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new regressor with random forest.
+       #
+       # @param n_estimators [Integer] The number of decision trees for constructing the random forest.
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mse' and 'mae'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, the decision tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on the decision tree.
+       #   If nil is given, the number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching for the optimal split point.
+       #   If nil is given, the split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding the splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                           max_features: max_features, random_seed: random_seed)
+         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                               max_features: max_features)
+         @params = {}
+         @params[:n_estimators] = n_estimators
+         @params[:criterion] = criterion
+         @params[:max_depth] = max_depth
+         @params[:max_leaf_nodes] = max_leaf_nodes
+         @params[:min_samples_leaf] = min_samples_leaf
+         @params[:max_features] = max_features
+         @params[:random_seed] = random_seed
+         @params[:random_seed] ||= srand
+         @estimators = nil
+         @feature_importances = nil
+         @rng = Random.new(@params[:random_seed])
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+       # @return [RandomForestRegressor] The learned regressor itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_tvalue_array(y)
+         check_sample_tvalue_size(x, y)
+         # Initialize some variables.
+         n_samples, n_features = x.shape
+         @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         single_target = y.shape[1].nil?
+         # Construct forest.
+         @estimators = Array.new(@params[:n_estimators]) do
+           tree = Tree::DecisionTreeRegressor.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+           )
+           bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
+           tree.fit(x[bootstrap_ids, true], single_target ? y[bootstrap_ids] : y[bootstrap_ids, true])
+           @feature_importances += tree.feature_importances
+           tree
+         end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+       def predict(x)
+         check_sample_array(x)
+         @estimators.map { |est| est.predict(x) }.reduce(&:+) / @params[:n_estimators]
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign to leaves.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) The leaf index for each sample.
+       def apply(x)
+         check_sample_array(x)
+         Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about RandomForestRegressor.
+       def marshal_dump
+         { params: @params,
+           estimators: @estimators,
+           feature_importances: @feature_importances,
+           rng: @rng }
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         @params = obj[:params]
+         @estimators = obj[:estimators]
+         @feature_importances = obj[:feature_importances]
+         @rng = obj[:rng]
+         nil
+       end
+     end
+   end
+ end
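
And a corresponding sketch for RandomForestRegressor (same assumptions; as the predict method above shows, predictions are the unweighted mean over the n_estimators trees):

  require 'rumale'

  x = Numo::DFloat.new(100, 4).rand
  y = x[true, 0] + x[true, 1] * 0.5 # illustrative single target

  estimator = Rumale::Ensemble::RandomForestRegressor.new(
    n_estimators: 10, criterion: 'mse', max_depth: 3, random_seed: 1
  )
  estimator.fit(x, y)
  predicted = estimator.predict(x) # shape: [100]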