rumale 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +1 -1
- data/ext/rumale/rumale.c +40 -44
- data/lib/rumale/ensemble/extra_trees_classifier.rb +135 -0
- data/lib/rumale/ensemble/extra_trees_regressor.rb +121 -0
- data/lib/rumale/tree/decision_tree_classifier.rb +2 -2
- data/lib/rumale/tree/decision_tree_regressor.rb +1 -1
- data/lib/rumale/tree/extra_tree_classifier.rb +119 -0
- data/lib/rumale/tree/extra_tree_regressor.rb +106 -0
- data/lib/rumale/version.rb +1 -1
- data/lib/rumale.rb +4 -0
- data/rumale.gemspec +1 -1
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 48089085f7a6249801c36408822454d4e0b293fb
+  data.tar.gz: c069743334925f090699ca30da72b35c8e70f5f2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d95950b1d358be77f93b6d4e0593355fd043a1abe712763b9613b57a87a83e627d41c978c1a236ce94c9b259bc533a03b471fe7630f862f94bd7aeea8c77377e
+  data.tar.gz: 307713e776a611ed05c0a21630c69de8abb12717f97a1c452bdba4bfe177dbe10c3b73dc20b64e236c42a1402875678cada4c736058555755586833ebb460c71
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,8 @@
+# 0.9.1
+- Add class for Extra-Trees classifier.
+- Add class for Extra-Trees regressor.
+- Refactor extension modules of decision tree estimators for improving performance.
+
 # 0.9.0
 ## Breaking changes
 - Decide to introduce Ruby extensions for improving performance.
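The headline change is the pair of new Extra-Trees ensemble classes. They expose the same fit/predict interface as the existing RandomForest estimators; the sketch below is based on the @example blocks in the added files, with made-up toy data that is not part of the diff:

```ruby
require 'rumale'
require 'numo/narray'

# Made-up toy data: four 2-D samples with binary labels.
samples = Numo::DFloat[[0.0, 0.1], [0.2, 0.3], [1.0, 1.1], [1.2, 1.3]]
labels  = Numo::Int32[0, 0, 1, 1]

estimator = Rumale::Ensemble::ExtraTreesClassifier.new(
  n_estimators: 10, criterion: 'gini', max_depth: 3, random_seed: 1
)
estimator.fit(samples, labels)
results = estimator.predict(samples) # Numo::Int32 of predicted labels
```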
data/README.md
CHANGED
@@ -12,7 +12,7 @@ Rumale (**Ru**by **ma**chine **le**arning) is a machine learninig library in Rub
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 Rumale supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor classifier,
+Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor classifier,
 K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
 
 This project was formerly known as "SVMKit".
data/ext/rumale/rumale.c
CHANGED
@@ -183,33 +183,32 @@ sub_sum_vec(VALUE sum_vec, VALUE target)
  * @!visibility private
  * Find for split point with maximum information gain.
  *
- * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels,
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels, n_classes) -> Array<Float>
  *
  * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'gini' and 'entropy'.
  * @param impurity [Float] The impurity of whole dataset.
  * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
  * @param sorted_labels [Numo::Int32] (shape: [n_labels]) The labels sorted according to feature values.
- * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
 * @param n_classes [Integer] The number of classes.
 * @return [Float] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
 */
 static VALUE
-find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE
+find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE n_classes_)
 {
-  long i;
-  long curr_pos;
-  long next_pos;
-  long n_l_elements;
-  long n_r_elements;
   const long n_classes = NUM2LONG(n_classes_);
   const long n_elements = RARRAY_LEN(sorted_f);
-  const long n_uniq_elements = RARRAY_LEN(uniqed_f);
   const double w_impurity = NUM2DBL(whole_impurity);
+  long iter = 0;
+  long curr_pos = 0;
+  long next_pos = 0;
+  long n_l_elements = 0;
+  long n_r_elements = n_elements;
+  double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+  double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+  double next_el;
   double l_impurity;
   double r_impurity;
   double gain;
-  double curr_el;
-  double next_el;
   VALUE l_histogram = create_zero_vector(n_classes);
   VALUE r_histogram = create_zero_vector(n_classes);
   VALUE opt_params = rb_ary_new2(4);
@@ -217,22 +216,18 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
   /* Initialize optimal parameters. */
   rb_ary_store(opt_params, 0, DBL2NUM(0)); /* left impurity */
   rb_ary_store(opt_params, 1, DBL2NUM(w_impurity)); /* right impurity */
-  rb_ary_store(opt_params, 2, rb_ary_entry(
+  rb_ary_store(opt_params, 2, rb_ary_entry(sorted_f, 0)); /* threshold */
   rb_ary_store(opt_params, 3, DBL2NUM(0)); /* gain */
 
   /* Initialize child node variables. */
-
-
-  for (i = 0; i < n_elements; i++) {
-    increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, i)));
+  for (iter = 0; iter < n_elements; iter++) {
+    increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, iter)));
   }
 
   /* Find optimal parameters. */
-
-  /* Find new split point. */
-  curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+  while (curr_pos < n_elements && curr_el != last_el) {
     next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
-    while (next_pos < n_elements && next_el
+    while (next_pos < n_elements && next_el == curr_el) {
       increment_histogram(l_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
       n_l_elements++;
       decrement_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
@@ -250,6 +245,9 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
       rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
       rb_ary_store(opt_params, 3, DBL2NUM(gain));
     }
+    if (next_pos == n_elements) break;
+    curr_pos = next_pos;
+    curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
   }
 
   return opt_params;
@@ -259,32 +257,31 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  * @!visibility private
  * Find for split point with maximum information gain.
  *
- * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets) -> Array<Float>
  *
  * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'.
  * @param impurity [Float] The impurity of whole dataset.
  * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
  * @param sorted_targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values sorted according to feature values.
- * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
 * @return [Float] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
 */
 static VALUE
-find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y
+find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y)
 {
-  long i;
-  long curr_pos;
-  long next_pos;
-  long n_l_elements;
-  long n_r_elements;
   const long n_elements = RARRAY_LEN(sorted_f);
-  const long n_uniq_elements = RARRAY_LEN(uniqed_f);
   const long n_dimensions = RARRAY_LEN(rb_ary_entry(sorted_y, 0));
   const double w_impurity = NUM2DBL(whole_impurity);
+  long iter = 0;
+  long curr_pos = 0;
+  long next_pos = 0;
+  long n_l_elements = 0;
+  long n_r_elements = n_elements;
+  double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+  double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+  double next_el;
   double l_impurity;
   double r_impurity;
   double gain;
-  double curr_el;
-  double next_el;
   VALUE l_sum_vec = create_zero_vector(n_dimensions);
   VALUE r_sum_vec = create_zero_vector(n_dimensions);
   VALUE l_target_vecs = rb_ary_new();
@@ -295,24 +292,20 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
   /* Initialize optimal parameters. */
   rb_ary_store(opt_params, 0, DBL2NUM(0)); /* left impurity */
   rb_ary_store(opt_params, 1, DBL2NUM(w_impurity)); /* right impurity */
-  rb_ary_store(opt_params, 2, rb_ary_entry(
+  rb_ary_store(opt_params, 2, rb_ary_entry(sorted_f, 0)); /* threshold */
   rb_ary_store(opt_params, 3, DBL2NUM(0)); /* gain */
 
   /* Initialize child node variables. */
-
-
-  for (i = 0; i < n_elements; i++) {
-    target = rb_ary_entry(sorted_y, i);
+  for (iter = 0; iter < n_elements; iter++) {
+    target = rb_ary_entry(sorted_y, iter);
     add_sum_vec(r_sum_vec, target);
     rb_ary_push(r_target_vecs, target);
   }
 
   /* Find optimal parameters. */
-
-  /* Find new split point. */
-  curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+  while (curr_pos < n_elements && curr_el != last_el) {
     next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
-    while (next_pos < n_elements && next_el
+    while (next_pos < n_elements && next_el == curr_el) {
       target = rb_ary_entry(sorted_y, next_pos);
       add_sum_vec(l_sum_vec, target);
       rb_ary_push(l_target_vecs, target);
@@ -333,6 +326,9 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
       rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
       rb_ary_store(opt_params, 3, DBL2NUM(gain));
     }
+    if (next_pos == n_elements) break;
+    curr_pos = next_pos;
+    curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
   }
 
   return opt_params;
@@ -411,8 +407,8 @@ void Init_rumale(void)
   */
   VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");
 
-
-
-
-
+  rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 5);
+  rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 4);
+  rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
+  rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
 }
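The refactor above removes the uniqed_features argument: both split-search functions now make a single pass over the already-sorted feature values, skipping runs of equal values with next_pos and scoring one candidate threshold, the midpoint 0.5 * (curr_el + next_el), per pair of adjacent distinct values. A rough Ruby transliteration of just that scanning skeleton (illustrative only; the histogram and impurity bookkeeping of the C code is omitted, and candidate_thresholds is a name invented for this sketch):

```ruby
# Enumerate the candidate thresholds the refactored scan visits:
# one midpoint per adjacent pair of distinct values in the sorted array.
def candidate_thresholds(sorted_f)
  n = sorted_f.size
  curr_pos = 0
  curr_el = sorted_f[0]
  last_el = sorted_f[n - 1]
  thresholds = []
  while curr_pos < n && curr_el != last_el
    next_pos = curr_pos
    next_pos += 1 while next_pos < n && sorted_f[next_pos] == curr_el
    break if next_pos == n
    thresholds << 0.5 * (curr_el + sorted_f[next_pos]) # split between distinct neighbors
    curr_pos = next_pos
    curr_el = sorted_f[curr_pos]
  end
  thresholds
end

candidate_thresholds([1.0, 1.0, 2.0, 3.0, 3.0]) # => [1.5, 2.5]
```

This is also why the callers in decision_tree_classifier.rb and decision_tree_regressor.rb (further down) no longer pass an array of unique feature values.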
data/lib/rumale/ensemble/extra_trees_classifier.rb
ADDED
@@ -0,0 +1,135 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/extra_tree_classifier'
+require 'rumale/ensemble/random_forest_classifier'
+
+module Rumale
+  module Ensemble
+    # ExtraTreesClassifier is a class that implements extremely randomized trees for classification.
+    # The algorithm of extremely randomized trees is similar to random forest.
+    # The features of the algorithm of extremely randomized trees are
+    # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Ensemble::ExtraTreesClassifier.new(
+    #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, traininig_labels)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreesClassifier < RandomForestClassifier
+      # Return the set of estimators.
+      # @return [Array<ExtraTreeClassifier>]
+      attr_reader :estimators
+
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new classifier with extremely randomized trees.
+      #
+      # @param n_estimators [Integer] The numeber of trees for contructing extremely randomized trees.
+      # @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding spliting point.
+      def initialize(n_estimators: 10,
+                     criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                 max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [ExtraTreesClassifier] The learned classifier itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
+        # Initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        # Construct trees.
+        @estimators = Array.new(@params[:n_estimators]) do
+          tree = Tree::ExtraTreeClassifier.new(
+            criterion: @params[:criterion], max_depth: @params[:max_depth],
+            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+            max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+          )
+          tree.fit(x, y)
+          @feature_importances += tree.feature_importances
+          tree
+        end
+        @feature_importances /= @feature_importances.sum
+        self
+      end
+
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreesClassifier.
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+    end
+  end
+end
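As the class comment says, the only departures from RandomForestClassifier are the random per-node thresholds and the absence of bagging: each ExtraTreeClassifier in fit above is trained on the full (x, y) rather than a bootstrap sample. The feature-importance aggregation in fit is an element-wise sum over trees followed by normalization; a standalone sketch of that step with made-up numbers:

```ruby
require 'numo/narray'

# Hypothetical per-tree importances for three features from two trees.
per_tree = [Numo::DFloat[0.2, 0.5, 0.3], Numo::DFloat[0.4, 0.4, 0.2]]

feature_importances = Numo::DFloat.zeros(3)
per_tree.each { |imp| feature_importances += imp }
feature_importances /= feature_importances.sum
# => Numo::DFloat[0.3, 0.45, 0.25], i.e. normalized to sum to 1
```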
data/lib/rumale/ensemble/extra_trees_regressor.rb
ADDED
@@ -0,0 +1,121 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/extra_tree_regressor'
+require 'rumale/ensemble/random_forest_regressor'
+
+module Rumale
+  module Ensemble
+    # ExtraTreesRegressor is a class that implements extremely randomized trees for regression
+    # The algorithm of extremely randomized trees is similar to random forest.
+    # The features of the algorithm of extremely randomized trees are
+    # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Ensemble::ExtraTreesRegressor.new(
+    #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, traininig_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreesRegressor < RandomForestRegressor
+      # Return the set of estimators.
+      # @return [Array<ExtraTreeRegressor>]
+      attr_reader :estimators
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new regressor with extremely randomized trees.
+      #
+      # @param n_estimators [Integer] The numeber of trees for contructing extremely randomized trees.
+      # @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding spliting point.
+      def initialize(n_estimators: 10,
+                     criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                 max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [ExtraTreesRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        # Initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        # Construct forest.
+        @estimators = Array.new(@params[:n_estimators]) do
+          tree = Tree::ExtraTreeRegressor.new(
+            criterion: @params[:criterion], max_depth: @params[:max_depth],
+            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+            max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+          )
+          tree.fit(x, y)
+          @feature_importances += tree.feature_importances
+          tree
+        end
+        @feature_importances /= @feature_importances.sum
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreesRegressor.
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+    end
+  end
+end
data/lib/rumale/tree/decision_tree_classifier.rb
CHANGED
@@ -155,9 +155,9 @@ module Rumale
       def best_split(features, y, whole_impurity)
         order = features.sort_index
         sorted_f = features[order].to_a
-        sorted_y = y[order,
+        sorted_y = y[order, 0].to_a
         n_classes = @classes.size
-        find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y,
+        find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, n_classes)
       end
 
       def impurity(y)
data/lib/rumale/tree/decision_tree_regressor.rb
CHANGED
@@ -129,7 +129,7 @@ module Rumale
         order = features.sort_index
         sorted_f = features[order].to_a
         sorted_y = y[order, true].to_a
-        find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y
+        find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y)
       end
 
       def impurity(y)
data/lib/rumale/tree/extra_tree_classifier.rb
ADDED
@@ -0,0 +1,119 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/decision_tree_classifier'
+
+module Rumale
+  module Tree
+    # ExtraTreeClassifier is a class that implements extra randomized tree for classification.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Tree::ExtraTreeClassifier.new(
+    #       criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, traininig_labels)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreeClassifier < DecisionTreeClassifier
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the learned tree.
+      # @return [Node]
+      attr_reader :tree
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Return the labels assigned each leaf.
+      # @return [Numo::Int32] (size: n_leafs)
+      attr_reader :leaf_labels
+
+      # Create a new classifier with extra randomized tree algorithm.
+      #
+      # @param criterion [String] The function to evaluate spliting point. Supported criteria are 'gini' and 'entropy'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding spliting point.
+      def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                     random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                 max_features: max_features, random_seed: random_seed)
+        check_params_integer(min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                              min_samples_leaf: min_samples_leaf, max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [ExtraTreeClassifier] The learned classifier itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
+        super
+      end
+
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreeClassifier
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+
+      private
+
+      def best_split(features, y, whole_impurity)
+        threshold = @rng.rand(features.min..features.max)
+        l_ids = features.le(threshold).where
+        r_ids = features.gt(threshold).where
+        l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
+        r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
+        gain = whole_impurity -
+               l_impurity * l_ids.size.fdiv(y.shape[0]) -
+               r_impurity * r_ids.size.fdiv(y.shape[0])
+        [l_impurity, r_impurity, threshold, gain]
+      end
+    end
+  end
+end
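The private best_split above is where the extra randomness lives: instead of scanning for the optimal threshold the way DecisionTreeClassifier does, it draws one threshold uniformly from the feature's observed range and scores only that cut. A toy end-to-end run of the same computation (the gini helper and the data are invented for this sketch; only the threshold/le/gt/where pattern mirrors the method above):

```ruby
require 'numo/narray'

# Gini impurity of a 1-D Numo label vector (helper invented for this sketch).
def gini(labels)
  n = labels.size.to_f
  1.0 - labels.to_a.tally.values.sum { |c| (c / n)**2 }
end

rng      = Random.new(1)
features = Numo::DFloat[0.2, 1.5, 3.1, 4.8] # one feature column
labels   = Numo::Int32[0, 0, 1, 1]

threshold = rng.rand(features.min..features.max) # the single random cut
l_ids = features.le(threshold).where             # sample indices going left
r_ids = features.gt(threshold).where             # sample indices going right
l_imp = l_ids.size > 0 ? gini(labels[l_ids]) : 0.0
r_imp = r_ids.size > 0 ? gini(labels[r_ids]) : 0.0
gain  = gini(labels) -
        l_imp * l_ids.size.fdiv(labels.size) -
        r_imp * r_ids.size.fdiv(labels.size)
```

ExtraTreeRegressor (next file) uses the identical best_split, with 'mse'/'mae' impurity in place of Gini.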
data/lib/rumale/tree/extra_tree_regressor.rb
ADDED
@@ -0,0 +1,106 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/decision_tree_regressor'
+
+module Rumale
+  module Tree
+    # ExtraTreeRegressor is a class that implements extra randomized tree for regression.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Tree::ExtraTreeRegressor.new(
+    #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, traininig_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreeRegressor < DecisionTreeRegressor
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the learned tree.
+      # @return [Node]
+      attr_reader :tree
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Return the values assigned each leaf.
+      # @return [Numo::DFloat] (shape: [n_leafs, n_outputs])
+      attr_reader :leaf_values
+
+      # Create a new regressor with extra randomized tree algorithm.
+      #
+      # @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding spliting point.
+      def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                     random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                 max_features: max_features, random_seed: random_seed)
+        check_params_integer(min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                              min_samples_leaf: min_samples_leaf, max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The taget values to be used for fitting the model.
+      # @return [ExtraTreeRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        super
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreeRegressor
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+
+      private
+
+      def best_split(features, y, whole_impurity)
+        threshold = @rng.rand(features.min..features.max)
+        l_ids = features.le(threshold).where
+        r_ids = features.gt(threshold).where
+        l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
+        r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
+        gain = whole_impurity -
+               l_impurity * l_ids.size.fdiv(y.shape[0]) -
+               r_impurity * r_ids.size.fdiv(y.shape[0])
+        [l_impurity, r_impurity, threshold, gain]
+      end
+    end
+  end
+end
data/lib/rumale/version.rb
CHANGED
data/lib/rumale.rb
CHANGED
@@ -45,10 +45,14 @@ require 'rumale/tree/node'
 require 'rumale/tree/base_decision_tree'
 require 'rumale/tree/decision_tree_classifier'
 require 'rumale/tree/decision_tree_regressor'
+require 'rumale/tree/extra_tree_classifier'
+require 'rumale/tree/extra_tree_regressor'
 require 'rumale/ensemble/ada_boost_classifier'
 require 'rumale/ensemble/ada_boost_regressor'
 require 'rumale/ensemble/random_forest_classifier'
 require 'rumale/ensemble/random_forest_regressor'
+require 'rumale/ensemble/extra_trees_classifier'
+require 'rumale/ensemble/extra_trees_regressor'
 require 'rumale/clustering/k_means'
 require 'rumale/clustering/dbscan'
 require 'rumale/decomposition/pca'
data/rumale.gemspec
CHANGED
@@ -17,7 +17,7 @@ Rumale is a machine learninig library in Ruby.
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 Rumale currently supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor algorithm,
+Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
 K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
 MSG
 spec.homepage = 'https://github.com/yoshoku/rumale'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.9.
+  version: 0.9.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-
+date: 2019-05-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -99,7 +99,7 @@ description: |
   Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
   Rumale currently supports Linear / Kernel Support Vector Machine,
   Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-  Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor algorithm,
+  Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
   K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
 email:
 - yoshoku@outlook.com
@@ -140,6 +140,8 @@ files:
 - lib/rumale/decomposition/pca.rb
 - lib/rumale/ensemble/ada_boost_classifier.rb
 - lib/rumale/ensemble/ada_boost_regressor.rb
+- lib/rumale/ensemble/extra_trees_classifier.rb
+- lib/rumale/ensemble/extra_trees_regressor.rb
 - lib/rumale/ensemble/random_forest_classifier.rb
 - lib/rumale/ensemble/random_forest_regressor.rb
 - lib/rumale/evaluation_measure/accuracy.rb
@@ -199,6 +201,8 @@ files:
 - lib/rumale/tree/base_decision_tree.rb
 - lib/rumale/tree/decision_tree_classifier.rb
 - lib/rumale/tree/decision_tree_regressor.rb
+- lib/rumale/tree/extra_tree_classifier.rb
+- lib/rumale/tree/extra_tree_regressor.rb
 - lib/rumale/tree/node.rb
 - lib/rumale/utils.rb
 - lib/rumale/validation.rb