rumale 0.9.0 → 0.9.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +1 -1
- data/ext/rumale/rumale.c +40 -44
- data/lib/rumale/ensemble/extra_trees_classifier.rb +135 -0
- data/lib/rumale/ensemble/extra_trees_regressor.rb +121 -0
- data/lib/rumale/tree/decision_tree_classifier.rb +2 -2
- data/lib/rumale/tree/decision_tree_regressor.rb +1 -1
- data/lib/rumale/tree/extra_tree_classifier.rb +119 -0
- data/lib/rumale/tree/extra_tree_regressor.rb +106 -0
- data/lib/rumale/version.rb +1 -1
- data/lib/rumale.rb +4 -0
- data/rumale.gemspec +1 -1
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 48089085f7a6249801c36408822454d4e0b293fb
+  data.tar.gz: c069743334925f090699ca30da72b35c8e70f5f2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d95950b1d358be77f93b6d4e0593355fd043a1abe712763b9613b57a87a83e627d41c978c1a236ce94c9b259bc533a03b471fe7630f862f94bd7aeea8c77377e
+  data.tar.gz: 307713e776a611ed05c0a21630c69de8abb12717f97a1c452bdba4bfe177dbe10c3b73dc20b64e236c42a1402875678cada4c736058555755586833ebb460c71
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,8 @@
+# 0.9.1
+- Add class for Extra-Trees classifier.
+- Add class for Extra-Trees regressor.
+- Refactor extension modules of decision tree estimators for improving performance.
+
 # 0.9.0
 ## Breaking changes
 - Decide to introduce Ruby extensions for improving performance.
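The two new ensemble classes follow the RandomForestClassifier / RandomForestRegressor interface. A minimal sketch of the classifier on synthetic data (the data, seed, and accuracy check below are illustrative, not part of the release):

    require 'rumale'

    # Illustrative toy data: 100 samples, 4 features, 2 classes.
    x = Numo::DFloat.new(100, 4).rand
    y = Numo::Int32.cast(Array.new(100) { |n| n.even? ? 0 : 1 })

    estimator = Rumale::Ensemble::ExtraTreesClassifier.new(n_estimators: 10, random_seed: 1)
    estimator.fit(x, y)
    # Training accuracy: fraction of samples whose prediction matches the label.
    puts estimator.predict(x).eq(y).count.fdiv(y.size)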
data/README.md
CHANGED
@@ -12,7 +12,7 @@ Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 Rumale supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor classifier,
+Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor classifier,
 K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.

 This project was formerly known as "SVMKit".
data/ext/rumale/rumale.c
CHANGED
@@ -183,33 +183,32 @@ sub_sum_vec(VALUE sum_vec, VALUE target)
  * @!visibility private
  * Find the split point with maximum information gain.
  *
- * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels, uniqed_features, n_classes) -> Array<Float>
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels, n_classes) -> Array<Float>
  *
  * @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
  * @param impurity [Float] The impurity of whole dataset.
  * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
  * @param sorted_labels [Numo::Int32] (shape: [n_labels]) The labels sorted according to feature values.
- * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
  * @param n_classes [Integer] The number of classes.
  * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
  */
 static VALUE
-find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE uniqed_f, VALUE n_classes_)
+find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE n_classes_)
 {
-  long i;
-  long curr_pos;
-  long next_pos;
-  long n_l_elements;
-  long n_r_elements;
   const long n_classes = NUM2LONG(n_classes_);
   const long n_elements = RARRAY_LEN(sorted_f);
-  const long n_uniq_elements = RARRAY_LEN(uniqed_f);
   const double w_impurity = NUM2DBL(whole_impurity);
+  long iter = 0;
+  long curr_pos = 0;
+  long next_pos = 0;
+  long n_l_elements = 0;
+  long n_r_elements = n_elements;
+  double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+  double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+  double next_el;
   double l_impurity;
   double r_impurity;
   double gain;
-  double curr_el;
-  double next_el;
   VALUE l_histogram = create_zero_vector(n_classes);
   VALUE r_histogram = create_zero_vector(n_classes);
   VALUE opt_params = rb_ary_new2(4);

@@ -217,22 +216,18 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
   /* Initialize optimal parameters. */
   rb_ary_store(opt_params, 0, DBL2NUM(0)); /* left impurity */
   rb_ary_store(opt_params, 1, DBL2NUM(w_impurity)); /* right impurity */
-  rb_ary_store(opt_params, 2, rb_ary_entry(uniqed_f, 0)); /* threshold */
+  rb_ary_store(opt_params, 2, rb_ary_entry(sorted_f, 0)); /* threshold */
   rb_ary_store(opt_params, 3, DBL2NUM(0)); /* gain */

   /* Initialize child node variables. */
-  n_l_elements = 0;
-  n_r_elements = n_elements;
-  for (i = 0; i < n_elements; i++) {
-    increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, i)));
+  for (iter = 0; iter < n_elements; iter++) {
+    increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, iter)));
   }

   /* Find optimal parameters. */
-
-  /* Find new split point. */
-  curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+  while (curr_pos < n_elements && curr_el != last_el) {
     next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
-    while (next_pos < n_elements && next_el
+    while (next_pos < n_elements && next_el == curr_el) {
       increment_histogram(l_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
       n_l_elements++;
       decrement_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));

@@ -250,6 +245,9 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
       rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
       rb_ary_store(opt_params, 3, DBL2NUM(gain));
     }
+    if (next_pos == n_elements) break;
+    curr_pos = next_pos;
+    curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
   }

   return opt_params;
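The refactored classifier scan above no longer takes a separate uniqed_features array: it walks the sorted feature values directly, moving each run of equal values from the right histogram into the left one and scoring a candidate threshold at the midpoint of adjacent distinct values. A plain-Ruby re-sketch of that loop (an illustration of the technique, not the extension's code; the gini helper and plain Array inputs are assumptions):

    # Sketch of the split scan, assuming the 'gini' criterion and plain Arrays.
    def find_best_split(w_impurity, sorted_f, sorted_y, n_classes)
      n = sorted_f.size
      gini = ->(hist, m) { 1.0 - hist.sum { |c| c.fdiv(m)**2 } }
      l_hist = Array.new(n_classes, 0)
      r_hist = Array.new(n_classes, 0)
      sorted_y.each { |label| r_hist[label] += 1 }
      opt = [0.0, w_impurity, sorted_f[0], 0.0]
      n_l = 0
      n_r = n
      pos = 0
      curr_el = sorted_f.first
      last_el = sorted_f.last
      while curr_el != last_el
        # Move the whole run of values equal to curr_el into the left child.
        while pos < n && sorted_f[pos] == curr_el
          l_hist[sorted_y[pos]] += 1
          r_hist[sorted_y[pos]] -= 1
          n_l += 1
          n_r -= 1
          pos += 1
        end
        next_el = sorted_f[pos]
        l_imp = gini.call(l_hist, n_l)
        r_imp = gini.call(r_hist, n_r)
        gain = w_impurity - (n_l * l_imp + n_r * r_imp).fdiv(n)
        # Candidate threshold: midpoint between adjacent distinct values.
        opt = [l_imp, r_imp, 0.5 * (curr_el + next_el), gain] if gain > opt[3]
        curr_el = next_el
      end
      opt # [left impurity, right impurity, threshold, gain]
    end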
@@ -259,32 +257,31 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  * @!visibility private
  * Find the split point with maximum information gain.
  *
- * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets, uniqed_features) -> Array<Float>
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets) -> Array<Float>
  *
  * @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
  * @param impurity [Float] The impurity of whole dataset.
  * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
  * @param sorted_targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values sorted according to feature values.
- * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
  * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
  */
 static VALUE
-find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE uniqed_f)
+find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y)
 {
-  long i;
-  long curr_pos;
-  long next_pos;
-  long n_l_elements;
-  long n_r_elements;
   const long n_elements = RARRAY_LEN(sorted_f);
-  const long n_uniq_elements = RARRAY_LEN(uniqed_f);
   const long n_dimensions = RARRAY_LEN(rb_ary_entry(sorted_y, 0));
   const double w_impurity = NUM2DBL(whole_impurity);
+  long iter = 0;
+  long curr_pos = 0;
+  long next_pos = 0;
+  long n_l_elements = 0;
+  long n_r_elements = n_elements;
+  double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+  double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+  double next_el;
   double l_impurity;
   double r_impurity;
   double gain;
-  double curr_el;
-  double next_el;
   VALUE l_sum_vec = create_zero_vector(n_dimensions);
   VALUE r_sum_vec = create_zero_vector(n_dimensions);
   VALUE l_target_vecs = rb_ary_new();

@@ -295,24 +292,20 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
   /* Initialize optimal parameters. */
   rb_ary_store(opt_params, 0, DBL2NUM(0)); /* left impurity */
   rb_ary_store(opt_params, 1, DBL2NUM(w_impurity)); /* right impurity */
-  rb_ary_store(opt_params, 2, rb_ary_entry(uniqed_f, 0)); /* threshold */
+  rb_ary_store(opt_params, 2, rb_ary_entry(sorted_f, 0)); /* threshold */
   rb_ary_store(opt_params, 3, DBL2NUM(0)); /* gain */

   /* Initialize child node variables. */
-  n_l_elements = 0;
-  n_r_elements = n_elements;
-  for (i = 0; i < n_elements; i++) {
-    target = rb_ary_entry(sorted_y, i);
+  for (iter = 0; iter < n_elements; iter++) {
+    target = rb_ary_entry(sorted_y, iter);
     add_sum_vec(r_sum_vec, target);
     rb_ary_push(r_target_vecs, target);
   }

   /* Find optimal parameters. */
-
-  /* Find new split point. */
-  curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+  while (curr_pos < n_elements && curr_el != last_el) {
     next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
-    while (next_pos < n_elements && next_el
+    while (next_pos < n_elements && next_el == curr_el) {
       target = rb_ary_entry(sorted_y, next_pos);
       add_sum_vec(l_sum_vec, target);
       rb_ary_push(l_target_vecs, target);

@@ -333,6 +326,9 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
       rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
       rb_ary_store(opt_params, 3, DBL2NUM(gain));
     }
+    if (next_pos == n_elements) break;
+    curr_pos = next_pos;
+    curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
   }

   return opt_params;
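The regressor variant above is the same scan, but instead of class histograms it carries a running per-output sum (l_sum_vec / r_sum_vec) plus the moved target rows, so child means and hence the 'mse'/'mae' impurities can be computed without rescanning. A toy single-output illustration of that bookkeeping (values invented):

    # Targets moved into the left child so far (illustrative numbers).
    l_targets = [0.2, 0.5, 0.8]
    l_sum = l_targets.sum                 # maintained incrementally during the scan
    l_mean = l_sum / l_targets.size
    l_mse = l_targets.sum { |t| (t - l_mean)**2 } / l_targets.size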
@@ -411,8 +407,8 @@ void Init_rumale(void)
   */
   VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");

-  rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
-  rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
-  rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
-  rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
+  rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 5);
+  rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 4);
+  rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
+  rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
 }
data/lib/rumale/ensemble/extra_trees_classifier.rb
ADDED

@@ -0,0 +1,135 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/extra_tree_classifier'
+require 'rumale/ensemble/random_forest_classifier'
+
+module Rumale
+  module Ensemble
+    # ExtraTreesClassifier is a class that implements extremely randomized trees for classification.
+    # The algorithm of extremely randomized trees is similar to random forest.
+    # The features of the algorithm of extremely randomized trees are
+    # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Ensemble::ExtraTreesClassifier.new(
+    #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_labels)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreesClassifier < RandomForestClassifier
+      # Return the set of estimators.
+      # @return [Array<ExtraTreeClassifier>]
+      attr_reader :estimators
+
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new classifier with extremely randomized trees.
+      #
+      # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
+      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding splitting point.
+      def initialize(n_estimators: 10,
+                     criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [ExtraTreesClassifier] The learned classifier itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
+        # Initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        # Construct trees.
+        @estimators = Array.new(@params[:n_estimators]) do
+          tree = Tree::ExtraTreeClassifier.new(
+            criterion: @params[:criterion], max_depth: @params[:max_depth],
+            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+            max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+          )
+          tree.fit(x, y)
+          @feature_importances += tree.feature_importances
+          tree
+        end
+        @feature_importances /= @feature_importances.sum
+        self
+      end
+
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreesClassifier.
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+    end
+  end
+end
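Because ExtraTreesClassifier subclasses RandomForestClassifier, the inherited accessors work unchanged once fitted; a brief illustrative session (x, y, and new_x are assumed Numo arrays, as in the earlier sketch):

    estimator = Rumale::Ensemble::ExtraTreesClassifier.new(n_estimators: 20, random_seed: 1)
    estimator.fit(x, y)
    probabilities = estimator.predict_proba(new_x)  # [n_samples, n_classes]
    leaf_ids = estimator.apply(new_x)               # [n_samples, n_estimators]
    p estimator.feature_importances.to_a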
data/lib/rumale/ensemble/extra_trees_regressor.rb
ADDED

@@ -0,0 +1,121 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/extra_tree_regressor'
+require 'rumale/ensemble/random_forest_regressor'
+
+module Rumale
+  module Ensemble
+    # ExtraTreesRegressor is a class that implements extremely randomized trees for regression.
+    # The algorithm of extremely randomized trees is similar to random forest.
+    # The features of the algorithm of extremely randomized trees are
+    # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Ensemble::ExtraTreesRegressor.new(
+    #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreesRegressor < RandomForestRegressor
+      # Return the set of estimators.
+      # @return [Array<ExtraTreeRegressor>]
+      attr_reader :estimators
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new regressor with extremely randomized trees.
+      #
+      # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
+      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding splitting point.
+      def initialize(n_estimators: 10,
+                     criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [ExtraTreesRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        # Initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        # Construct forest.
+        @estimators = Array.new(@params[:n_estimators]) do
+          tree = Tree::ExtraTreeRegressor.new(
+            criterion: @params[:criterion], max_depth: @params[:max_depth],
+            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+            max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+          )
+          tree.fit(x, y)
+          @feature_importances += tree.feature_importances
+          tree
+        end
+        @feature_importances /= @feature_importances.sum
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreesRegressor.
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+    end
+  end
+end
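The regressor mirrors the classifier's interface; a short illustrative use (x and values assumed to be Numo::DFloat arrays):

    regressor = Rumale::Ensemble::ExtraTreesRegressor.new(n_estimators: 10, criterion: 'mse', random_seed: 1)
    regressor.fit(x, values)
    predicted = regressor.predict(x)   # Numo::DFloat, one row per sample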
data/lib/rumale/tree/decision_tree_classifier.rb
CHANGED

@@ -155,9 +155,9 @@ module Rumale
       def best_split(features, y, whole_impurity)
         order = features.sort_index
         sorted_f = features[order].to_a
-        sorted_y = y[order,
+        sorted_y = y[order, 0].to_a
         n_classes = @classes.size
-        find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y,
+        find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, n_classes)
       end

       def impurity(y)
data/lib/rumale/tree/decision_tree_regressor.rb
CHANGED

@@ -129,7 +129,7 @@ module Rumale
         order = features.sort_index
         sorted_f = features[order].to_a
         sorted_y = y[order, true].to_a
-        find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y
+        find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y)
       end

       def impurity(y)
data/lib/rumale/tree/extra_tree_classifier.rb
ADDED

@@ -0,0 +1,119 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/decision_tree_classifier'
+
+module Rumale
+  module Tree
+    # ExtraTreeClassifier is a class that implements extra randomized tree for classification.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Tree::ExtraTreeClassifier.new(
+    #       criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_labels)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreeClassifier < DecisionTreeClassifier
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the learned tree.
+      # @return [Node]
+      attr_reader :tree
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Return the labels assigned to each leaf.
+      # @return [Numo::Int32] (size: n_leafs)
+      attr_reader :leaf_labels
+
+      # Create a new classifier with extra randomized tree algorithm.
+      #
+      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding splitting point.
+      def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                     random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                              min_samples_leaf: min_samples_leaf, max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [ExtraTreeClassifier] The learned classifier itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
+        super
+      end
+
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreeClassifier.
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+
+      private
+
+      def best_split(features, y, whole_impurity)
+        threshold = @rng.rand(features.min..features.max)
+        l_ids = features.le(threshold).where
+        r_ids = features.gt(threshold).where
+        l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
+        r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
+        gain = whole_impurity -
+               l_impurity * l_ids.size.fdiv(y.shape[0]) -
+               r_impurity * r_ids.size.fdiv(y.shape[0])
+        [l_impurity, r_impurity, threshold, gain]
+      end
+    end
+  end
+end
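The only real override here is best_split: where DecisionTreeClassifier scans all candidate thresholds, the extra tree draws a single threshold uniformly from [features.min, features.max] and scores just that split. ExtraTreeRegressor below overrides best_split identically, differing only in the impurity criterion. A small illustration of the draw (numbers invented):

    features = Numo::DFloat[0.1, 0.4, 0.6, 0.9]
    rng = Random.new(1)
    threshold = rng.rand(features.min..features.max)
    l_ids = features.le(threshold).where   # samples routed to the left child
    r_ids = features.gt(threshold).where   # samples routed to the right child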
data/lib/rumale/tree/extra_tree_regressor.rb
ADDED

@@ -0,0 +1,106 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/decision_tree_regressor'
+
+module Rumale
+  module Tree
+    # ExtraTreeRegressor is a class that implements extra randomized tree for regression.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Tree::ExtraTreeRegressor.new(
+    #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreeRegressor < DecisionTreeRegressor
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the learned tree.
+      # @return [Node]
+      attr_reader :tree
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Return the values assigned to each leaf.
+      # @return [Numo::DFloat] (shape: [n_leafs, n_outputs])
+      attr_reader :leaf_values
+
+      # Create a new regressor with extra randomized tree algorithm.
+      #
+      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding splitting point.
+      def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                     random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                              min_samples_leaf: min_samples_leaf, max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [ExtraTreeRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        super
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreeRegressor.
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+
+      private
+
+      def best_split(features, y, whole_impurity)
+        threshold = @rng.rand(features.min..features.max)
+        l_ids = features.le(threshold).where
+        r_ids = features.gt(threshold).where
+        l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
+        r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
+        gain = whole_impurity -
+               l_impurity * l_ids.size.fdiv(y.shape[0]) -
+               r_impurity * r_ids.size.fdiv(y.shape[0])
+        [l_impurity, r_impurity, threshold, gain]
+      end
+    end
+  end
+end
data/lib/rumale/version.rb
CHANGED
data/lib/rumale.rb
CHANGED
@@ -45,10 +45,14 @@ require 'rumale/tree/node'
 require 'rumale/tree/base_decision_tree'
 require 'rumale/tree/decision_tree_classifier'
 require 'rumale/tree/decision_tree_regressor'
+require 'rumale/tree/extra_tree_classifier'
+require 'rumale/tree/extra_tree_regressor'
 require 'rumale/ensemble/ada_boost_classifier'
 require 'rumale/ensemble/ada_boost_regressor'
 require 'rumale/ensemble/random_forest_classifier'
 require 'rumale/ensemble/random_forest_regressor'
+require 'rumale/ensemble/extra_trees_classifier'
+require 'rumale/ensemble/extra_trees_regressor'
 require 'rumale/clustering/k_means'
 require 'rumale/clustering/dbscan'
 require 'rumale/decomposition/pca'
data/rumale.gemspec
CHANGED
@@ -17,7 +17,7 @@ Rumale is a machine learning library in Ruby.
     Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
     Rumale currently supports Linear / Kernel Support Vector Machine,
     Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-    Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor algorithm,
+    Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
     K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
   MSG
   spec.homepage = 'https://github.com/yoshoku/rumale'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 0.9.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-
+date: 2019-05-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray

@@ -99,7 +99,7 @@ description: |
   Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
   Rumale currently supports Linear / Kernel Support Vector Machine,
   Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-  Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor algorithm,
+  Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
   K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
 email:
 - yoshoku@outlook.com

@@ -140,6 +140,8 @@ files:
 - lib/rumale/decomposition/pca.rb
 - lib/rumale/ensemble/ada_boost_classifier.rb
 - lib/rumale/ensemble/ada_boost_regressor.rb
+- lib/rumale/ensemble/extra_trees_classifier.rb
+- lib/rumale/ensemble/extra_trees_regressor.rb
 - lib/rumale/ensemble/random_forest_classifier.rb
 - lib/rumale/ensemble/random_forest_regressor.rb
 - lib/rumale/evaluation_measure/accuracy.rb

@@ -199,6 +201,8 @@ files:
 - lib/rumale/tree/base_decision_tree.rb
 - lib/rumale/tree/decision_tree_classifier.rb
 - lib/rumale/tree/decision_tree_regressor.rb
+- lib/rumale/tree/extra_tree_classifier.rb
+- lib/rumale/tree/extra_tree_regressor.rb
 - lib/rumale/tree/node.rb
 - lib/rumale/utils.rb
 - lib/rumale/validation.rb