rumale 0.9.0 → 0.9.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a2dfbc60c9d47e741fc91497f8c58ade390e6c8f
-  data.tar.gz: d4cbc26e0d81fbe0de5e83d785cc836e9a5b2099
+  metadata.gz: 48089085f7a6249801c36408822454d4e0b293fb
+  data.tar.gz: c069743334925f090699ca30da72b35c8e70f5f2
 SHA512:
-  metadata.gz: 7f2b4b8ba5d7511215a2e850add19f0942cbff4157a8373eba1950c0eac9fcd0e44925d3a88b2a709c0308ef4c03cca44c501b710f4a22dc4dd573e6866d94dc
-  data.tar.gz: 4630710eef59af88274e9a411a6ad12de7e4a616280f8fc94d185e24c7bc667bf8c1f662425c64cf05f6ec9accd914ac32e1039688d09629b920329ad85354c8
+  metadata.gz: d95950b1d358be77f93b6d4e0593355fd043a1abe712763b9613b57a87a83e627d41c978c1a236ce94c9b259bc533a03b471fe7630f862f94bd7aeea8c77377e
+  data.tar.gz: 307713e776a611ed05c0a21630c69de8abb12717f97a1c452bdba4bfe177dbe10c3b73dc20b64e236c42a1402875678cada4c736058555755586833ebb460c71
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
+# 0.9.1
+- Add class for Extra-Trees classifier.
+- Add class for Extra-Trees regressor.
+- Refactor extension modules of decision tree estimators for improving performance.
+
 # 0.9.0
 ## Breaking changes
 - Decide to introduce Ruby extensions for improving performance.
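For context, here is a minimal, hypothetical usage sketch of the two ensemble estimators this release adds; the dataset variables are stand-ins, not part of the gem:

require 'rumale'

# Hypothetical toy data: 100 samples with 4 features.
samples = Numo::DFloat.new(100, 4).rand
labels  = Numo::Int32.cast(Array.new(100) { rand(2) })
values  = Numo::DFloat.new(100, 1).rand

clf = Rumale::Ensemble::ExtraTreesClassifier.new(n_estimators: 10, random_seed: 1)
clf.fit(samples, labels)
clf.predict(samples)

reg = Rumale::Ensemble::ExtraTreesRegressor.new(n_estimators: 10, random_seed: 1)
reg.fit(samples, values)
reg.predict(samples)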
data/README.md CHANGED
@@ -12,7 +12,7 @@ Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Rub
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 Rumale supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor classifier,
+Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor classifier,
 K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
 
 This project was formerly known as "SVMKit".
data/ext/rumale/rumale.c CHANGED
@@ -183,33 +183,32 @@ sub_sum_vec(VALUE sum_vec, VALUE target)
  * @!visibility private
  * Find the split point with maximum information gain.
  *
- * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels, uniqed_features, n_classes) -> Array<Float>
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels, n_classes) -> Array<Float>
  *
  * @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
  * @param impurity [Float] The impurity of the whole dataset.
  * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
  * @param sorted_labels [Numo::Int32] (shape: [n_labels]) The labels sorted according to the feature values.
- * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
  * @param n_classes [Integer] The number of classes.
  * @return [Array<Float>] The array consisting of the optimal parameters: the impurities of the child nodes, the threshold, and the gain.
  */
 static VALUE
-find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE uniqed_f, VALUE n_classes_)
+find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE n_classes_)
 {
-  long i;
-  long curr_pos;
-  long next_pos;
-  long n_l_elements;
-  long n_r_elements;
   const long n_classes = NUM2LONG(n_classes_);
   const long n_elements = RARRAY_LEN(sorted_f);
-  const long n_uniq_elements = RARRAY_LEN(uniqed_f);
   const double w_impurity = NUM2DBL(whole_impurity);
+  long iter = 0;
+  long curr_pos = 0;
+  long next_pos = 0;
+  long n_l_elements = 0;
+  long n_r_elements = n_elements;
+  double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+  double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+  double next_el;
   double l_impurity;
   double r_impurity;
   double gain;
-  double curr_el;
-  double next_el;
   VALUE l_histogram = create_zero_vector(n_classes);
   VALUE r_histogram = create_zero_vector(n_classes);
   VALUE opt_params = rb_ary_new2(4);
@@ -217,22 +216,18 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
   /* Initialize optimal parameters. */
   rb_ary_store(opt_params, 0, DBL2NUM(0)); /* left impurity */
   rb_ary_store(opt_params, 1, DBL2NUM(w_impurity)); /* right impurity */
-  rb_ary_store(opt_params, 2, rb_ary_entry(uniqed_f, 0)); /* threshold */
+  rb_ary_store(opt_params, 2, rb_ary_entry(sorted_f, 0)); /* threshold */
   rb_ary_store(opt_params, 3, DBL2NUM(0)); /* gain */

   /* Initialize child node variables. */
-  n_l_elements = 0;
-  n_r_elements = n_elements;
-  for (i = 0; i < n_elements; i++) {
-    increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, i)));
+  for (iter = 0; iter < n_elements; iter++) {
+    increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, iter)));
   }

   /* Find optimal parameters. */
-  for (curr_pos = 0, next_pos = 0; curr_pos < n_uniq_elements - 1; curr_pos++) {
-    /* Find new split point. */
-    curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+  while (curr_pos < n_elements && curr_el != last_el) {
     next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
-    while (next_pos < n_elements && next_el <= curr_el) {
+    while (next_pos < n_elements && next_el == curr_el) {
       increment_histogram(l_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
       n_l_elements++;
       decrement_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
@@ -250,6 +245,9 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
       rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
       rb_ary_store(opt_params, 3, DBL2NUM(gain));
     }
+    if (next_pos == n_elements) break;
+    curr_pos = next_pos;
+    curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
   }

   return opt_params;
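The refactor replaces the pass over uniqed_f with a single scan of the already-sorted features. To make that scan easier to follow, here is a minimal Ruby re-expression of the loop above (illustrative only; scan_split_candidates is a hypothetical helper, and the histogram bookkeeping is omitted):

# Walk the sorted feature values once; each boundary between two
# distinct adjacent values yields one candidate threshold.
def scan_split_candidates(sorted_f)
  n = sorted_f.size
  curr_pos = 0
  next_pos = 0
  curr_el = sorted_f[0]
  last_el = sorted_f[-1]
  candidates = []
  while curr_pos < n && curr_el != last_el
    # Skip the run of values equal to curr_el; in the C code this inner
    # loop also moves those samples' labels into the left histogram.
    next_pos += 1 while next_pos < n && sorted_f[next_pos] == curr_el
    break if next_pos == n
    next_el = sorted_f[next_pos]
    candidates << 0.5 * (curr_el + next_el) # midpoint threshold
    curr_pos = next_pos
    curr_el = sorted_f[curr_pos]
  end
  candidates
end

scan_split_candidates([1.0, 1.0, 2.0, 3.0]) # => [1.5, 2.5]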
@@ -259,32 +257,31 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  * @!visibility private
  * Find the split point with maximum information gain.
  *
- * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets, uniqed_features) -> Array<Float>
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets) -> Array<Float>
  *
  * @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
  * @param impurity [Float] The impurity of the whole dataset.
  * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
  * @param sorted_targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values sorted according to the feature values.
- * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
  * @return [Array<Float>] The array consisting of the optimal parameters: the impurities of the child nodes, the threshold, and the gain.
  */
 static VALUE
-find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE uniqed_f)
+find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y)
 {
-  long i;
-  long curr_pos;
-  long next_pos;
-  long n_l_elements;
-  long n_r_elements;
   const long n_elements = RARRAY_LEN(sorted_f);
-  const long n_uniq_elements = RARRAY_LEN(uniqed_f);
   const long n_dimensions = RARRAY_LEN(rb_ary_entry(sorted_y, 0));
   const double w_impurity = NUM2DBL(whole_impurity);
+  long iter = 0;
+  long curr_pos = 0;
+  long next_pos = 0;
+  long n_l_elements = 0;
+  long n_r_elements = n_elements;
+  double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+  double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+  double next_el;
   double l_impurity;
   double r_impurity;
   double gain;
-  double curr_el;
-  double next_el;
   VALUE l_sum_vec = create_zero_vector(n_dimensions);
   VALUE r_sum_vec = create_zero_vector(n_dimensions);
   VALUE l_target_vecs = rb_ary_new();
@@ -295,24 +292,20 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
   /* Initialize optimal parameters. */
   rb_ary_store(opt_params, 0, DBL2NUM(0)); /* left impurity */
   rb_ary_store(opt_params, 1, DBL2NUM(w_impurity)); /* right impurity */
-  rb_ary_store(opt_params, 2, rb_ary_entry(uniqed_f, 0)); /* threshold */
+  rb_ary_store(opt_params, 2, rb_ary_entry(sorted_f, 0)); /* threshold */
   rb_ary_store(opt_params, 3, DBL2NUM(0)); /* gain */

   /* Initialize child node variables. */
-  n_l_elements = 0;
-  n_r_elements = n_elements;
-  for (i = 0; i < n_elements; i++) {
-    target = rb_ary_entry(sorted_y, i);
+  for (iter = 0; iter < n_elements; iter++) {
+    target = rb_ary_entry(sorted_y, iter);
     add_sum_vec(r_sum_vec, target);
     rb_ary_push(r_target_vecs, target);
   }

   /* Find optimal parameters. */
-  for (curr_pos = 0, next_pos = 0; curr_pos < n_uniq_elements - 1; curr_pos++) {
-    /* Find new split point. */
-    curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+  while (curr_pos < n_elements && curr_el != last_el) {
     next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
-    while (next_pos < n_elements && next_el <= curr_el) {
+    while (next_pos < n_elements && next_el == curr_el) {
       target = rb_ary_entry(sorted_y, next_pos);
       add_sum_vec(l_sum_vec, target);
       rb_ary_push(l_target_vecs, target);
@@ -333,6 +326,9 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
       rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
       rb_ary_store(opt_params, 3, DBL2NUM(gain));
     }
+    if (next_pos == n_elements) break;
+    curr_pos = next_pos;
+    curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
   }

   return opt_params;
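Both split finders score a candidate threshold by the same quantity: the whole-node impurity minus the size-weighted impurities of the two children. A minimal sketch of that gain under the 'mse' criterion, in plain Ruby (assumes non-empty partitions; not the gem's code):

# Variance of a set of targets: the 'mse' node impurity.
def mse_impurity(targets)
  mean = targets.sum.fdiv(targets.size)
  targets.sum { |v| (v - mean)**2 }.fdiv(targets.size)
end

# Information gain of splitting `whole` into `left` and `right`.
def split_gain(whole, left, right)
  n = whole.size.to_f
  mse_impurity(whole) -
    mse_impurity(left) * (left.size / n) -
    mse_impurity(right) * (right.size / n)
end

split_gain([1.0, 1.1, 5.0, 5.2], [1.0, 1.1], [5.0, 5.2]) # large gain: both children are nearly pure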
@@ -411,8 +407,8 @@ void Init_rumale(void)
    */
   VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");

-  rb_define_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
-  rb_define_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
-  rb_define_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
-  rb_define_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
+  rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 5);
+  rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 4);
+  rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
+  rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
 }
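The helpers are now registered with rb_define_private_method, and dropping the uniqed_features argument reduces each find_split_params arity by one. A hypothetical sanity check from IRB, assuming the extension modules live under Rumale::Tree as the rb_define_module_under calls above suggest:

# Not from the gem's test suite; just probes the new bindings.
mod = Rumale::Tree::ExtDecisionTreeClassifier
mod.private_instance_methods(false).include?(:find_split_params) # => true
mod.instance_method(:find_split_params).arity                    # => 5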
data/lib/rumale/ensemble/extra_trees_classifier.rb ADDED
@@ -0,0 +1,135 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/extra_tree_classifier'
+require 'rumale/ensemble/random_forest_classifier'
+
+module Rumale
+  module Ensemble
+    # ExtraTreesClassifier is a class that implements extremely randomized trees for classification.
+    # The algorithm of extremely randomized trees is similar to random forest.
+    # Unlike random forest, extremely randomized trees do not apply the bagging procedure
+    # and randomly select the threshold for splitting the feature space.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Ensemble::ExtraTreesClassifier.new(
+    #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_labels)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreesClassifier < RandomForestClassifier
+      # Return the set of estimators.
+      # @return [Array<ExtraTreeClassifier>]
+      attr_reader :estimators
+
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new classifier with extremely randomized trees.
+      #
+      # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
+      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding the splitting point.
+      def initialize(n_estimators: 10,
+                     criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [ExtraTreesClassifier] The learned classifier itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
+        # Initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        # Construct trees.
+        @estimators = Array.new(@params[:n_estimators]) do
+          tree = Tree::ExtraTreeClassifier.new(
+            criterion: @params[:criterion], max_depth: @params[:max_depth],
+            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+            max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+          )
+          tree.fit(x, y)
+          @feature_importances += tree.feature_importances
+          tree
+        end
+        @feature_importances /= @feature_importances.sum
+        self
+      end
+
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for each sample.
+      def apply(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreesClassifier.
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+    end
+  end
+end
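Beyond the fit/predict example in the class documentation, a short hypothetical session with the classifier added above (the data is random noise, so the outputs are only illustrative):

x = Numo::DFloat.new(200, 6).rand
y = Numo::Int32.cast(Array.new(200) { rand(2) })

clf = Rumale::Ensemble::ExtraTreesClassifier.new(n_estimators: 20, random_seed: 1)
clf.fit(x, y)
clf.predict_proba(x[0...5, true]) # per-class probabilities for five samples
clf.feature_importances.sum       # ~1.0: importances are normalized in #fit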
data/lib/rumale/ensemble/extra_trees_regressor.rb ADDED
@@ -0,0 +1,121 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/extra_tree_regressor'
+require 'rumale/ensemble/random_forest_regressor'
+
+module Rumale
+  module Ensemble
+    # ExtraTreesRegressor is a class that implements extremely randomized trees for regression.
+    # The algorithm of extremely randomized trees is similar to random forest.
+    # Unlike random forest, extremely randomized trees do not apply the bagging procedure
+    # and randomly select the threshold for splitting the feature space.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Ensemble::ExtraTreesRegressor.new(
+    #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreesRegressor < RandomForestRegressor
+      # Return the set of estimators.
+      # @return [Array<ExtraTreeRegressor>]
+      attr_reader :estimators
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new regressor with extremely randomized trees.
+      #
+      # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
+      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding the splitting point.
+      def initialize(n_estimators: 10,
+                     criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [ExtraTreesRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        # Initialize some variables.
+        n_features = x.shape[1]
+        @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        # Construct forest.
+        @estimators = Array.new(@params[:n_estimators]) do
+          tree = Tree::ExtraTreeRegressor.new(
+            criterion: @params[:criterion], max_depth: @params[:max_depth],
+            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+            max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+          )
+          tree.fit(x, y)
+          @feature_importances += tree.feature_importances
+          tree
+        end
+        @feature_importances /= @feature_importances.sum
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for each sample.
+      def apply(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreesRegressor.
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+    end
+  end
+end
data/lib/rumale/tree/decision_tree_classifier.rb CHANGED
@@ -155,9 +155,9 @@ module Rumale
     def best_split(features, y, whole_impurity)
       order = features.sort_index
       sorted_f = features[order].to_a
-      sorted_y = y[order, true].to_a.flatten
+      sorted_y = y[order, 0].to_a
       n_classes = @classes.size
-      find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, sorted_f.uniq, n_classes)
+      find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, n_classes)
     end

     def impurity(y)
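Both best_split implementations lean on Numo's sort_index to keep labels aligned with the ascending feature values before handing flat Ruby arrays to the C extension. A small illustrative snippet (not from the gem; 1-D labels for simplicity):

require 'numo/narray'

features = Numo::DFloat[3.0, 1.0, 2.0]
labels   = Numo::Int32[0, 1, 1]

order = features.sort_index      # indices that sort the features ascending
sorted_f = features[order].to_a  # => [1.0, 2.0, 3.0]
sorted_y = labels[order].to_a    # => [1, 1, 0], aligned with sorted_f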
data/lib/rumale/tree/decision_tree_regressor.rb CHANGED
@@ -129,7 +129,7 @@ module Rumale
       order = features.sort_index
       sorted_f = features[order].to_a
       sorted_y = y[order, true].to_a
-      find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, sorted_f.uniq)
+      find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y)
     end

     def impurity(y)
data/lib/rumale/tree/extra_tree_classifier.rb ADDED
@@ -0,0 +1,119 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/decision_tree_classifier'
+
+module Rumale
+  module Tree
+    # ExtraTreeClassifier is a class that implements extra randomized tree for classification.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Tree::ExtraTreeClassifier.new(
+    #       criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_labels)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreeClassifier < DecisionTreeClassifier
+      # Return the class labels.
+      # @return [Numo::Int32] (size: n_classes)
+      attr_reader :classes
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the learned tree.
+      # @return [Node]
+      attr_reader :tree
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Return the labels assigned to each leaf.
+      # @return [Numo::Int32] (size: n_leafs)
+      attr_reader :leaf_labels
+
+      # Create a new classifier with the extra randomized tree algorithm.
+      #
+      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding the splitting point.
+      def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                     random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                              min_samples_leaf: min_samples_leaf, max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @return [ExtraTreeClassifier] The learned classifier itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
+        super
+      end
+
+      # Predict class labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreeClassifier.
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+
+      private
+
+      def best_split(features, y, whole_impurity)
+        threshold = @rng.rand(features.min..features.max)
+        l_ids = features.le(threshold).where
+        r_ids = features.gt(threshold).where
+        l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
+        r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
+        gain = whole_impurity -
+               l_impurity * l_ids.size.fdiv(y.shape[0]) -
+               r_impurity * r_ids.size.fdiv(y.shape[0])
+        [l_impurity, r_impurity, threshold, gain]
+      end
+    end
+  end
+end
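The best_split above is what makes the tree "extra" random: instead of scanning every candidate threshold, it draws one uniformly between the feature's minimum and maximum and scores only that split. A standalone sketch of the idea (random_split is a hypothetical helper, not the gem's internals):

require 'numo/narray'

# Partition the samples of one feature column at a single random cut point.
def random_split(features, rng)
  threshold = rng.rand(features.min..features.max)
  left  = features.le(threshold).where # indices going to the left child
  right = features.gt(threshold).where # indices going to the right child
  [threshold, left, right]
end

rng = Random.new(1)
f = Numo::DFloat[0.2, 0.8, 0.5, 0.9]
threshold, left, right = random_split(f, rng)
# The tree then keeps whichever feature/threshold pair yields the
# largest impurity gain among the randomly drawn candidates.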
data/lib/rumale/tree/extra_tree_regressor.rb ADDED
@@ -0,0 +1,106 @@
+# frozen_string_literal: true
+
+require 'rumale/tree/decision_tree_regressor'
+
+module Rumale
+  module Tree
+    # ExtraTreeRegressor is a class that implements extra randomized tree for regression.
+    #
+    # @example
+    #   estimator =
+    #     Rumale::Tree::ExtraTreeRegressor.new(
+    #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+    class ExtraTreeRegressor < DecisionTreeRegressor
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the learned tree.
+      # @return [Node]
+      attr_reader :tree
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Return the values assigned to each leaf.
+      # @return [Numo::DFloat] (shape: [n_leafs, n_outputs])
+      attr_reader :leaf_values
+
+      # Create a new regressor with the extra randomized tree algorithm.
+      #
+      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, extra tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding the splitting point.
+      def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                     random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                              min_samples_leaf: min_samples_leaf, max_features: max_features)
+        super
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [ExtraTreeRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        super
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        super
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about ExtraTreeRegressor.
+      def marshal_dump
+        super
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        super
+      end
+
+      private
+
+      def best_split(features, y, whole_impurity)
+        threshold = @rng.rand(features.min..features.max)
+        l_ids = features.le(threshold).where
+        r_ids = features.gt(threshold).where
+        l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
+        r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
+        gain = whole_impurity -
+               l_impurity * l_ids.size.fdiv(y.shape[0]) -
+               r_impurity * r_ids.size.fdiv(y.shape[0])
+        [l_impurity, r_impurity, threshold, gain]
+      end
+    end
+  end
+end
data/lib/rumale/version.rb CHANGED
@@ -3,5 +3,5 @@
 # Rumale is a machine learning library in Ruby.
 module Rumale
   # The version of Rumale you are using.
-  VERSION = '0.9.0'
+  VERSION = '0.9.1'
 end
data/lib/rumale.rb CHANGED
@@ -45,10 +45,14 @@ require 'rumale/tree/node'
 require 'rumale/tree/base_decision_tree'
 require 'rumale/tree/decision_tree_classifier'
 require 'rumale/tree/decision_tree_regressor'
+require 'rumale/tree/extra_tree_classifier'
+require 'rumale/tree/extra_tree_regressor'
 require 'rumale/ensemble/ada_boost_classifier'
 require 'rumale/ensemble/ada_boost_regressor'
 require 'rumale/ensemble/random_forest_classifier'
 require 'rumale/ensemble/random_forest_regressor'
+require 'rumale/ensemble/extra_trees_classifier'
+require 'rumale/ensemble/extra_trees_regressor'
 require 'rumale/clustering/k_means'
 require 'rumale/clustering/dbscan'
 require 'rumale/decomposition/pca'
data/rumale.gemspec CHANGED
@@ -17,7 +17,7 @@ Rumale is a machine learning library in Ruby.
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 Rumale currently supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor algorithm,
+Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
 K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
 MSG
 spec.homepage = 'https://github.com/yoshoku/rumale'
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 0.9.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-04-22 00:00:00.000000000 Z
+date: 2019-05-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -99,7 +99,7 @@ description: |
   Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
   Rumale currently supports Linear / Kernel Support Vector Machine,
   Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-  Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor algorithm,
+  Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
   K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
 email:
 - yoshoku@outlook.com
@@ -140,6 +140,8 @@ files:
 - lib/rumale/decomposition/pca.rb
 - lib/rumale/ensemble/ada_boost_classifier.rb
 - lib/rumale/ensemble/ada_boost_regressor.rb
+- lib/rumale/ensemble/extra_trees_classifier.rb
+- lib/rumale/ensemble/extra_trees_regressor.rb
 - lib/rumale/ensemble/random_forest_classifier.rb
 - lib/rumale/ensemble/random_forest_regressor.rb
 - lib/rumale/evaluation_measure/accuracy.rb
@@ -199,6 +201,8 @@ files:
 - lib/rumale/tree/base_decision_tree.rb
 - lib/rumale/tree/decision_tree_classifier.rb
 - lib/rumale/tree/decision_tree_regressor.rb
+- lib/rumale/tree/extra_tree_classifier.rb
+- lib/rumale/tree/extra_tree_regressor.rb
 - lib/rumale/tree/node.rb
 - lib/rumale/utils.rb
 - lib/rumale/validation.rb