rumale 0.9.0 → 0.9.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: a2dfbc60c9d47e741fc91497f8c58ade390e6c8f
- data.tar.gz: d4cbc26e0d81fbe0de5e83d785cc836e9a5b2099
+ metadata.gz: 48089085f7a6249801c36408822454d4e0b293fb
+ data.tar.gz: c069743334925f090699ca30da72b35c8e70f5f2
  SHA512:
- metadata.gz: 7f2b4b8ba5d7511215a2e850add19f0942cbff4157a8373eba1950c0eac9fcd0e44925d3a88b2a709c0308ef4c03cca44c501b710f4a22dc4dd573e6866d94dc
- data.tar.gz: 4630710eef59af88274e9a411a6ad12de7e4a616280f8fc94d185e24c7bc667bf8c1f662425c64cf05f6ec9accd914ac32e1039688d09629b920329ad85354c8
+ metadata.gz: d95950b1d358be77f93b6d4e0593355fd043a1abe712763b9613b57a87a83e627d41c978c1a236ce94c9b259bc533a03b471fe7630f862f94bd7aeea8c77377e
+ data.tar.gz: 307713e776a611ed05c0a21630c69de8abb12717f97a1c452bdba4bfe177dbe10c3b73dc20b64e236c42a1402875678cada4c736058555755586833ebb460c71
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
+ # 0.9.1
+ - Add class for Extra-Trees classifier.
+ - Add class for Extra-Trees regressor.
+ - Refactor extension modules of decision tree estimators for improving performance.
+
  # 0.9.0
  ## Breaking changes
  - Decide to introduce Ruby extensions for improving performance.
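The two new changelog entries correspond to the `Rumale::Ensemble::ExtraTreesClassifier` and `Rumale::Ensemble::ExtraTreesRegressor` classes added further down in this diff. A minimal usage sketch, assuming synthetic Numo data (the dataset and shapes here are illustrative, not from the diff):

```ruby
require 'rumale'
require 'numo/narray'

# Synthetic data: 100 samples, 4 features; targets depend on the first feature.
x = Numo::DFloat.new(100, 4).rand
labels = Numo::Int32.cast(x[true, 0].gt(0.5).to_a)
values = x[true, 0] * 2.0

cls = Rumale::Ensemble::ExtraTreesClassifier.new(n_estimators: 10, random_seed: 1)
p cls.fit(x, labels).predict(x[0...3, true]).to_a

reg = Rumale::Ensemble::ExtraTreesRegressor.new(n_estimators: 10, random_seed: 1)
p reg.fit(x, values).predict(x[0...3, true]).to_a
```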
data/README.md CHANGED
@@ -12,7 +12,7 @@ Rumale (**Ru**by **ma**chine **le**arning) is a machine learninig library in Rub
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
  Rumale supports Linear / Kernel Support Vector Machine,
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor classifier,
+ Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor classifier,
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.

  This project was formerly known as "SVMKit".
data/ext/rumale/rumale.c CHANGED
@@ -183,33 +183,32 @@ sub_sum_vec(VALUE sum_vec, VALUE target)
  * @!visibility private
  * Find the split point with maximum information gain.
  *
- * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels, uniqed_features, n_classes) -> Array<Float>
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels, n_classes) -> Array<Float>
  *
  * @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'gini' and 'entropy'.
  * @param impurity [Float] The impurity of the whole dataset.
  * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
  * @param sorted_labels [Numo::Int32] (shape: [n_samples]) The labels sorted according to feature values.
- * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
  * @param n_classes [Integer] The number of classes.
  * @return [Array<Float>] The array of optimal parameters: impurities of the child nodes, threshold, and gain.
  */
  static VALUE
- find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE uniqed_f, VALUE n_classes_)
+ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE n_classes_)
  {
- long i;
- long curr_pos;
- long next_pos;
- long n_l_elements;
- long n_r_elements;
  const long n_classes = NUM2LONG(n_classes_);
  const long n_elements = RARRAY_LEN(sorted_f);
- const long n_uniq_elements = RARRAY_LEN(uniqed_f);
  const double w_impurity = NUM2DBL(whole_impurity);
+ long iter = 0;
+ long curr_pos = 0;
+ long next_pos = 0;
+ long n_l_elements = 0;
+ long n_r_elements = n_elements;
+ double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+ double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+ double next_el;
  double l_impurity;
  double r_impurity;
  double gain;
- double curr_el;
- double next_el;
  VALUE l_histogram = create_zero_vector(n_classes);
  VALUE r_histogram = create_zero_vector(n_classes);
  VALUE opt_params = rb_ary_new2(4);
@@ -217,22 +216,18 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  /* Initialize optimal parameters. */
  rb_ary_store(opt_params, 0, DBL2NUM(0)); /* left impurity */
  rb_ary_store(opt_params, 1, DBL2NUM(w_impurity)); /* right impurity */
- rb_ary_store(opt_params, 2, rb_ary_entry(uniqed_f, 0)); /* threshold */
+ rb_ary_store(opt_params, 2, rb_ary_entry(sorted_f, 0)); /* threshold */
  rb_ary_store(opt_params, 3, DBL2NUM(0)); /* gain */

  /* Initialize child node variables. */
- n_l_elements = 0;
- n_r_elements = n_elements;
- for (i = 0; i < n_elements; i++) {
- increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, i)));
+ for (iter = 0; iter < n_elements; iter++) {
+ increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, iter)));
  }

  /* Find optimal parameters. */
- for (curr_pos = 0, next_pos = 0; curr_pos < n_uniq_elements - 1; curr_pos++) {
- /* Find new split point. */
- curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+ while (curr_pos < n_elements && curr_el != last_el) {
  next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
- while (next_pos < n_elements && next_el <= curr_el) {
+ while (next_pos < n_elements && next_el == curr_el) {
  increment_histogram(l_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
  n_l_elements++;
  decrement_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
@@ -250,6 +245,9 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
  rb_ary_store(opt_params, 3, DBL2NUM(gain));
  }
+ if (next_pos == n_elements) break;
+ curr_pos = next_pos;
+ curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
  }

  return opt_params;
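The refactor above replaces the old per-unique-value loop (which rescanned the sorted array with a `<=` comparison for every entry of `uniqed_f`) with a single pass over the already-sorted feature values. An illustrative Ruby re-expression of the new loop, not the extension code itself (the `gini` helper is a stand-in; labels are assumed encoded as 0...n_classes):

```ruby
def find_split_params_sketch(sorted_f, sorted_y, n_classes, w_impurity)
  n = sorted_f.size
  l_hist = Array.new(n_classes, 0)
  r_hist = Array.new(n_classes, 0)
  sorted_y.each { |c| r_hist[c] += 1 } # everything starts on the right side
  best = [0.0, w_impurity, sorted_f[0], 0.0] # [l_impurity, r_impurity, threshold, gain]
  n_l = 0
  n_r = n
  curr_pos = 0
  curr_el = sorted_f[0]
  last_el = sorted_f[-1]
  while curr_pos < n && curr_el != last_el
    # Move the whole run of values equal to curr_el from the right side to the left.
    next_pos = curr_pos
    while next_pos < n && sorted_f[next_pos] == curr_el
      c = sorted_y[next_pos]
      l_hist[c] += 1
      r_hist[c] -= 1
      n_l += 1
      n_r -= 1
      next_pos += 1
    end
    next_el = sorted_f[next_pos] # first value strictly greater than curr_el
    l_imp = gini(l_hist, n_l)
    r_imp = gini(r_hist, n_r)
    gain = w_impurity - l_imp * n_l.fdiv(n) - r_imp * n_r.fdiv(n)
    # Candidate threshold is the midpoint between adjacent distinct values.
    best = [l_imp, r_imp, 0.5 * (curr_el + next_el), gain] if gain > best[3]
    curr_pos = next_pos
    curr_el = next_el
  end
  best
end

# Stand-in impurity helper (not the extension's implementation).
def gini(hist, n)
  n.zero? ? 0.0 : 1.0 - hist.sum { |c| c.fdiv(n)**2 }
end

p find_split_params_sketch([0.1, 0.1, 0.5, 0.9], [0, 0, 1, 1], 2, 0.5)
# => [0.0, 0.0, 0.3, 0.5] — a perfect split between 0.1 and 0.5
```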
@@ -259,32 +257,31 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  * @!visibility private
  * Find the split point with maximum information gain.
  *
- * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets, uniqed_features) -> Array<Float>
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets) -> Array<Float>
  *
  * @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
  * @param impurity [Float] The impurity of the whole dataset.
  * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
  * @param sorted_targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values sorted according to feature values.
- * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
  * @return [Array<Float>] The array of optimal parameters: impurities of the child nodes, threshold, and gain.
  */
  static VALUE
- find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE uniqed_f)
+ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y)
  {
- long i;
- long curr_pos;
- long next_pos;
- long n_l_elements;
- long n_r_elements;
  const long n_elements = RARRAY_LEN(sorted_f);
- const long n_uniq_elements = RARRAY_LEN(uniqed_f);
  const long n_dimensions = RARRAY_LEN(rb_ary_entry(sorted_y, 0));
  const double w_impurity = NUM2DBL(whole_impurity);
+ long iter = 0;
+ long curr_pos = 0;
+ long next_pos = 0;
+ long n_l_elements = 0;
+ long n_r_elements = n_elements;
+ double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
+ double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+ double next_el;
  double l_impurity;
  double r_impurity;
  double gain;
- double curr_el;
- double next_el;
  VALUE l_sum_vec = create_zero_vector(n_dimensions);
  VALUE r_sum_vec = create_zero_vector(n_dimensions);
  VALUE l_target_vecs = rb_ary_new();
@@ -295,24 +292,20 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  /* Initialize optimal parameters. */
  rb_ary_store(opt_params, 0, DBL2NUM(0)); /* left impurity */
  rb_ary_store(opt_params, 1, DBL2NUM(w_impurity)); /* right impurity */
- rb_ary_store(opt_params, 2, rb_ary_entry(uniqed_f, 0)); /* threshold */
+ rb_ary_store(opt_params, 2, rb_ary_entry(sorted_f, 0)); /* threshold */
  rb_ary_store(opt_params, 3, DBL2NUM(0)); /* gain */

  /* Initialize child node variables. */
- n_l_elements = 0;
- n_r_elements = n_elements;
- for (i = 0; i < n_elements; i++) {
- target = rb_ary_entry(sorted_y, i);
+ for (iter = 0; iter < n_elements; iter++) {
+ target = rb_ary_entry(sorted_y, iter);
  add_sum_vec(r_sum_vec, target);
  rb_ary_push(r_target_vecs, target);
  }

  /* Find optimal parameters. */
- for (curr_pos = 0, next_pos = 0; curr_pos < n_uniq_elements - 1; curr_pos++) {
- /* Find new split point. */
- curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+ while (curr_pos < n_elements && curr_el != last_el) {
  next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
- while (next_pos < n_elements && next_el <= curr_el) {
+ while (next_pos < n_elements && next_el == curr_el) {
  target = rb_ary_entry(sorted_y, next_pos);
  add_sum_vec(l_sum_vec, target);
  rb_ary_push(l_target_vecs, target);
@@ -333,6 +326,9 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
  rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
  rb_ary_store(opt_params, 3, DBL2NUM(gain));
  }
+ if (next_pos == n_elements) break;
+ curr_pos = next_pos;
+ curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
  }

  return opt_params;
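The regression variant follows the same single-pass pattern as the classifier; only the per-side state differs: running sums of the (possibly multi-output) targets (`l_sum_vec`/`r_sum_vec`) plus the target vectors themselves, from which the child-node impurity is recomputed at each candidate threshold. A stand-in MSE helper sketching that computation (one plausible multi-output definition, assumed rather than taken from the extension):

```ruby
# Mean over samples of the mean squared deviation from the per-output mean,
# using the running sum vector the scan loop already maintains.
def mse_impurity(target_vecs, sum_vec)
  return 0.0 if target_vecs.empty?
  n = target_vecs.size
  mean = sum_vec.map { |s| s.fdiv(n) }
  target_vecs.sum { |t| t.each_index.sum { |d| (t[d] - mean[d])**2 }.fdiv(t.size) }.fdiv(n)
end

p mse_impurity([[1.0], [3.0]], [4.0]) # => 1.0 (variance of {1, 3})
```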
@@ -411,8 +407,8 @@ void Init_rumale(void)
  */
  VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");

- rb_define_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
- rb_define_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
- rb_define_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
- rb_define_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
+ rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 5);
+ rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 4);
+ rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
+ rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
  }
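A self-contained Ruby analogy for the switch to `rb_define_private_method` (illustrative only; `SplitFinder` and `SomeTree` are hypothetical names): a private method defined in a mixed-in module can still be called receiver-less from inside the including class, which matches how `best_split` invokes `find_split_params` in the Ruby hunks further down, so the extension methods no longer need to be public.

```ruby
module SplitFinder
  private

  # Stand-in for the C-implemented find_split_params.
  def find_split_params(*args)
    args
  end
end

class SomeTree
  include SplitFinder

  def best_split(features)
    find_split_params('gini', features) # receiver-less call to a private mix-in method
  end
end

p SomeTree.new.best_split([0.1, 0.2]) # => ["gini", [0.1, 0.2]]
# SomeTree.new.find_split_params('gini') would raise NoMethodError (private method)
```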
data/lib/rumale/ensemble/extra_trees_classifier.rb ADDED
@@ -0,0 +1,135 @@
+ # frozen_string_literal: true
+
+ require 'rumale/tree/extra_tree_classifier'
+ require 'rumale/ensemble/random_forest_classifier'
+
+ module Rumale
+   module Ensemble
+     # ExtraTreesClassifier is a class that implements extremely randomized trees for classification.
+     # The algorithm of extremely randomized trees is similar to random forest.
+     # The features of the algorithm of extremely randomized trees are
+     # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Ensemble::ExtraTreesClassifier.new(
+     #       n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreesClassifier < RandomForestClassifier
+       # Return the set of estimators.
+       # @return [Array<ExtraTreeClassifier>]
+       attr_reader :estimators
+
+       # Return the class labels.
+       # @return [Numo::Int32] (size: n_classes)
+       attr_reader :classes
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new classifier with extremely randomized trees.
+       #
+       # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+       #   If nil is given, number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching optimal split point.
+       #   If nil is given, split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                  max_features: max_features, random_seed: random_seed)
+         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                               max_features: max_features)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [ExtraTreesClassifier] The learned classifier itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_label_array(y)
+         check_sample_label_size(x, y)
+         # Initialize some variables.
+         n_features = x.shape[1]
+         @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         # Construct trees.
+         @estimators = Array.new(@params[:n_estimators]) do
+           tree = Tree::ExtraTreeClassifier.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+           )
+           tree.fit(x, y)
+           @feature_importances += tree.feature_importances
+           tree
+         end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+       def predict(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Predict probability for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+       def predict_proba(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+       def apply(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about ExtraTreesClassifier.
+       def marshal_dump
+         super
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         super
+       end
+     end
+   end
+ end
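As the class comment states, the extra-trees ensemble skips bagging: `fit` above hands the full `(x, y)` to every tree, so per-tree diversity comes only from the randomized thresholds and feature subsampling. A short side-by-side sketch on assumed synthetic data (`score` is the accuracy method from the classifier base module):

```ruby
require 'rumale'
require 'numo/narray'

x = Numo::DFloat.new(200, 4).rand
y = Numo::Int32.cast(x[true, 0].gt(0.5).to_a)

rf = Rumale::Ensemble::RandomForestClassifier.new(n_estimators: 20, random_seed: 1)
et = Rumale::Ensemble::ExtraTreesClassifier.new(n_estimators: 20, random_seed: 1)
puts "random forest: #{rf.fit(x, y).score(x, y)}"
puts "extra-trees:   #{et.fit(x, y).score(x, y)}"
```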
data/lib/rumale/ensemble/extra_trees_regressor.rb ADDED
@@ -0,0 +1,121 @@
+ # frozen_string_literal: true
+
+ require 'rumale/tree/extra_tree_regressor'
+ require 'rumale/ensemble/random_forest_regressor'
+
+ module Rumale
+   module Ensemble
+     # ExtraTreesRegressor is a class that implements extremely randomized trees for regression.
+     # The algorithm of extremely randomized trees is similar to random forest.
+     # The features of the algorithm of extremely randomized trees are
+     # not to apply the bagging procedure and to randomly select the threshold for splitting feature space.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Ensemble::ExtraTreesRegressor.new(
+     #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreesRegressor < RandomForestRegressor
+       # Return the set of estimators.
+       # @return [Array<ExtraTreeRegressor>]
+       attr_reader :estimators
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Create a new regressor with extremely randomized trees.
+       #
+       # @param n_estimators [Integer] The number of trees for constructing extremely randomized trees.
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+       #   If nil is given, number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching optimal split point.
+       #   If nil is given, split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding splitting point.
+       def initialize(n_estimators: 10,
+                      criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                      max_features: nil, random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                  max_features: max_features, random_seed: random_seed)
+         check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                               max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                               max_features: max_features)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+       # @return [ExtraTreesRegressor] The learned regressor itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_tvalue_array(y)
+         check_sample_tvalue_size(x, y)
+         # Initialize some variables.
+         n_features = x.shape[1]
+         @params[:max_features] = Math.sqrt(n_features).to_i unless @params[:max_features].is_a?(Integer)
+         @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+         @feature_importances = Numo::DFloat.zeros(n_features)
+         # Construct forest.
+         @estimators = Array.new(@params[:n_estimators]) do
+           tree = Tree::ExtraTreeRegressor.new(
+             criterion: @params[:criterion], max_depth: @params[:max_depth],
+             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+             max_features: @params[:max_features], random_seed: @rng.rand(Rumale::Values.int_max)
+           )
+           tree.fit(x, y)
+           @feature_importances += tree.feature_importances
+           tree
+         end
+         @feature_importances /= @feature_importances.sum
+         self
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+       def predict(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Return the index of the leaf that each sample reached.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
+       # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+       def apply(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about ExtraTreesRegressor.
+       def marshal_dump
+         super
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         super
+       end
+     end
+   end
+ end
data/lib/rumale/tree/decision_tree_classifier.rb CHANGED
@@ -155,9 +155,9 @@ module Rumale
  def best_split(features, y, whole_impurity)
    order = features.sort_index
    sorted_f = features[order].to_a
-   sorted_y = y[order, true].to_a.flatten
+   sorted_y = y[order, 0].to_a
    n_classes = @classes.size
-   find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, sorted_f.uniq, n_classes)
+   find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, n_classes)
  end

  def impurity(y)
data/lib/rumale/tree/decision_tree_regressor.rb CHANGED
@@ -129,7 +129,7 @@ module Rumale
    order = features.sort_index
    sorted_f = features[order].to_a
    sorted_y = y[order, true].to_a
-   find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, sorted_f.uniq)
+   find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y)
  end

  def impurity(y)
data/lib/rumale/tree/extra_tree_classifier.rb ADDED
@@ -0,0 +1,119 @@
+ # frozen_string_literal: true
+
+ require 'rumale/tree/decision_tree_classifier'
+
+ module Rumale
+   module Tree
+     # ExtraTreeClassifier is a class that implements extra randomized tree for classification.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Tree::ExtraTreeClassifier.new(
+     #       criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_labels)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreeClassifier < DecisionTreeClassifier
+       # Return the class labels.
+       # @return [Numo::Int32] (size: n_classes)
+       attr_reader :classes
+
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the learned tree.
+       # @return [Node]
+       attr_reader :tree
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Return the labels assigned each leaf.
+       # @return [Numo::Int32] (size: n_leafs)
+       attr_reader :leaf_labels
+
+       # Create a new classifier with extra randomized tree algorithm.
+       #
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+       #   If nil is given, number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching optimal split point.
+       #   If nil is given, split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding splitting point.
+       def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                      random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                  max_features: max_features, random_seed: random_seed)
+         check_params_integer(min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                               min_samples_leaf: min_samples_leaf, max_features: max_features)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+       # @return [ExtraTreeClassifier] The learned classifier itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_label_array(y)
+         check_sample_label_size(x, y)
+         super
+       end
+
+       # Predict class labels for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
+       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+       def predict(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Predict probability for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
+       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+       def predict_proba(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about ExtraTreeClassifier.
+       def marshal_dump
+         super
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         super
+       end
+
+       private
+
+       def best_split(features, y, whole_impurity)
+         threshold = @rng.rand(features.min..features.max)
+         l_ids = features.le(threshold).where
+         r_ids = features.gt(threshold).where
+         l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
+         r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
+         gain = whole_impurity -
+                l_impurity * l_ids.size.fdiv(y.shape[0]) -
+                r_impurity * r_ids.size.fdiv(y.shape[0])
+         [l_impurity, r_impurity, threshold, gain]
+       end
+     end
+   end
+ end
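The `best_split` override above is the heart of the extra-tree algorithm: rather than scanning every candidate threshold the way the parent DecisionTreeClassifier does, it draws one threshold uniformly from [min, max] of the sampled feature and scores only that cut. A toy illustration with an assumed tiny dataset (requires numo-narray):

```ruby
require 'numo/narray'

features = Numo::DFloat[0.1, 0.4, 0.5, 0.9]
rng = Random.new(1)
threshold = rng.rand(features.min..features.max) # one uniformly drawn cut point
l_ids = features.le(threshold).where             # indices going to the left child
r_ids = features.gt(threshold).where             # indices going to the right child
puts "threshold=#{threshold.round(3)}, left=#{l_ids.to_a}, right=#{r_ids.to_a}"
```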
data/lib/rumale/tree/extra_tree_regressor.rb ADDED
@@ -0,0 +1,106 @@
+ # frozen_string_literal: true
+
+ require 'rumale/tree/decision_tree_regressor'
+
+ module Rumale
+   module Tree
+     # ExtraTreeRegressor is a class that implements extra randomized tree for regression.
+     #
+     # @example
+     #   estimator =
+     #     Rumale::Tree::ExtraTreeRegressor.new(
+     #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+     #   estimator.fit(training_samples, training_values)
+     #   results = estimator.predict(testing_samples)
+     #
+     # *Reference*
+     # - P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees," Machine Learning, vol. 63 (1), pp. 3--42, 2006.
+     class ExtraTreeRegressor < DecisionTreeRegressor
+       # Return the importance for each feature.
+       # @return [Numo::DFloat] (size: n_features)
+       attr_reader :feature_importances
+
+       # Return the learned tree.
+       # @return [Node]
+       attr_reader :tree
+
+       # Return the random generator for random selection of feature index.
+       # @return [Random]
+       attr_reader :rng
+
+       # Return the values assigned each leaf.
+       # @return [Numo::DFloat] (shape: [n_leafs, n_outputs])
+       attr_reader :leaf_values
+
+       # Create a new regressor with extra randomized tree algorithm.
+       #
+       # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
+       # @param max_depth [Integer] The maximum depth of the tree.
+       #   If nil is given, extra tree grows without concern for depth.
+       # @param max_leaf_nodes [Integer] The maximum number of leaves on extra tree.
+       #   If nil is given, number of leaves is not limited.
+       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+       # @param max_features [Integer] The number of features to consider when searching optimal split point.
+       #   If nil is given, split process considers all features.
+       # @param random_seed [Integer] The seed value used to initialize the random generator.
+       #   It is used to randomly determine the order of features when deciding splitting point.
+       def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                      random_seed: nil)
+         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                  max_features: max_features, random_seed: random_seed)
+         check_params_integer(min_samples_leaf: min_samples_leaf)
+         check_params_string(criterion: criterion)
+         check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                               min_samples_leaf: min_samples_leaf, max_features: max_features)
+         super
+       end
+
+       # Fit the model with given training data.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+       # @return [ExtraTreeRegressor] The learned regressor itself.
+       def fit(x, y)
+         check_sample_array(x)
+         check_tvalue_array(y)
+         check_sample_tvalue_size(x, y)
+         super
+       end
+
+       # Predict values for samples.
+       #
+       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+       def predict(x)
+         check_sample_array(x)
+         super
+       end
+
+       # Dump marshal data.
+       # @return [Hash] The marshal data about ExtraTreeRegressor.
+       def marshal_dump
+         super
+       end
+
+       # Load marshal data.
+       # @return [nil]
+       def marshal_load(obj)
+         super
+       end
+
+       private
+
+       def best_split(features, y, whole_impurity)
+         threshold = @rng.rand(features.min..features.max)
+         l_ids = features.le(threshold).where
+         r_ids = features.gt(threshold).where
+         l_impurity = l_ids.size > 0 ? impurity(y[l_ids, true]) : 0.0
+         r_impurity = r_ids.size > 0 ? impurity(y[r_ids, true]) : 0.0
+         gain = whole_impurity -
+                l_impurity * l_ids.size.fdiv(y.shape[0]) -
+                r_impurity * r_ids.size.fdiv(y.shape[0])
+         [l_impurity, r_impurity, threshold, gain]
+       end
+     end
+   end
+ end
data/lib/rumale/version.rb CHANGED
@@ -3,5 +3,5 @@
  # Rumale is a machine learning library in Ruby.
  module Rumale
    # The version of Rumale you are using.
-   VERSION = '0.9.0'
+   VERSION = '0.9.1'
  end
data/lib/rumale.rb CHANGED
@@ -45,10 +45,14 @@ require 'rumale/tree/node'
  require 'rumale/tree/base_decision_tree'
  require 'rumale/tree/decision_tree_classifier'
  require 'rumale/tree/decision_tree_regressor'
+ require 'rumale/tree/extra_tree_classifier'
+ require 'rumale/tree/extra_tree_regressor'
  require 'rumale/ensemble/ada_boost_classifier'
  require 'rumale/ensemble/ada_boost_regressor'
  require 'rumale/ensemble/random_forest_classifier'
  require 'rumale/ensemble/random_forest_regressor'
+ require 'rumale/ensemble/extra_trees_classifier'
+ require 'rumale/ensemble/extra_trees_regressor'
  require 'rumale/clustering/k_means'
  require 'rumale/clustering/dbscan'
  require 'rumale/decomposition/pca'
data/rumale.gemspec CHANGED
@@ -17,7 +17,7 @@ Rumale is a machine learninig library in Ruby.
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
  Rumale currently supports Linear / Kernel Support Vector Machine,
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor algorithm,
+ Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
  MSG
  spec.homepage = 'https://github.com/yoshoku/rumale'
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: rumale
  version: !ruby/object:Gem::Version
- version: 0.9.0
+ version: 0.9.1
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-04-22 00:00:00.000000000 Z
+ date: 2019-05-01 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: numo-narray
@@ -99,7 +99,7 @@ description: |
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
  Rumale currently supports Linear / Kernel Support Vector Machine,
  Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
- Naive Bayes, Decision Tree, AdaBoost, Random Forest, K-nearest neighbor algorithm,
+ Naive Bayes, Decision Tree, AdaBoost, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
  K-Means, DBSCAN, Principal Component Analysis, and Non-negative Matrix Factorization.
  email:
  - yoshoku@outlook.com
@@ -140,6 +140,8 @@ files:
  - lib/rumale/decomposition/pca.rb
  - lib/rumale/ensemble/ada_boost_classifier.rb
  - lib/rumale/ensemble/ada_boost_regressor.rb
+ - lib/rumale/ensemble/extra_trees_classifier.rb
+ - lib/rumale/ensemble/extra_trees_regressor.rb
  - lib/rumale/ensemble/random_forest_classifier.rb
  - lib/rumale/ensemble/random_forest_regressor.rb
  - lib/rumale/evaluation_measure/accuracy.rb
@@ -199,6 +201,8 @@ files:
  - lib/rumale/tree/base_decision_tree.rb
  - lib/rumale/tree/decision_tree_classifier.rb
  - lib/rumale/tree/decision_tree_regressor.rb
+ - lib/rumale/tree/extra_tree_classifier.rb
+ - lib/rumale/tree/extra_tree_regressor.rb
  - lib/rumale/tree/node.rb
  - lib/rumale/utils.rb
  - lib/rumale/validation.rb