RubyGems - rumale - Versions diffs - 0.8.4 → 0.9.0 - Mend

rumale 0.8.4 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

checksums.yaml +4 -4
data/.gitignore +1 -0
data/CHANGELOG.md +5 -0
data/Rakefile +9 -1
data/ext/rumale/extconf.rb +3 -0
data/ext/rumale/rumale.c +418 -0
data/ext/rumale/rumale.h +9 -0
data/lib/rumale.rb +2 -0
data/lib/rumale/tree/base_decision_tree.rb +10 -18
data/lib/rumale/tree/decision_tree_classifier.rb +15 -9
data/lib/rumale/tree/decision_tree_regressor.rb +13 -8
data/lib/rumale/version.rb +1 -1
data/rumale.gemspec +2 -0
metadata +21 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8e895ca9462569e1ec2b3b9c1cb985aebd7d4b19
-  data.tar.gz: 9374969bd955cada6ba54ec6edf1d7bc174a5102
+  metadata.gz: a2dfbc60c9d47e741fc91497f8c58ade390e6c8f
+  data.tar.gz: d4cbc26e0d81fbe0de5e83d785cc836e9a5b2099
 SHA512:
-  metadata.gz: 7e9eadf3404e74ee887007a1fb1df4e2b933f10394c1a61afdd8400492afcc4fbb40479c453983d2349db5d5c7cd52ab280a9ef28aa26d91b19cb41253bdb233
-  data.tar.gz: 674bf164a3f1be2971fa2f882999ed34a22773d15674d6c2c24da66af888be855ffd3daa1d66a08ee7ec4bc99c87e864ffe61ebb7d6356dfae78a0a20e97c3a9
+  metadata.gz: 7f2b4b8ba5d7511215a2e850add19f0942cbff4157a8373eba1950c0eac9fcd0e44925d3a88b2a709c0308ef4c03cca44c501b710f4a22dc4dd573e6866d94dc
+  data.tar.gz: 4630710eef59af88274e9a411a6ad12de7e4a616280f8fc94d185e24c7bc667bf8c1f662425c64cf05f6ec9accd914ac32e1039688d09629b920329ad85354c8

data/.gitignore CHANGED

@@ -12,6 +12,7 @@
 .rspec_status
 *.swp
+*.bundle
 .DS_Store
 .ruby-version
 /spec/dump_dbl.t

data/CHANGELOG.md CHANGED

@@ -1,3 +1,8 @@
+# 0.9.0
+## Breaking changes
+- Decide to introduce Ruby extensions for improving performance.
+- Fix to find split point on decision tree estimators using extension modules.
 # 0.8.4
 - Remove unused parameter on Nadam.
 - Fix condition to stop growing tree about decision tree.

data/Rakefile CHANGED

@@ -3,4 +3,12 @@ require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new(:spec)
-task :default => :spec
+require 'rake/extensiontask'
+task :build => :compile
+Rake::ExtensionTask.new('rumale') do |ext|
+  ext.lib_dir = 'lib/rumale'
+end
+task :default => [:clobber, :compile, :spec]

data/ext/rumale/extconf.rb ADDED

@@ -0,0 +1,3 @@
+require 'mkmf'
+create_makefile('rumale/rumale')

data/ext/rumale/rumale.c ADDED

@@ -0,0 +1,418 @@
+#include "rumale.h"
+VALUE
+create_zero_vector(const long n_dimensions)
+{
+  long i;
+  VALUE vec = rb_ary_new2(n_dimensions);
+  for (i = 0; i < n_dimensions; i++) {
+    rb_ary_store(vec, i, DBL2NUM(0));
+  }
+  return vec;
+}
+double
+calc_gini_coef(VALUE histogram, const long n_elements)
+{
+  long i;
+  double el;
+  double gini = 0.0;
+  const long n_classes = RARRAY_LEN(histogram);
+  for (i = 0; i < n_classes; i++) {
+    el = NUM2DBL(rb_ary_entry(histogram, i)) / n_elements;
+    gini += el * el;
+  }
+  return 1.0 - gini;
+}
+double
+calc_entropy(VALUE histogram, const long n_elements)
+{
+  long i;
+  double el;
+  double entropy = 0.0;
+  const long n_classes = RARRAY_LEN(histogram);
+  for (i = 0; i < n_classes; i++) {
+    el = NUM2DBL(rb_ary_entry(histogram, i)) / n_elements;
+    entropy += el * log(el + 1.0);
+  }
+  return -entropy;
+}
+VALUE
+calc_mean_vec(VALUE sum_vec, const long n_elements)
+{
+  long i;
+  const long n_dimensions = RARRAY_LEN(sum_vec);
+  VALUE mean_vec = rb_ary_new2(n_dimensions);
+  for (i = 0; i < n_dimensions; i++) {
+    rb_ary_store(mean_vec, i, DBL2NUM(NUM2DBL(rb_ary_entry(sum_vec, i)) / n_elements));
+  }
+  return mean_vec;
+}
+double
+calc_vec_mae(VALUE vec_a, VALUE vec_b)
+{
+  long i;
+  const long n_dimensions = RARRAY_LEN(vec_a);
+  double sum = 0.0;
+  double diff;
+  for (i = 0; i < n_dimensions; i++) {
+    diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i));
+    sum += fabs(diff);
+  }
+  return sum / n_dimensions;
+}
+double
+calc_vec_mse(VALUE vec_a, VALUE vec_b)
+{
+  long i;
+  const long n_dimensions = RARRAY_LEN(vec_a);
+  double sum = 0.0;
+  double diff;
+  for (i = 0; i < n_dimensions; i++) {
+    diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i));
+    sum += diff * diff;
+  }
+  return sum / n_dimensions;
+}
+double
+calc_mae(VALUE target_vecs, VALUE sum_vec)
+{
+  long i;
+  const long n_elements = RARRAY_LEN(target_vecs);
+  double sum = 0.0;
+  VALUE mean_vec = calc_mean_vec(sum_vec, n_elements);
+  for (i = 0; i < n_elements; i++) {
+    sum += calc_vec_mae(rb_ary_entry(target_vecs, i), mean_vec);
+  }
+  return sum / n_elements;
+}
+double
+calc_mse(VALUE target_vecs, VALUE sum_vec)
+{
+  long i;
+  const long n_elements = RARRAY_LEN(target_vecs);
+  double sum = 0.0;
+  VALUE mean_vec = calc_mean_vec(sum_vec, n_elements);
+  for (i = 0; i < n_elements; i++) {
+    sum += calc_vec_mse(rb_ary_entry(target_vecs, i), mean_vec);
+  }
+  return sum / n_elements;
+}
+double
+calc_impurity_cls(VALUE criterion, VALUE histogram, const long n_elements)
+{
+  if (strcmp(StringValuePtr(criterion), "entropy") == 0) {
+    return calc_entropy(histogram, n_elements);
+  }
+  return calc_gini_coef(histogram, n_elements);
+}
+double
+calc_impurity_reg(VALUE criterion, VALUE target_vecs, VALUE sum_vec)
+{
+  if (strcmp(StringValuePtr(criterion), "mae") == 0) {
+    return calc_mae(target_vecs, sum_vec);
+  }
+  return calc_mse(target_vecs, sum_vec);
+}
+void
+increment_histogram(VALUE histogram, const long bin_id)
+{
+  const double updated = NUM2DBL(rb_ary_entry(histogram, bin_id)) + 1;
+  rb_ary_store(histogram, bin_id, DBL2NUM(updated));
+}
+void
+decrement_histogram(VALUE histogram, const long bin_id)
+{
+  const double updated = NUM2DBL(rb_ary_entry(histogram, bin_id)) - 1;
+  rb_ary_store(histogram, bin_id, DBL2NUM(updated));
+}
+void
+add_sum_vec(VALUE sum_vec, VALUE target)
+{
+  long i;
+  const long n_dimensions = RARRAY_LEN(sum_vec);
+  double el;
+  for (i = 0; i < n_dimensions; i++) {
+    el = NUM2DBL(rb_ary_entry(sum_vec, i)) + NUM2DBL(rb_ary_entry(target, i));
+    rb_ary_store(sum_vec, i, DBL2NUM(el));
+  }
+}
+void
+sub_sum_vec(VALUE sum_vec, VALUE target)
+{
+  long i;
+  const long n_dimensions = RARRAY_LEN(sum_vec);
+  double el;
+  for (i = 0; i < n_dimensions; i++) {
+    el = NUM2DBL(rb_ary_entry(sum_vec, i)) - NUM2DBL(rb_ary_entry(target, i));
+    rb_ary_store(sum_vec, i, DBL2NUM(el));
+  }
+}
+/**
+ * @!visibility private
+ * Find for split point with maximum information gain.
+ *
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels, uniqed_features, n_classes) -> Array<Float>
+ *
+ * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'gini' and 'entropy'.
+ * @param impurity [Float] The impurity of whole dataset.
+ * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
+ * @param sorted_labels [Numo::Int32] (shape: [n_labels]) The labels sorted according to feature values.
+ * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
+ * @param n_classes [Integer] The number of classes.
+ * @return [Float] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
+ */
+static VALUE
+find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE uniqed_f, VALUE n_classes_)
+{
+  long i;
+  long curr_pos;
+  long next_pos;
+  long n_l_elements;
+  long n_r_elements;
+  const long n_classes = NUM2LONG(n_classes_);
+  const long n_elements = RARRAY_LEN(sorted_f);
+  const long n_uniq_elements = RARRAY_LEN(uniqed_f);
+  const double w_impurity = NUM2DBL(whole_impurity);
+  double l_impurity;
+  double r_impurity;
+  double gain;
+  double curr_el;
+  double next_el;
+  VALUE l_histogram = create_zero_vector(n_classes);
+  VALUE r_histogram = create_zero_vector(n_classes);
+  VALUE opt_params = rb_ary_new2(4);
+  /* Initialize optimal parameters. */
+  rb_ary_store(opt_params, 0, DBL2NUM(0));                /* left impurity */
+  rb_ary_store(opt_params, 1, DBL2NUM(w_impurity));       /* right impurity */
+  rb_ary_store(opt_params, 2, rb_ary_entry(uniqed_f, 0)); /* threshold */
+  rb_ary_store(opt_params, 3, DBL2NUM(0));                /* gain */
+  /* Initialize child node variables. */
+  n_l_elements = 0;
+  n_r_elements = n_elements;
+  for (i = 0; i < n_elements; i++) {
+    increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, i)));
+  }
+  /* Find optimal parameters. */
+  for (curr_pos = 0, next_pos = 0; curr_pos < n_uniq_elements - 1; curr_pos++) {
+    /* Find new split point. */
+    curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+    next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
+    while (next_pos < n_elements && next_el <= curr_el) {
+      increment_histogram(l_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
+      n_l_elements++;
+      decrement_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
+      n_r_elements--;
+      next_el = NUM2DBL(rb_ary_entry(sorted_f, ++next_pos));
+    }
+    /* Calculate gain of new split. */
+    l_impurity = calc_impurity_cls(criterion, l_histogram, n_l_elements);
+    r_impurity = calc_impurity_cls(criterion, r_histogram, n_r_elements);
+    gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
+    /* Update optimal parameters. */
+    if (gain > NUM2DBL(rb_ary_entry(opt_params, 3))) {
+      rb_ary_store(opt_params, 0, DBL2NUM(l_impurity));
+      rb_ary_store(opt_params, 1, DBL2NUM(r_impurity));
+      rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
+      rb_ary_store(opt_params, 3, DBL2NUM(gain));
+    }
+  }
+  return opt_params;
+}
+/**
+ * @!visibility private
+ * Find for split point with maximum information gain.
+ *
+ * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets, uniqed_features) -> Array<Float>
+ *
+ * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'.
+ * @param impurity [Float] The impurity of whole dataset.
+ * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
+ * @param sorted_targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values sorted according to feature values.
+ * @param uniqed_features [Numo::DFloat] (shape: [n_uniqed_features]) The unique feature values.
+ * @return [Float] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
+ */
+static VALUE
+find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE uniqed_f)
+{
+  long i;
+  long curr_pos;
+  long next_pos;
+  long n_l_elements;
+  long n_r_elements;
+  const long n_elements = RARRAY_LEN(sorted_f);
+  const long n_uniq_elements = RARRAY_LEN(uniqed_f);
+  const long n_dimensions = RARRAY_LEN(rb_ary_entry(sorted_y, 0));
+  const double w_impurity = NUM2DBL(whole_impurity);
+  double l_impurity;
+  double r_impurity;
+  double gain;
+  double curr_el;
+  double next_el;
+  VALUE l_sum_vec = create_zero_vector(n_dimensions);
+  VALUE r_sum_vec = create_zero_vector(n_dimensions);
+  VALUE l_target_vecs = rb_ary_new();
+  VALUE r_target_vecs = rb_ary_new();
+  VALUE target;
+  VALUE opt_params = rb_ary_new2(4);
+  /* Initialize optimal parameters. */
+  rb_ary_store(opt_params, 0, DBL2NUM(0));                /* left impurity */
+  rb_ary_store(opt_params, 1, DBL2NUM(w_impurity));       /* right impurity */
+  rb_ary_store(opt_params, 2, rb_ary_entry(uniqed_f, 0)); /* threshold */
+  rb_ary_store(opt_params, 3, DBL2NUM(0));                /* gain */
+  /* Initialize child node variables. */
+  n_l_elements = 0;
+  n_r_elements = n_elements;
+  for (i = 0; i < n_elements; i++) {
+    target = rb_ary_entry(sorted_y, i);
+    add_sum_vec(r_sum_vec, target);
+    rb_ary_push(r_target_vecs, target);
+  }
+  /* Find optimal parameters. */
+  for (curr_pos = 0, next_pos = 0; curr_pos < n_uniq_elements - 1; curr_pos++) {
+    /* Find new split point. */
+    curr_el = NUM2DBL(rb_ary_entry(uniqed_f, curr_pos));
+    next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
+    while (next_pos < n_elements && next_el <= curr_el) {
+      target = rb_ary_entry(sorted_y, next_pos);
+      add_sum_vec(l_sum_vec, target);
+      rb_ary_push(l_target_vecs, target);
+      n_l_elements++;
+      sub_sum_vec(r_sum_vec, target);
+      rb_ary_shift(r_target_vecs);
+      n_r_elements--;
+      next_el = NUM2DBL(rb_ary_entry(sorted_f, ++next_pos));
+    }
+    /* Calculate gain of new split. */
+    l_impurity = calc_impurity_reg(criterion, l_target_vecs, l_sum_vec);
+    r_impurity = calc_impurity_reg(criterion, r_target_vecs, r_sum_vec);
+    gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
+    /* Update optimal parameters. */
+    if (gain > NUM2DBL(rb_ary_entry(opt_params, 3))) {
+      rb_ary_store(opt_params, 0, DBL2NUM(l_impurity));
+      rb_ary_store(opt_params, 1, DBL2NUM(r_impurity));
+      rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
+      rb_ary_store(opt_params, 3, DBL2NUM(gain));
+    }
+  }
+  return opt_params;
+}
+/**
+ * @!visibility private
+ * Calculate impurity based on criterion.
+ *
+ * @overload node_impurity(criterion, y, n_classes) -> Float
+ *
+ * @param criterion [String] The function to calculate impurity. Supported criteria are 'gini' and 'entropy'.
+ * @param y [Numo::Int32] (shape: [n_samples]) The labels.
+ * @param n_classes [Integer] The number of classes.
+ * @return [Float] impurity
+ */
+static VALUE
+node_impurity_cls(VALUE self, VALUE criterion, VALUE y, VALUE n_classes)
+{
+  long i;
+  const long n_elements = RARRAY_LEN(y);
+  VALUE histogram = create_zero_vector(NUM2LONG(n_classes));
+  for (i = 0; i < n_elements; i++) {
+    increment_histogram(histogram, NUM2LONG(rb_ary_entry(y, i)));
+  }
+  return DBL2NUM(calc_impurity_cls(criterion, histogram, n_elements));
+}
+/**
+ * @!visibility private
+ * Calculate impurity based on criterion.
+ *
+ * @overload node_impurity(criterion, y) -> Float
+ *
+ * @param criterion [String] The function to calculate impurity. Supported criteria are 'mae' and 'mse'.
+ * @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The taget values.
+ * @return [Float] impurity
+ */
+static VALUE
+node_impurity_reg(VALUE self, VALUE criterion, VALUE y)
+{
+  long i;
+  const long n_elements = RARRAY_LEN(y);
+  const long n_dimensions = RARRAY_LEN(rb_ary_entry(y, 0));
+  VALUE sum_vec = create_zero_vector(n_dimensions);
+  VALUE target_vecs = rb_ary_new();
+  VALUE target;
+  for (i = 0; i < n_elements; i++) {
+    target = rb_ary_entry(y, i);
+    add_sum_vec(sum_vec, target);
+    rb_ary_push(target_vecs, target);
+  }
+  return DBL2NUM(calc_impurity_reg(criterion, target_vecs, sum_vec));
+}
+void Init_rumale(void)
+{
+  VALUE mRumale = rb_define_module("Rumale");
+  VALUE mTree = rb_define_module_under(mRumale, "Tree");
+  /**
+   * Document-module: Rumale::Tree::ExtDecisionTreeClassifier
+   * @!visibility private
+   * The mixin module consisting of extension method for DecisionTreeClassifier class.
+   * This module is used internally.
+   */
+  VALUE mExtDTreeCls = rb_define_module_under(mTree, "ExtDecisionTreeClassifier");
+  /**
+   * Document-module: Rumale::Tree::ExtDecisionTreeRegressor
+   * @!visibility private
+   * The mixin module consisting of extension method for DecisionTreeRegressor class.
+   * This module is used internally.
+   */
+  VALUE mExtDTreeReg = rb_define_module_under(mTree, "ExtDecisionTreeRegressor");
+  rb_define_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
+  rb_define_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
+  rb_define_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
+  rb_define_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
+}

data/ext/rumale/rumale.h ADDED

@@ -0,0 +1,9 @@
+#ifndef RUMALE_H
+#define RUMALE_H 1
+#include <math.h>
+#include <string.h>
+#include "ruby.h"
+#endif /* RUMALE_H */

data/lib/rumale.rb CHANGED

@@ -2,6 +2,8 @@
 require 'numo/narray'
+require 'rumale/rumale'
 require 'rumale/version'
 require 'rumale/validation'
 require 'rumale/values'

data/lib/rumale/tree/base_decision_tree.rb CHANGED

@@ -86,13 +86,16 @@ module Rumale
         return put_leaf(node, y) if stop_growing?(y)
         # calculate optimal parameters.
-        feature_id, threshold, left_ids, right_ids, left_impurity, right_impurity, gain =
-          rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y, whole_impurity)] }.max_by(&:last)
+        feature_id, left_ids, right_ids, left_imp, right_imp, threshold, gain = rand_ids(n_features).map do |fid|
+          ft = x[true, fid]
+          limp, rimp, th, ga = best_split(ft, y, whole_impurity)
+          [fid, ft.le(th).where, ft.gt(th).where, limp, rimp, th, ga]
+        end.max_by(&:last)
         return put_leaf(node, y) if gain.nil? || gain.zero?
-        node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_impurity)
-        node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_impurity)
+        node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_imp)
+        node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_imp)
         return put_leaf(node, y) if node.left.nil? && node.right.nil?
@@ -114,22 +117,11 @@ module Rumale
         [*0...n].sample(@params[:max_features], random: @rng)
       end
-      def best_split(features, targets, whole_impurity)
-        n_samples = targets.shape[0]
-        features.to_a.uniq.sort.each_cons(2).map do |l, r|
-          threshold = 0.5 * (l + r)
-          left_ids = features.le(threshold).where
-          right_ids = features.gt(threshold).where
-          left_impurity = impurity(targets[left_ids, true])
-          right_impurity = impurity(targets[right_ids, true])
-          gain = whole_impurity -
-                 left_impurity * left_ids.size.fdiv(n_samples) -
-                 right_impurity * right_ids.size.fdiv(n_samples)
-          [threshold, left_ids, right_ids, left_impurity, right_impurity, gain]
-        end.max_by(&:last)
+      def best_split(_features, _y, _impurity)
+        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
       end
-      def impurity(_targets)
+      def impurity(_y)
         raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
       end

data/lib/rumale/tree/decision_tree_classifier.rb CHANGED

@@ -1,5 +1,6 @@
 # frozen_string_literal: true
+require 'rumale/rumale'
 require 'rumale/tree/base_decision_tree'
 require 'rumale/base/classifier'
@@ -16,6 +17,7 @@ module Rumale
     #
     class DecisionTreeClassifier < BaseDecisionTree
       include Base::Classifier
+      include ExtDecisionTreeClassifier
       # Return the class labels.
       # @return [Numo::Int32] (size: n_classes)
@@ -39,7 +41,7 @@ module Rumale
       # Create a new classifier with decision tree algorithm.
       #
-      # @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
+      # @param criterion [String] The function to evaluate spliting point. Supported criteria are 'gini' and 'entropy'.
       # @param max_depth [Integer] The maximum depth of the tree.
       #   If nil is given, decision tree grows without concern for depth.
       # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
@@ -89,7 +91,7 @@ module Rumale
       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
       def predict(x)
         check_sample_array(x)
-        @leaf_labels[apply(x)]
+        @leaf_labels[apply(x)].dup
       end
       # Predict probability for samples.
@@ -138,7 +140,7 @@ module Rumale
       end
       def stop_growing?(y)
-        y.flatten.to_a.uniq.size == 1
+        y[true, 0].to_a.uniq.size == 1
       end
       def put_leaf(node, y)
@@ -150,13 +152,17 @@ module Rumale
         node
       end
+      def best_split(features, y, whole_impurity)
+        order = features.sort_index
+        sorted_f = features[order].to_a
+        sorted_y = y[order, true].to_a.flatten
+        n_classes = @classes.size
+        find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, sorted_f.uniq, n_classes)
+      end
       def impurity(y)
-        posterior_probs = y.flatten.bincount / y.size.to_f
-        if @params[:criterion] == 'entropy'
-          -(posterior_probs * Numo::NMath.log(posterior_probs + 1)).sum
-        else
-          1.0 - (posterior_probs * posterior_probs).sum
-        end
+        n_classes = @classes.size
+        node_impurity(@params[:criterion], y[true, 0].to_a, n_classes)
       end
     end
   end

data/lib/rumale/tree/decision_tree_regressor.rb CHANGED

@@ -1,5 +1,6 @@
 # frozen_string_literal: true
+require 'rumale/rumale'
 require 'rumale/tree/base_decision_tree'
 require 'rumale/base/regressor'
@@ -16,6 +17,7 @@ module Rumale
     #
     class DecisionTreeRegressor < BaseDecisionTree
       include Base::Regressor
+      include ExtDecisionTreeRegressor
       # Return the importance for each feature.
       # @return [Numo::DFloat] (size: n_features)
@@ -35,7 +37,7 @@ module Rumale
       # Create a new regressor with decision tree algorithm.
       #
-      # @param criterion [String] The function to evalue spliting point. Supported criteria are 'mae' and 'mse'.
+      # @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'.
       # @param max_depth [Integer] The maximum depth of the tree.
       #   If nil is given, decision tree grows without concern for depth.
       # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
@@ -84,7 +86,7 @@ module Rumale
       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
       def predict(x)
         check_sample_array(x)
-        @leaf_values.shape[1].nil? ? @leaf_values[apply(x)] : @leaf_values[apply(x), true]
+        @leaf_values.shape[1].nil? ? @leaf_values[apply(x)].dup : @leaf_values[apply(x), true].dup
       end
       # Dump marshal data.
@@ -123,12 +125,15 @@ module Rumale
         node
       end
-      def impurity(values)
-        if @params[:criterion] == 'mae'
-          (values - values.mean(0)).abs.mean
-        else
-          ((values - values.mean(0))**2).mean
-        end
+      def best_split(features, y, whole_impurity)
+        order = features.sort_index
+        sorted_f = features[order].to_a
+        sorted_y = y[order, true].to_a
+        find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, sorted_f.uniq)
+      end
+      def impurity(y)
+        node_impurity(@params[:criterion], y.to_a)
       end
     end
   end

data/lib/rumale/version.rb CHANGED

@@ -3,5 +3,5 @@
 # Rumale is a machine learning library in Ruby.
 module Rumale
   # The version of Rumale you are using.
-  VERSION = '0.8.4'
+  VERSION = '0.9.0'
 end

data/rumale.gemspec CHANGED

@@ -29,6 +29,7 @@ MSG
   spec.bindir        = 'exe'
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ['lib']
+  spec.extensions    = ['ext/rumale/extconf.rb']
   spec.required_ruby_version = '>= 2.3'
@@ -37,5 +38,6 @@ MSG
   spec.add_development_dependency 'bundler', '>= 1.16'
   spec.add_development_dependency 'coveralls', '~> 0.8'
   spec.add_development_dependency 'rake', '~> 12.0'
+  spec.add_development_dependency 'rake-compiler'
   spec.add_development_dependency 'rspec', '~> 3.0'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.8.4
+  version: 0.9.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-04-20 00:00:00.000000000 Z
+date: 2019-04-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -66,6 +66,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '12.0'
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -90,7 +104,8 @@ description: |
 email:
 - yoshoku@outlook.com
 executables: []
-extensions: []
+extensions:
+- ext/rumale/extconf.rb
 extra_rdoc_files: []
 files:
 - ".coveralls.yml"
@@ -107,6 +122,9 @@ files:
 - Rakefile
 - bin/console
 - bin/setup
+- ext/rumale/extconf.rb
+- ext/rumale/rumale.c
+- ext/rumale/rumale.h
 - lib/rumale.rb
 - lib/rumale/base/base_estimator.rb
 - lib/rumale/base/classifier.rb