RubyGems - rumale - Versions diffs - 0.12.5 → 0.12.6 - Mend

rumale 0.12.5 → 0.12.6

Files changed (10)

checksums.yaml +4 -4
data/CHANGELOG.md +3 -0
data/README.md +1 -1
data/ext/rumale/extconf.rb +8 -0
data/ext/rumale/rumale.c +137 -82
data/ext/rumale/rumale.h +2 -0
data/lib/rumale/tree/decision_tree_classifier.rb +3 -4
data/lib/rumale/tree/gradient_tree_regressor.rb +2 -6
data/lib/rumale/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6e66626936daf06e8a4a8ce46aea1d58b309d884
-  data.tar.gz: 7332f667b41551e6b22597b5bd7beb2798c7537d
+  metadata.gz: f5ad94467c9d031744ab3cc7e0c29439a23ce562
+  data.tar.gz: 6b99e9c5846acef596ee6efef32fc355df73f856
 SHA512:
-  metadata.gz: 597906ee7bfa304ea80d15e49d1ce998d4c626183b4c9d5cd0b25ea1e29165cd7c294ef60f7ad818ca7e46c35d10979f7eb59f40ea23bf3e60a4d3f1b7ef26e3
-  data.tar.gz: f484aa119b4da49ed1eac171eff78459025ac2647fc71f647dc032b0efe9a07e4ec42f9b9562f0d2114b0b083749e3b1747f66625d2502d8978015824b0b25a4
+  metadata.gz: b962432af5544227a33dce685f002807855bb51559fbe874414e26f3b13df031886bf294769282b3971eee75d4ad6941a630b8a9b81a591975ad7977fc111e43
+  data.tar.gz: e2c5e90412c9dfb1cb2ab6cf9f46593a93902807cbffe5b3fff43074074d3cc77ce9a03d9be8dd7c2e447579fb231c0548f57796ee04b189758e8890cc8e2a8f

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,6 @@
+# 0.12.6
+- Fix extension code of the decision tree classifier and gradient tree regressor to use Numo::NArray.
 # 0.12.5
 - Fix random number generator initialization on gradient boosting estimators
 to obtain the same result with and without parallel option.

data/README.md CHANGED Viewed

@@ -6,7 +6,7 @@
 [![Coverage Status](https://coveralls.io/repos/github/yoshoku/rumale/badge.svg?branch=master)](https://coveralls.io/github/yoshoku/rumale?branch=master)
 [![Gem Version](https://badge.fury.io/rb/rumale.svg)](https://badge.fury.io/rb/rumale)
 [![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
-[![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.12.5)
+[![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.12.6)
 Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.

data/ext/rumale/extconf.rb CHANGED Viewed

@@ -1,5 +1,13 @@
 # frozen_string_literal: true
 require 'mkmf'
+require 'numo/narray'
+$LOAD_PATH.each do |lp|
+  if File.exist?(File.join(lp, 'numo/numo/narray.h'))
+    $INCFLAGS = "-I#{lp}/numo #{$INCFLAGS}"
+    break
+  end
+end
 create_makefile('rumale/rumale')

data/ext/rumale/rumale.c CHANGED Viewed

@@ -122,9 +122,9 @@ calc_mse(VALUE target_vecs, VALUE sum_vec)
 }
 double
-calc_impurity_cls(VALUE criterion, VALUE histogram, const long n_elements)
+calc_impurity_cls(const char* criterion, VALUE histogram, const long n_elements)
 {
-  if (strcmp(StringValuePtr(criterion), "entropy") == 0) {
+  if (strcmp(criterion, "entropy") == 0) {
     return calc_entropy(histogram, n_elements);
   }
   return calc_gini_coef(histogram, n_elements);
@@ -181,76 +181,107 @@ sub_sum_vec(VALUE sum_vec, VALUE target)
 /**
  * @!visibility private
- * Find for split point with maximum information gain.
- *
- * @overload find_split_params(criterion, impurity, sorted_features, sorted_labels, n_classes) -> Array<Float>
- *
- * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'gini' and 'entropy'.
- * @param impurity [Float] The impurity of whole dataset.
- * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
- * @param sorted_labels [Numo::Int32] (shape: [n_labels]) The labels sorted according to feature values.
- * @param n_classes [Integer] The number of classes.
- * @return [Float] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
  */
-static VALUE
-find_split_params_cls(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE sorted_f, VALUE sorted_y, VALUE n_classes_)
+typedef struct {
+  char* criterion;
+  long n_classes;
+  double impurity;
+} split_opts_cls;
+/**
+ * @!visibility private
+ */
+static void
+iter_find_split_params_cls(na_loop_t const* lp)
 {
-  const long n_classes = NUM2LONG(n_classes_);
-  const long n_elements = RARRAY_LEN(sorted_f);
-  const double w_impurity = NUM2DBL(whole_impurity);
-  long iter = 0;
+  const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
+  const double* f = (double*)NDL_PTR(lp, 1);
+  const int32_t* y = (int32_t*)NDL_PTR(lp, 2);
+  const long n_elements = NDL_SHAPE(lp, 0)[0];
+  const char* criterion = ((split_opts_cls*)lp->opt_ptr)->criterion;
+  const long n_classes = ((split_opts_cls*)lp->opt_ptr)->n_classes;
+  const double w_impurity = ((split_opts_cls*)lp->opt_ptr)->impurity;
+  double* params = (double*)NDL_PTR(lp, 3);
+  long i;
   long curr_pos = 0;
   long next_pos = 0;
   long n_l_elements = 0;
   long n_r_elements = n_elements;
-  double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
-  double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+  double curr_el = f[o[0]];
+  double last_el = f[o[n_elements - 1]];
   double next_el;
   double l_impurity;
   double r_impurity;
   double gain;
   VALUE l_histogram = create_zero_vector(n_classes);
   VALUE r_histogram = create_zero_vector(n_classes);
-  VALUE opt_params = rb_ary_new2(4);
   /* Initialize optimal parameters. */
-  rb_ary_store(opt_params, 0, DBL2NUM(0));                /* left impurity */
-  rb_ary_store(opt_params, 1, DBL2NUM(w_impurity));       /* right impurity */
-  rb_ary_store(opt_params, 2, rb_ary_entry(sorted_f, 0)); /* threshold */
-  rb_ary_store(opt_params, 3, DBL2NUM(0));                /* gain */
+  params[0] = 0.0;        /* left impurity */
+  params[1] = w_impurity; /* right impurity */
+  params[2] = curr_el;    /* threshold */
+  params[3] = 0.0;        /* gain */
   /* Initialize child node variables. */
-  for (iter = 0; iter < n_elements; iter++) {
-    increment_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, iter)));
+  for (i = 0; i < n_elements; i++) {
+    increment_histogram(r_histogram, y[o[i]]);
   }
   /* Find optimal parameters. */
   while (curr_pos < n_elements && curr_el != last_el) {
-    next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
+    next_el = f[o[next_pos]];
     while (next_pos < n_elements && next_el == curr_el) {
-      increment_histogram(l_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
+      increment_histogram(l_histogram, y[o[next_pos]]);
       n_l_elements++;
-      decrement_histogram(r_histogram, NUM2LONG(rb_ary_entry(sorted_y, next_pos)));
+      decrement_histogram(r_histogram, y[o[next_pos]]);
       n_r_elements--;
-      next_el = NUM2DBL(rb_ary_entry(sorted_f, ++next_pos));
+      next_pos++;
+      next_el = f[o[next_pos]];
     }
     /* Calculate gain of new split. */
     l_impurity = calc_impurity_cls(criterion, l_histogram, n_l_elements);
     r_impurity = calc_impurity_cls(criterion, r_histogram, n_r_elements);
     gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
     /* Update optimal parameters. */
-    if (gain > NUM2DBL(rb_ary_entry(opt_params, 3))) {
-      rb_ary_store(opt_params, 0, DBL2NUM(l_impurity));
-      rb_ary_store(opt_params, 1, DBL2NUM(r_impurity));
-      rb_ary_store(opt_params, 2, DBL2NUM(0.5 * (curr_el + next_el)));
-      rb_ary_store(opt_params, 3, DBL2NUM(gain));
+    if (gain > params[3]) {
+      params[0] = l_impurity;
+      params[1] = r_impurity;
+      params[2] = 0.5 * (curr_el + next_el);
+      params[3] = gain;
     }
     if (next_pos == n_elements) break;
     curr_pos = next_pos;
-    curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
+    curr_el = f[o[curr_pos]];
   }
-  return opt_params;
+}
+/**
+ * @!visibility private
+ * Find the split point with maximum information gain.
+ *
+ * @overload find_split_params(criterion, impurity, order, features, labels, n_classes) -> Array<Float>
+ *
+ * @param criterion [String] The function to evaluate splitting point. Supported criteria are 'gini' and 'entropy'.
+ * @param impurity [Float] The impurity of whole dataset.
+ * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values.
+ * @param features [Numo::DFloat] (shape: [n_elements]) The feature values.
+ * @param labels [Numo::Int32] (shape: [n_elements]) The labels.
+ * @param n_classes [Integer] The number of classes.
+ * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
+ */
+static VALUE
+find_split_params_cls(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE labels, VALUE n_classes)
+{
+  ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cInt32, 1} };
+  size_t out_shape[1] = { 4 };
+  ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
+  ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_cls, NO_LOOP, 3, 1, ain, aout };
+  split_opts_cls opts = { StringValuePtr(criterion), NUM2LONG(n_classes), NUM2DBL(impurity) };
+  VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, labels);
+  VALUE results = rb_ary_new2(4);
+  rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
+  rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
+  rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
+  rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
+  return results;
 }
 /**
@@ -336,50 +367,40 @@ find_split_params_reg(VALUE self, VALUE criterion, VALUE whole_impurity, VALUE s
 /**
  * @!visibility private
- * Find for split point with maximum information gain.
- *
- * @overload find_split_params(sorted_features, sorted_gradient, sorted_hessian, sum_gradient, sum_hessian) -> Array<Float>
- *
- * @param sorted_features [Array<Float>] (size: n_samples) The feature values sorted in ascending order.
- * @param sorted_targets [Array<Float>] (size: n_samples) The target values sorted according to feature values.
- * @param sorted_gradient [Array<Float>] (size: n_samples) The gradient values of loss function sorted according to feature values.
- * @param sorted_hessian [Array<Float>] (size: n_samples) The hessian values of loss function sorted according to feature values.
- * @param sum_gradient [Float] The sum of gradient values.
- * @param sum_hessian [Float] The sum of hessian values.
- * @param reg_lambda [Float] The L2 regularization term on weight.
- * @return [Array<Float>] The array consists of optimal parameters including threshold and gain.
  */
-static VALUE
-find_split_params_grad_reg
-(VALUE self, VALUE sorted_f, VALUE sorted_g, VALUE sorted_h, VALUE sum_g, VALUE sum_h, VALUE reg_l)
+static void
+iter_find_split_params_grad_reg(na_loop_t const* lp)
 {
-  const long n_elements = RARRAY_LEN(sorted_f);
-  const double s_grad = NUM2DBL(sum_g);
-  const double s_hess = NUM2DBL(sum_h);
-  const double reg_lambda = NUM2DBL(reg_l);
+  const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
+  const double* f = (double*)NDL_PTR(lp, 1);
+  const double* g = (double*)NDL_PTR(lp, 2);
+  const double* h = (double*)NDL_PTR(lp, 3);
+  const double s_grad = ((double*)lp->opt_ptr)[0];
+  const double s_hess = ((double*)lp->opt_ptr)[1];
+  const double reg_lambda = ((double*)lp->opt_ptr)[2];
+  const long n_elements = NDL_SHAPE(lp, 0)[0];
+  double* params = (double*)NDL_PTR(lp, 4);
   long curr_pos = 0;
   long next_pos = 0;
-  double last_el = NUM2DBL(rb_ary_entry(sorted_f, n_elements - 1));
-  double curr_el = NUM2DBL(rb_ary_entry(sorted_f, 0));
+  double curr_el = f[o[0]];
+  double last_el = f[o[n_elements - 1]];
   double next_el;
   double l_grad = 0.0;
   double l_hess = 0.0;
   double r_grad;
   double r_hess;
+  double threshold = curr_el;
+  double gain_max = 0.0;
   double gain;
-  VALUE opt_params = rb_ary_new2(2);
-  /* Initialize optimal parameters. */
-  rb_ary_store(opt_params, 0, rb_ary_entry(sorted_f, 0)); /* threshold */
-  rb_ary_store(opt_params, 1, DBL2NUM(0));                /* gain */
   /* Find optimal parameters. */
   while (curr_pos < n_elements && curr_el != last_el) {
-    next_el = NUM2DBL(rb_ary_entry(sorted_f, next_pos));
+    next_el = f[o[next_pos]];
     while (next_pos < n_elements && next_el == curr_el) {
-      l_grad += NUM2DBL(rb_ary_entry(sorted_g, next_pos));
-      l_hess += NUM2DBL(rb_ary_entry(sorted_h, next_pos));
-      next_el = NUM2DBL(rb_ary_entry(sorted_f, ++next_pos));
+      l_grad += g[o[next_pos]];
+      l_hess += h[o[next_pos]];
+      next_pos++;
+      next_el = f[o[next_pos]];
     }
     /* Calculate gain of new split. */
     r_grad = s_grad - l_grad;
@@ -388,16 +409,48 @@ find_split_params_grad_reg
            (r_grad * r_grad) / (r_hess + reg_lambda) -
            (s_grad * s_grad) / (s_hess + reg_lambda);
     /* Update optimal parameters. */
-    if (gain > NUM2DBL(rb_ary_entry(opt_params, 1))) {
-      rb_ary_store(opt_params, 0, DBL2NUM(0.5 * (curr_el + next_el)));
-      rb_ary_store(opt_params, 1, DBL2NUM(gain));
+    if (gain > gain_max) {
+      threshold = 0.5 * (curr_el + next_el);
+      gain_max = gain;
     }
     if (next_pos == n_elements) break;
     curr_pos = next_pos;
-    curr_el = NUM2DBL(rb_ary_entry(sorted_f, curr_pos));
+    curr_el = f[o[curr_pos]];
   }
-  return opt_params;
+  params[0] = threshold;
+  params[1] = gain_max;
+}
+/**
+ * @!visibility private
+ * Find the split point with maximum information gain.
+ *
+ * @overload find_split_params(order, features, gradients, hessians, sum_gradient, sum_hessian, reg_lambda) -> Array<Float>
+ *
+ * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values.
+ * @param features [Numo::DFloat] (shape: [n_elements]) The feature values.
+ * @param gradients [Numo::DFloat] (shape: [n_elements]) The gradient values.
+ * @param hessians [Numo::DFloat] (shape: [n_elements]) The hessian values.
+ * @param sum_gradient [Float] The sum of gradient values.
+ * @param sum_hessian [Float] The sum of hessian values.
+ * @param reg_lambda [Float] The L2 regularization term on weight.
+ * @return [Array<Float>] The array consists of optimal parameters including threshold and gain.
+ */
+static VALUE
+find_split_params_grad_reg
+(VALUE self, VALUE order, VALUE features, VALUE gradients, VALUE hessians, VALUE sum_gradient, VALUE sum_hessian, VALUE reg_lambda)
+{
+  ndfunc_arg_in_t ain[4] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1} };
+  size_t out_shape[1] = { 2 };
+  ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
+  ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_grad_reg, NO_LOOP, 4, 1, ain, aout };
+  double opts[3] = { NUM2DBL(sum_gradient), NUM2DBL(sum_hessian), NUM2DBL(reg_lambda) };
+  VALUE params = na_ndloop3(&ndf, opts, 4, order, features, gradients, hessians);
+  VALUE results = rb_ary_new2(2);
+  rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
+  rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
+  return results;
 }
 /**
@@ -407,22 +460,24 @@ find_split_params_grad_reg
  * @overload node_impurity(criterion, y, n_classes) -> Float
  *
  * @param criterion [String] The function to calculate impurity. Supported criteria are 'gini' and 'entropy'.
- * @param y [Numo::Int32] (shape: [n_samples]) The labels.
+ * @param y_nary [Numo::Int32] (shape: [n_samples]) The labels.
+ * @param n_elements_ [Integer] The number of elements.
  * @param n_classes [Integer] The number of classes.
  * @return [Float] impurity
  */
 static VALUE
-node_impurity_cls(VALUE self, VALUE criterion, VALUE y, VALUE n_classes)
+node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_, VALUE n_classes)
 {
   long i;
-  const long n_elements = RARRAY_LEN(y);
+  const long n_elements = NUM2LONG(n_elements_);
+  const int32_t* y = (int32_t*)na_get_pointer_for_read(y_nary);
   VALUE histogram = create_zero_vector(NUM2LONG(n_classes));
   for (i = 0; i < n_elements; i++) {
-    increment_histogram(histogram, NUM2LONG(rb_ary_entry(y, i)));
+    increment_histogram(histogram, y[i]);
   }
-  return DBL2NUM(calc_impurity_cls(criterion, histogram, n_elements));
+  return DBL2NUM(calc_impurity_cls(StringValuePtr(criterion), histogram, n_elements));
 }
 /**
@@ -480,9 +535,9 @@ void Init_rumale(void)
    */
   VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
-  rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 5);
+  rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
   rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 4);
-  rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 6);
-  rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 3);
+  rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 7);
+  rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 4);
   rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
 }

data/ext/rumale/rumale.h CHANGED Viewed

@@ -5,5 +5,7 @@
 #include <string.h>
 #include "ruby.h"
+#include "numo/narray.h"
+#include "numo/template.h"
 #endif /* RUMALE_H */

data/lib/rumale/tree/decision_tree_classifier.rb CHANGED Viewed

@@ -155,15 +155,14 @@ module Rumale
       def best_split(features, y, whole_impurity)
         order = features.sort_index
-        sorted_f = features[order].to_a
-        sorted_y = y[order, 0].to_a
         n_classes = @classes.size
-        find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y, n_classes)
+        find_split_params(@params[:criterion], whole_impurity, order, features, y[true, 0], n_classes)
       end
       def impurity(y)
+        n_elements = y.shape[0]
         n_classes = @classes.size
-        node_impurity(@params[:criterion], y[true, 0].to_a, n_classes)
+        node_impurity(@params[:criterion], y[true, 0].dup, n_elements, n_classes)
       end
     end
   end

data/lib/rumale/tree/gradient_tree_regressor.rb CHANGED Viewed

@@ -214,12 +214,8 @@ module Rumale
         node
       end
-      def best_split(features, g, h, sum_g, sum_h)
-        order = features.sort_index
-        sorted_f = features[order].to_a
-        sorted_g = g[order].to_a
-        sorted_h = h[order].to_a
-        find_split_params(sorted_f, sorted_g, sorted_h, sum_g, sum_h, @params[:reg_lambda])
+      def best_split(f, g, h, sum_g, sum_h)
+        find_split_params(f.sort_index, f, g, h, sum_g, sum_h, @params[:reg_lambda])
       end
       def rand_ids

data/lib/rumale/version.rb CHANGED Viewed

@@ -3,5 +3,5 @@
 # Rumale is a machine learning library in Ruby.
 module Rumale
   # The version of Rumale you are using.
-  VERSION = '0.12.5'
+  VERSION = '0.12.6'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.12.5
+  version: 0.12.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-07-02 00:00:00.000000000 Z
+date: 2019-07-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray