RubyGems - svmkit - Versions diffs - 0.6.1 → 0.6.2 - Mend

svmkit 0.6.1 → 0.6.2

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/.rubocop.yml +3 -0
data/HISTORY.md +4 -1
data/lib/svmkit/linear_model/sgd_linear_estimator.rb +1 -1
data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +1 -1
data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb +1 -1
data/lib/svmkit/tree/decision_tree_classifier.rb +31 -34
data/lib/svmkit/tree/decision_tree_regressor.rb +20 -21
data/lib/svmkit/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1d52bf496a762b096a5f6dbeec278a1cae8079b53d6c91cc13c07dca7a799fde
-  data.tar.gz: e5ca2fed307b82e88dfe816691a4715d62a3187c1cad71421a48bea65037b19c
+  metadata.gz: 9655d7d990f37468c79de9713e55f74c6134b00c1cda9471832097c678cb6ded
+  data.tar.gz: fb294e6256d16272e80c2ade2f5b223dd792b14c7fd39caf1de8a9500a3ab55e
 SHA512:
-  metadata.gz: 620370c119300f3f419550609444eba4aa34561a954e8ec26cf6a0d3522cd32cabf1f6875092de5ab0dd202ebf7b772c1d6d6421cd05d90cfeeeeadea3cd0565
-  data.tar.gz: a0d8b5a7b91c4a8e2ffb4312a8082096a2e4fbd411e37bd36752ac19a66f6b9accf22be15894d6a88421b7296bde90ead33b55604085d139e5faec64b97f0f55
+  metadata.gz: f7386071fe57df51bd8223d4945dc67069769464892bd64b972b5c1ba26cd206b7b67d50e600f34d79a3bff9f19803c0fdae06dd92fdf8f6ef87f1d5e982cf2d
+  data.tar.gz: 473a1233e0109672b80b8bf17933366276b0a81fab3c75699fb1a07f92923d66cc3f064e0a2df36e493bf1554aa4c34d567d60607d1802e10ab949eafe1187d3

data/.rubocop.yml CHANGED Viewed

@@ -42,3 +42,6 @@ Style/FormatStringToken:
 Style/NumericLiterals:
   Enabled: false
+Layout/EmptyLineAfterGuardClause:
+  Enabled: false

data/HISTORY.md CHANGED Viewed

@@ -1,7 +1,10 @@
+# 0.6.2
+- Refactor decision tree classes for improving performance.
 # 0.6.1
 - Add abstract class for linear estimator with stochastic gradient descent.
 - Refactor linear estimators to use linear esitmator abstract class.
-- Refactor decistion tree classes to avoid unneeded type conversion.
+- Refactor decision tree classes to avoid unneeded type conversion.
 # 0.6.0
 - Add class for Principal Component Analysis.

data/lib/svmkit/linear_model/sgd_linear_estimator.rb CHANGED Viewed

@@ -79,7 +79,7 @@ module SVMKit
       def split_weight(weight)
         if @params[:fit_bias]
-          [weight[0...-1], weight[-1]]
+          [weight[0...-1].dup, weight[-1]]
         else
           [weight, 0.0]
         end

data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb CHANGED Viewed

@@ -253,7 +253,7 @@ module SVMKit
       end
       def split_weight_vec_bias(weight_vec)
-        weights = weight_vec[0...-1]
+        weights = weight_vec[0...-1].dup
         bias = weight_vec[-1]
         [weights, bias]
       end

data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb CHANGED Viewed

@@ -185,7 +185,7 @@ module SVMKit
       end
       def split_weight_vec_bias(weight_vec)
-        weights = weight_vec[0...-1]
+        weights = weight_vec[0...-1].dup
         bias = weight_vec[-1]
         [weights, bias]
       end

data/lib/svmkit/tree/decision_tree_classifier.rb CHANGED Viewed

@@ -91,8 +91,9 @@ module SVMKit
         n_samples, n_features = x.shape
         @params[:max_features] = n_features if @params[:max_features].nil?
         @params[:max_features] = [@params[:max_features], n_features].min
-        @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
-        build_tree(x, y)
+        uniq_y = y.to_a.uniq.sort
+        @classes = Numo::Int32.asarray(uniq_y)
+        build_tree(x, y.map { |v| uniq_y.index(v) })
         eval_importance(n_samples, n_features)
         self
       end
@@ -174,36 +175,35 @@ module SVMKit
       def build_tree(x, y)
         @n_leaves = 0
         @leaf_labels = []
-        @tree = grow_node(0, x, y)
+        @tree = grow_node(0, x, y, impurity(y))
         @leaf_labels = Numo::Int32[*@leaf_labels]
         nil
       end
-      def grow_node(depth, x, y)
-        if @params[:max_leaf_nodes].is_a?(Integer)
+      def grow_node(depth, x, y, whole_impurity)
+        unless @params[:max_leaf_nodes].nil?
           return nil if @n_leaves >= @params[:max_leaf_nodes]
         end
         n_samples, n_features = x.shape
-        if @params[:min_samples_leaf].is_a?(Integer)
-          return nil if n_samples <= @params[:min_samples_leaf]
-        end
+        return nil if n_samples <= @params[:min_samples_leaf]
-        node = Node.new(depth: depth, impurity: impurity(y), n_samples: n_samples)
+        node = Node.new(depth: depth, impurity: whole_impurity, n_samples: n_samples)
         return put_leaf(node, y) if y.to_a.uniq.size == 1
-        if @params[:max_depth].is_a?(Integer)
+        unless @params[:max_depth].nil?
           return put_leaf(node, y) if depth == @params[:max_depth]
         end
-        feature_id, threshold, left_ids, right_ids, max_gain =
-          rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y)] }.max_by(&:last)
-        return put_leaf(node, y) if max_gain.nil?
-        return put_leaf(node, y) if max_gain.zero?
+        feature_id, threshold, left_ids, right_ids, left_impurity, right_impurity, gain =
+          rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y, whole_impurity)] }.max_by(&:last)
+        return put_leaf(node, y) if gain.nil? || gain.zero?
+        node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids], left_impurity)
+        node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids], right_impurity)
-        node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids])
-        node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids])
         return put_leaf(node, y) if node.left.nil? && node.right.nil?
         node.feature_id = feature_id
@@ -213,7 +213,7 @@ module SVMKit
       end
       def put_leaf(node, y)
-        node.probs = Numo::DFloat.cast(@classes.map { |c| y.eq(c).count_true }) / node.n_samples
+        node.probs = y.bincount(minlength: @classes.size) / node.n_samples.to_f
         node.leaf = true
         node.leaf_id = @n_leaves
         @n_leaves += 1
@@ -225,27 +225,23 @@ module SVMKit
         [*0...n].sample(@params[:max_features], random: @rng)
       end
-      def best_split(features, labels)
+      def best_split(features, labels, whole_impurity)
+        n_samples = labels.size
         features.to_a.uniq.sort.each_cons(2).map do |l, r|
           threshold = 0.5 * (l + r)
-          left_ids, right_ids = splited_ids(features, threshold)
-          [threshold, left_ids, right_ids, gain(labels, labels[left_ids], labels[right_ids])]
+          left_ids = features.le(threshold).where
+          right_ids = features.gt(threshold).where
+          left_impurity = impurity(labels[left_ids])
+          right_impurity = impurity(labels[right_ids])
+          gain = whole_impurity -
+                 left_impurity * left_ids.size.fdiv(n_samples) -
+                 right_impurity * right_ids.size.fdiv(n_samples)
+          [threshold, left_ids, right_ids, left_impurity, right_impurity, gain]
         end.max_by(&:last)
       end
-      def splited_ids(features, threshold)
-        [features.le(threshold).where, features.gt(threshold).where]
-      end
-      def gain(labels, labels_left, labels_right)
-        prob_left = labels_left.size.fdiv(labels.size)
-        prob_right = labels_right.size.fdiv(labels.size)
-        impurity(labels) - prob_left * impurity(labels_left) - prob_right * impurity(labels_right)
-      end
       def impurity(labels)
-        cls = labels.to_a.uniq.sort
-        cls.size == 1 ? 0.0 : send(@criterion, Numo::DFloat[*(cls.map { |c| labels.eq(c).count_true.fdiv(labels.size) })])
+        send(@criterion, labels.bincount / labels.size.to_f)
       end
       def gini(posterior_probs)
@@ -253,7 +249,7 @@ module SVMKit
       end
       def entropy(posterior_probs)
-        -(posterior_probs * Numo::NMath.log(posterior_probs)).sum
+        -(posterior_probs * Numo::NMath.log(posterior_probs + 1)).sum
       end
       def eval_importance(n_samples, n_features)
@@ -269,7 +265,8 @@ module SVMKit
         return nil if node.leaf
         return nil if node.left.nil? || node.right.nil?
         gain = node.n_samples * node.impurity -
-               node.left.n_samples * node.left.impurity - node.right.n_samples * node.right.impurity
+               node.left.n_samples * node.left.impurity -
+               node.right.n_samples * node.right.impurity
         @feature_importances[node.feature_id] += gain
         eval_importance_at_node(node.left)
         eval_importance_at_node(node.right)

data/lib/svmkit/tree/decision_tree_regressor.rb CHANGED Viewed

@@ -151,12 +151,12 @@ module SVMKit
       def build_tree(x, y)
         @n_leaves = 0
         @leaf_values = []
-        @tree = grow_node(0, x, y)
+        @tree = grow_node(0, x, y, impurity(y))
         @leaf_values = Numo::DFloat.cast(@leaf_values)
         nil
       end
-      def grow_node(depth, x, y)
+      def grow_node(depth, x, y, whole_impurity)
         unless @params[:max_leaf_nodes].nil?
           return nil if @n_leaves >= @params[:max_leaf_nodes]
         end
@@ -164,7 +164,7 @@ module SVMKit
         n_samples, n_features = x.shape
         return nil if n_samples <= @params[:min_samples_leaf]
-        node = Node.new(depth: depth, impurity: impurity(y), n_samples: n_samples)
+        node = Node.new(depth: depth, impurity: whole_impurity, n_samples: n_samples)
         return put_leaf(node, y) if (y - y.mean(0)).sum.abs.zero?
@@ -172,12 +172,14 @@ module SVMKit
           return put_leaf(node, y) if depth == @params[:max_depth]
         end
-        feature_id, threshold, left_ids, right_ids, max_gain =
-          rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y)] }.max_by(&:last)
-        return put_leaf(node, y) if max_gain.nil? || max_gain.zero?
+        feature_id, threshold, left_ids, right_ids, left_impurity, right_impurity, gain =
+          rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y, whole_impurity)] }.max_by(&:last)
+        return put_leaf(node, y) if gain.nil? || gain.zero?
+        node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_impurity)
+        node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_impurity)
-        node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true])
-        node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true])
         return put_leaf(node, y) if node.left.nil? && node.right.nil?
         node.feature_id = feature_id
@@ -199,24 +201,21 @@ module SVMKit
         [*0...n].sample(@params[:max_features], random: @rng)
       end
-      def best_split(features, values)
+      def best_split(features, values, whole_impurity)
+        n_samples = values.shape[0]
         features.to_a.uniq.sort.each_cons(2).map do |l, r|
           threshold = 0.5 * (l + r)
-          left_ids, right_ids = splited_ids(features, threshold)
-          [threshold, left_ids, right_ids, gain(values, values[left_ids], values[right_ids])]
+          left_ids = features.le(threshold).where
+          right_ids = features.gt(threshold).where
+          left_impurity = impurity(values[left_ids, true])
+          right_impurity = impurity(values[right_ids, true])
+          gain = whole_impurity -
+                 left_impurity * left_ids.size.fdiv(n_samples) -
+                 right_impurity * right_ids.size.fdiv(n_samples)
+          [threshold, left_ids, right_ids, left_impurity, right_impurity, gain]
         end.max_by(&:last)
       end
-      def splited_ids(features, threshold)
-        [features.le(threshold).where, features.gt(threshold).where]
-      end
-      def gain(values, values_left, values_right)
-        prob_left = values_left.shape[0].fdiv(values.shape[0])
-        prob_right = values_right.shape[0].fdiv(values.shape[0])
-        impurity(values) - prob_left * impurity(values_left) - prob_right * impurity(values_right)
-      end
       def impurity(values)
         send(@criterion, values)
       end

data/lib/svmkit/version.rb CHANGED Viewed

@@ -3,5 +3,5 @@
 # SVMKit is a machine learning library in Ruby.
 module SVMKit
   # @!visibility private
-  VERSION = '0.6.1'.freeze
+  VERSION = '0.6.2'.freeze
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: svmkit
 version: !ruby/object:Gem::Version
-  version: 0.6.1
+  version: 0.6.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-09-10 00:00:00.000000000 Z
+date: 2018-09-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray