svmkit 0.3.1 → 0.3.2

This diff shows the changes between the publicly released versions of the svmkit gem as they appear in its public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 558b550a373cb5cbe7c295dc589c57b6b37a697b8309fa5497e5b0da6fd83336
-  data.tar.gz: ab8241d5e35446f1e7342a08fcb915bed0542a65fcaf4837c358d19699b791e9
+  metadata.gz: 93ce9c2e79ac158b4a3e988afc547b1891419eb6e6b1845156cf98eaa3cdd578
+  data.tar.gz: 4e677653deebd035cbdcd5c98529b7f4fee6804075ecab1113dbccc0bf9c65ed
 SHA512:
-  metadata.gz: a4739788d141bae29fdf1baba602ba76c51299cd8f8536e1a919084d94601b5ddeb02f9128b289965c4c821a925f063da0c2eec9b360dd5190ef9b9c9f2daae5
-  data.tar.gz: 1fdee6fec50ee3d995639d8c78f6e7c259456e1c0b1a916c40ec1e16ff578ecb63b4f7029029c129e6723c69bf19161e9e48589bd9954c8e8ab1ea90b777a870
+  metadata.gz: 7518039557e3c991c4a0cc112764198ed6340c8be1fa9c3fb746be21ffbb5518dd35651149cda5aba8ef52a36dfa6b17f47e1335893ae0cd1dfc5776a0e6bf8e
+  data.tar.gz: c062d9c2a7c04be82787a4d76a970855c2dc8ce0d4bf5531b6196b96873f4f8e679ee434af9a56c2ebf56ae9a8adb33387925050eb9b8964da613318f2a0e430
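
These checksums cover the two members of the `.gem` archive, so a downloaded copy can be verified before installation. A minimal sketch, assuming the gem was fetched locally (e.g. with `gem fetch svmkit -v 0.3.2`); `Gem::Package::TarReader` ships with RubyGems:

```ruby
require 'digest'
require 'rubygems/package'

File.open('svmkit-0.3.2.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      # Compare against the SHA256 entries in checksums.yaml above.
      puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
    end
  end
end
```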
data/HISTORY.md CHANGED
@@ -1,3 +1,11 @@
+# 0.3.2
+- Add class for Factorization Machine regressor.
+- Add class for Decision Tree regressor.
+- Add class for Random Forest regressor.
+- Fix to support loading and dumping libsvm file with multi-target variables.
+- Fix to require DecisionTreeClassifier on RandomForestClassifier.
+- Fix some mistakes on document.
+
 # 0.3.1
 - Fix bug on decision function calculation of FactorizationMachineClassifier.
 - Fix bug on weight updating process of KernelSVC.
data/lib/svmkit/dataset.rb CHANGED
@@ -33,11 +33,13 @@ module SVMKit
       # @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
       def dump_libsvm_file(data, labels, filename, zero_based: false)
         n_samples = [data.shape[0], labels.shape[0]].min
+        single_label = labels.shape[1].nil?
         label_type = detect_dtype(labels)
         value_type = detect_dtype(data)
         File.open(filename, 'w') do |file|
           n_samples.times do |n|
-            file.puts(dump_libsvm_line(labels[n], data[n, true],
+            label = single_label ? labels[n] : labels[n, true].to_a
+            file.puts(dump_libsvm_line(label, data[n, true],
                                        label_type, value_type, zero_based))
           end
         end
@@ -47,8 +49,7 @@ module SVMKit

       def parse_libsvm_line(line, zero_based)
         tokens = line.split
-        label = tokens.shift
-        label = label.to_i.to_s == label ? label.to_i : label.to_f
+        label = parse_label(tokens.shift)
         ftvec = tokens.map do |el|
           idx, val = el.split(':')
           idx = idx.to_i - (zero_based == false ? 1 : 0)
@@ -60,6 +61,11 @@ module SVMKit
         [label, ftvec, max_idx]
       end

+      def parse_label(label)
+        lbl_arr = label.split(',').map { |lbl| lbl.to_i.to_s == lbl ? lbl.to_i : lbl.to_f }
+        lbl_arr.size > 1 ? lbl_arr : lbl_arr[0]
+      end
+
       def convert_to_matrix(data, n_features)
         mat = []
         data.each do |ft|
@@ -80,13 +86,21 @@ module SVMKit
       end

       def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
-        line = format(label_type.to_s, label)
+        line = dump_label(label, label_type.to_s)
         ftvec.to_a.each_with_index do |val, n|
           idx = n + (zero_based == false ? 1 : 0)
           line += format(" %d:#{value_type}", idx, val) if val != 0.0
         end
         line
       end
+
+      def dump_label(label, label_type_str)
+        if label.is_a?(Array)
+          label.map { |lbl| format(label_type_str, lbl) }.join(',')
+        else
+          format(label_type_str, label)
+        end
+      end
     end
   end
 end
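
With these changes, `labels` may be a one-dimensional vector (single target) or a two-dimensional matrix (multi-target); multi-target values are written and parsed as a comma-separated list in the label column. A minimal round-trip sketch on toy data (the file name `multi.t` is arbitrary):

```ruby
require 'svmkit'

x = Numo::DFloat[[1.0, 2.0], [3.0, 4.0]]
y = Numo::DFloat[[0.5, 1.5], [2.5, 3.5]] # two target variables per sample

SVMKit::Dataset.dump_libsvm_file(x, y, 'multi.t')
# Each line has the form "<target1>,<target2> 1:<val> 2:<val>".

samples, targets = SVMKit::Dataset.load_libsvm_file('multi.t')
# targets comes back as a 2-d array, so the multi-target matrix survives the round trip.
```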
data/lib/svmkit/ensemble/random_forest_classifier.rb CHANGED
@@ -3,6 +3,7 @@
 require 'svmkit/validation'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/classifier'
+require 'svmkit/tree/decision_tree_classifier'

 module SVMKit
   # This module consists of the classes that implement ensemble-based methods.
@@ -32,7 +33,7 @@ module SVMKit
       # @return [Numo::DFloat] (size: n_features)
       attr_reader :feature_importances

-      # Return the random generator for performing random sampling in the Pegasos algorithm.
+      # Return the random generator for random selection of feature index.
       # @return [Random]
       attr_reader :rng

data/lib/svmkit/ensemble/random_forest_regressor.rb ADDED
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+
+require 'pp'
+require 'svmkit/validation'
+require 'svmkit/base/base_estimator'
+require 'svmkit/base/regressor'
+require 'svmkit/tree/decision_tree_regressor'
+
+module SVMKit
+  module Ensemble
+    # RandomForestRegressor is a class that implements random forest for regression.
+    #
+    # @example
+    #   estimator =
+    #     SVMKit::Ensemble::RandomForestRegressor.new(
+    #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    class RandomForestRegressor
+      include Base::BaseEstimator
+      include Base::Regressor
+      include Validation
+
+      # Return the set of estimators.
+      # @return [Array<DecisionTreeRegressor>]
+      attr_reader :estimators
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new regressor with random forest.
+      #
+      # @param n_estimators [Integer] The number of decision trees for constructing the random forest.
+      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding the splitting point.
+      def initialize(n_estimators: 10, criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+                     max_features: nil, random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
+        @params = {}
+        @params[:n_estimators] = n_estimators
+        @params[:criterion] = criterion
+        @params[:max_depth] = max_depth
+        @params[:max_leaf_nodes] = max_leaf_nodes
+        @params[:min_samples_leaf] = min_samples_leaf
+        @params[:max_features] = max_features
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @estimators = nil
+        @feature_importances = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [RandomForestRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        # Initialize some variables.
+        n_samples, n_features = x.shape
+        @params[:max_features] ||= n_features
+        @params[:max_features] = [[1, @params[:max_features]].max, Math.sqrt(n_features).to_i].min
+        single_target = y.shape[1].nil?
+        # Construct forest.
+        @estimators = Array.new(@params[:n_estimators]) do |_n|
+          tree = Tree::DecisionTreeRegressor.new(
+            criterion: @params[:criterion], max_depth: @params[:max_depth],
+            max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
+            max_features: @params[:max_features], random_seed: @params[:random_seed]
+          )
+          bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
+          tree.fit(x[bootstrap_ids, true], single_target ? y[bootstrap_ids] : y[bootstrap_ids, true])
+        end
+        # Calculate feature importances.
+        @feature_importances = @estimators.map(&:feature_importances).reduce(&:+)
+        @feature_importances /= @feature_importances.sum
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
+      def predict(x)
+        check_sample_array(x)
+        @estimators.map { |est| est.predict(x) }.reduce(&:+) / @params[:n_estimators]
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
+      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
+      def apply(x)
+        SVMKit::Validation.check_sample_array(x)
+        Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about RandomForestRegressor.
+      def marshal_dump
+        { params: @params,
+          estimators: @estimators,
+          feature_importances: @feature_importances,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @estimators = obj[:estimators]
+        @feature_importances = obj[:feature_importances]
+        @rng = obj[:rng]
+        nil
+      end
+    end
+  end
+end
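
The new class bags bootstrap samples of the training set, fits one DecisionTreeRegressor per bootstrap, and averages the trees' predictions. A minimal usage sketch on synthetic data (names and data are illustrative):

```ruby
require 'svmkit'

x = Numo::DFloat.new(100, 4).rand
y = x.dot(Numo::DFloat[1.0, -2.0, 0.5, 0.0]) # noiseless linear target

forest = SVMKit::Ensemble::RandomForestRegressor.new(
  n_estimators: 10, max_depth: 3, random_seed: 1
)
forest.fit(x, y)

forest.predict(x[0...5, true])  # mean of the ten trees' predictions
forest.feature_importances      # normalized impurity-based importances
forest.apply(x[0...5, true])    # leaf index per sample per tree, shape [5, 10]
```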
data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb ADDED
@@ -0,0 +1,231 @@
+# frozen_string_literal: true
+
+require 'svmkit/validation'
+require 'svmkit/base/base_estimator'
+require 'svmkit/base/regressor'
+
+module SVMKit
+  module PolynomialModel
+    # FactorizationMachineRegressor is a class that implements Factorization Machine
+    # with stochastic gradient descent (SGD) optimization.
+    #
+    # @example
+    #   estimator =
+    #     SVMKit::PolynomialModel::FactorizationMachineRegressor.new(
+    #       n_factors: 10, reg_param_bias: 0.1, reg_param_weight: 0.1, reg_param_factor: 0.1,
+    #       max_iter: 5000, batch_size: 50, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    # *Reference*
+    # - S. Rendle, "Factorization Machines with libFM," ACM Transactions on Intelligent Systems and Technology, vol. 3 (3), pp. 57:1--57:22, 2012.
+    # - S. Rendle, "Factorization Machines," Proc. the 10th IEEE International Conference on Data Mining (ICDM'10), pp. 995--1000, 2010.
+    # - I. Sutskever, J. Martens, G. Dahl, and G. Hinton, "On the importance of initialization and momentum in deep learning," Proc. the 30th International Conference on Machine Learning (ICML'13), pp. 1139--1147, 2013.
+    # - G. Hinton, N. Srivastava, and K. Swersky, "Lecture 6e rmsprop," Neural Networks for Machine Learning, 2012.
+    class FactorizationMachineRegressor
+      include Base::BaseEstimator
+      include Base::Regressor
+      include Validation
+
+      # Return the factor matrix for Factorization Machine.
+      # @return [Numo::DFloat] (shape: [n_outputs, n_factors, n_features])
+      attr_reader :factor_mat
+
+      # Return the weight vector for Factorization Machine.
+      # @return [Numo::DFloat] (shape: [n_outputs, n_features])
+      attr_reader :weight_vec
+
+      # Return the bias term for Factorization Machine.
+      # @return [Numo::DFloat] (shape: [n_outputs])
+      attr_reader :bias_term
+
+      # Return the random generator for random sampling.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new regressor with Factorization Machine.
+      #
+      # @param n_factors [Integer] The number of factors.
+      # @param reg_param_bias [Float] The regularization parameter for bias term.
+      # @param reg_param_weight [Float] The regularization parameter for weight vector.
+      # @param reg_param_factor [Float] The regularization parameter for factor matrix.
+      # @param init_std [Float] The standard deviation of normal random number for initialization of factor matrix.
+      # @param learning_rate [Float] The learning rate for optimization.
+      # @param decay [Float] The discounting factor for RMS prop optimization.
+      # @param momentum [Float] The Nesterov momentum for optimization.
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param batch_size [Integer] The size of the mini batches.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      def initialize(n_factors: 2,
+                     reg_param_bias: 1.0, reg_param_weight: 1.0, reg_param_factor: 1.0, init_std: 0.01,
+                     learning_rate: 0.01, decay: 0.9, momentum: 0.9,
+                     max_iter: 1000, batch_size: 10, random_seed: nil)
+        check_params_float(reg_param_bias: reg_param_bias, reg_param_weight: reg_param_weight,
+                           reg_param_factor: reg_param_factor, init_std: init_std,
+                           learning_rate: learning_rate, decay: decay, momentum: momentum)
+        check_params_integer(n_factors: n_factors, max_iter: max_iter, batch_size: batch_size)
+        check_params_type_or_nil(Integer, random_seed: random_seed)
+        check_params_positive(n_factors: n_factors, reg_param_bias: reg_param_bias,
+                              reg_param_weight: reg_param_weight, reg_param_factor: reg_param_factor,
+                              learning_rate: learning_rate, decay: decay, momentum: momentum,
+                              max_iter: max_iter, batch_size: batch_size)
+        @params = {}
+        @params[:n_factors] = n_factors
+        @params[:reg_param_bias] = reg_param_bias
+        @params[:reg_param_weight] = reg_param_weight
+        @params[:reg_param_factor] = reg_param_factor
+        @params[:init_std] = init_std
+        @params[:learning_rate] = learning_rate
+        @params[:decay] = decay
+        @params[:momentum] = momentum
+        @params[:max_iter] = max_iter
+        @params[:batch_size] = batch_size
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @factor_mat = nil
+        @weight_vec = nil
+        @bias_term = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [FactorizationMachineRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+
+        n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+        _n_samples, n_features = x.shape
+
+        if n_outputs > 1
+          @factor_mat = Numo::DFloat.zeros(n_outputs, @params[:n_factors], n_features)
+          @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
+          @bias_term = Numo::DFloat.zeros(n_outputs)
+          n_outputs.times do |n|
+            factor, weight, bias = single_fit(x, y[true, n])
+            @factor_mat[n, true, true] = factor
+            @weight_vec[n, true] = weight
+            @bias_term[n] = bias
+          end
+        else
+          @factor_mat, @weight_vec, @bias_term = single_fit(x, y)
+        end
+
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        linear_term = @bias_term + x.dot(@weight_vec.transpose)
+        factor_term = if @weight_vec.shape[1].nil?
+                        0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(0)
+                      else
+                        0.5 * (@factor_mat.dot(x.transpose)**2 - (@factor_mat**2).dot(x.transpose**2)).sum(1).transpose
+                      end
+        linear_term + factor_term
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about FactorizationMachineRegressor.
+      def marshal_dump
+        { params: @params,
+          factor_mat: @factor_mat,
+          weight_vec: @weight_vec,
+          bias_term: @bias_term,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @factor_mat = obj[:factor_mat]
+        @weight_vec = obj[:weight_vec]
+        @bias_term = obj[:bias_term]
+        @rng = obj[:rng]
+        nil
+      end
+
+      private
+
+      def single_fit(x, y)
+        # Initialize some variables.
+        n_samples, n_features = x.shape
+        rand_ids = [*0...n_samples].shuffle(random: @rng)
+        factor_mat = rand_normal([@params[:n_factors], n_features], 0, @params[:init_std])
+        factor_sqrsum = Numo::DFloat.zeros(factor_mat.shape)
+        factor_update = Numo::DFloat.zeros(factor_mat.shape)
+        weight_vec = Numo::DFloat.zeros(n_features)
+        weight_sqrsum = Numo::DFloat.zeros(n_features)
+        weight_update = Numo::DFloat.zeros(n_features)
+        bias_term = 0.0
+        bias_sqrsum = 0.0
+        bias_update = 0.0
+        # Start optimization.
+        @params[:max_iter].times do |_t|
+          # Random sampling.
+          subset_ids = rand_ids.shift(@params[:batch_size])
+          rand_ids.concat(subset_ids)
+          data = x[subset_ids, true]
+          values = y[subset_ids]
+          # Calculate gradients for loss function.
+          loss_grad = loss_gradient(data, values, factor_mat, weight_vec, bias_term)
+          next if loss_grad.ne(0.0).count.zero?
+          # Update each parameter.
+          bias_term, bias_sqrsum, bias_update =
+            update_param(bias_term, bias_sqrsum, bias_update,
+                         bias_gradient(loss_grad, bias_term - @params[:momentum] * bias_update))
+          weight_vec, weight_sqrsum, weight_update =
+            update_param(weight_vec, weight_sqrsum, weight_update,
+                         weight_gradient(loss_grad, data, weight_vec - @params[:momentum] * weight_update))
+          @params[:n_factors].times do |n|
+            factor_mat[n, true], factor_sqrsum[n, true], factor_update[n, true] =
+              update_param(factor_mat[n, true], factor_sqrsum[n, true], factor_update[n, true],
+                           factor_gradient(loss_grad, data, factor_mat[n, true] - @params[:momentum] * factor_update[n, true]))
+          end
+        end
+        [factor_mat, weight_vec, bias_term]
+      end
+
+      def loss_gradient(x, y, factor, weight, bias)
+        z = bias + x.dot(weight) + 0.5 * (factor.dot(x.transpose)**2 - (factor**2).dot(x.transpose**2)).sum(0)
+        2.0 * (z - y)
+      end
+
+      def bias_gradient(loss_grad, bias)
+        loss_grad.mean + @params[:reg_param_bias] * bias
+      end
+
+      def weight_gradient(loss_grad, data, weight)
+        (loss_grad.expand_dims(1) * data).mean(0) + @params[:reg_param_weight] * weight
+      end
+
+      def factor_gradient(loss_grad, data, factor)
+        (loss_grad.expand_dims(1) * (data * data.dot(factor).expand_dims(1) - factor * (data**2))).mean(0) + @params[:reg_param_factor] * factor
+      end
+
+      def update_param(param, sqrsum, update, gr)
+        new_sqrsum = @params[:decay] * sqrsum + (1.0 - @params[:decay]) * gr**2
+        new_update = (@params[:learning_rate] / ((new_sqrsum + 1.0e-8)**0.5)) * gr
+        new_param = param - (new_update + @params[:momentum] * update)
+        [new_param, new_sqrsum, new_update]
+      end
+
+      def rand_uniform(shape)
+        Numo::DFloat[*Array.new(shape.inject(&:*)) { @rng.rand }].reshape(*shape)
+      end
+
+      def rand_normal(shape, mu, sigma)
+        mu + sigma * (Numo::NMath.sqrt(-2.0 * Numo::NMath.log(rand_uniform(shape))) * Numo::NMath.sin(2.0 * Math::PI * rand_uniform(shape)))
+      end
+    end
+  end
+end
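
The factor term in `predict` and `loss_gradient` relies on the standard Factorization Machine identity from Rendle's papers, which reduces the pairwise interaction sum to linear time in the number of features. A quick numeric sanity check of that identity with numo-narray (toy sizes, arbitrary names):

```ruby
require 'numo/narray'

k, n = 3, 5
v = Numo::DFloat.new(k, n).rand # factor matrix, one row per factor
x = Numo::DFloat.new(n).rand    # a single sample

# Naive pairwise sum: sum_{i<j} (v_i . v_j) * x_i * x_j
naive = 0.0
n.times do |i|
  (i + 1).upto(n - 1) { |j| naive += (v[true, i] * v[true, j]).sum * x[i] * x[j] }
end

# Factorized form used by the code: 0.5 * sum_f ((v_f . x)^2 - (v_f**2) . (x**2))
fast = 0.5 * ((v.dot(x))**2 - (v**2).dot(x**2)).sum

puts((naive - fast).abs < 1e-9) # => true
```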
data/lib/svmkit/tree/decision_tree_classifier.rb CHANGED
@@ -3,74 +3,11 @@
 require 'svmkit/validation'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/classifier'
+require 'svmkit/tree/node'

 module SVMKit
   # This module consists of the classes that implement tree models.
   module Tree
-    # Node is a class that implements node used for construction of decision tree.
-    # This class is used for internal data structures.
-    class Node
-      # @!visibility private
-      attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
-
-      # Create a new node for decision tree.
-      #
-      # @param depth [Integer] The depth of the node in tree.
-      # @param impurity [Float] The impurity of the node.
-      # @param n_samples [Integer] The number of the samples in the node.
-      # @param probs [Float] The probability of the node.
-      # @param leaf [Boolean] The flag indicating whether the node is a leaf.
-      # @param leaf_id [Integer] The leaf index of the node.
-      # @param left [Node] The left node.
-      # @param right [Node] The right node.
-      # @param feature_id [Integer] The feature index used for evaluation.
-      # @param threshold [Float] The threshold value of the feature for splitting the node.
-      def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
-                     leaf: true, leaf_id: 0,
-                     left: nil, right: nil, feature_id: 0, threshold: 0.0)
-        @depth = depth
-        @impurity = impurity
-        @n_samples = n_samples
-        @probs = probs
-        @leaf = leaf
-        @leaf_id = leaf_id
-        @left = left
-        @right = right
-        @feature_id = feature_id
-        @threshold = threshold
-      end
-
-      # Dump marshal data.
-      # @return [Hash] The marshal data about Node
-      def marshal_dump
-        { depth: @depth,
-          impurity: @impurity,
-          n_samples: @n_samples,
-          probs: @probs,
-          leaf: @leaf,
-          leaf_id: @leaf_id,
-          left: @left,
-          right: @right,
-          feature_id: @feature_id,
-          threshold: @threshold }
-      end
-
-      # Load marshal data.
-      # @return [nil]
-      def marshal_load(obj)
-        @depth = obj[:depth]
-        @impurity = obj[:impurity]
-        @n_samples = obj[:n_samples]
-        @probs = obj[:probs]
-        @leaf = obj[:leaf]
-        @leaf_id = obj[:leaf_id]
-        @left = obj[:left]
-        @right = obj[:right]
-        @feature_id = obj[:feature_id]
-        @threshold = obj[:threshold]
-      end
-    end
-
     # DecisionTreeClassifier is a class that implements decision tree for classification.
     #
     # @example
@@ -96,7 +33,7 @@ module SVMKit
       # @return [Node]
       attr_reader :tree

-      # Return the random generator for performing random sampling in the Pegasos algorithm.
+      # Return the random generator for random selection of feature index.
       # @return [Random]
      attr_reader :rng
data/lib/svmkit/tree/decision_tree_regressor.rb ADDED
@@ -0,0 +1,252 @@
+# frozen_string_literal: true
+
+require 'svmkit/validation'
+require 'svmkit/base/base_estimator'
+require 'svmkit/base/regressor'
+require 'svmkit/tree/node'
+
+module SVMKit
+  module Tree
+    # DecisionTreeRegressor is a class that implements decision tree for regression.
+    #
+    # @example
+    #   estimator =
+    #     SVMKit::Tree::DecisionTreeRegressor.new(
+    #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
+    #
+    class DecisionTreeRegressor
+      include Base::BaseEstimator
+      include Base::Regressor
+      include Validation
+
+      # Return the importance for each feature.
+      # @return [Numo::DFloat] (size: n_features)
+      attr_reader :feature_importances
+
+      # Return the learned tree.
+      # @return [Node]
+      attr_reader :tree
+
+      # Return the random generator for random selection of feature index.
+      # @return [Random]
+      attr_reader :rng
+
+      # Return the values assigned to each leaf.
+      # @return [Numo::DFloat] (shape: [n_leaves, n_outputs])
+      attr_reader :leaf_values
+
+      # Create a new regressor with decision tree algorithm.
+      #
+      # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
+      # @param max_depth [Integer] The maximum depth of the tree.
+      #   If nil is given, decision tree grows without concern for depth.
+      # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
+      #   If nil is given, number of leaves is not limited.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      # @param max_features [Integer] The number of features to consider when searching optimal split point.
+      #   If nil is given, split process considers all features.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      #   It is used to randomly determine the order of features when deciding the splitting point.
+      def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
+                     random_seed: nil)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                          max_features: max_features, random_seed: random_seed)
+        check_params_integer(min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                              min_samples_leaf: min_samples_leaf, max_features: max_features)
+        @params = {}
+        @params[:criterion] = criterion
+        @params[:max_depth] = max_depth
+        @params[:max_leaf_nodes] = max_leaf_nodes
+        @params[:min_samples_leaf] = min_samples_leaf
+        @params[:max_features] = max_features
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @criterion = :mse
+        @criterion = :mae if @params[:criterion] == 'mae'
+        @tree = nil
+        @feature_importances = nil
+        @n_leaves = nil
+        @leaf_values = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+
+      # Fit the model with given training data.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
+      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @return [DecisionTreeRegressor] The learned regressor itself.
+      def fit(x, y)
+        check_sample_array(x)
+        check_tvalue_array(y)
+        check_sample_tvalue_size(x, y)
+        single_target = y.shape[1].nil?
+        y = y.expand_dims(1) if single_target
+        n_samples, n_features = x.shape
+        @params[:max_features] = n_features if @params[:max_features].nil?
+        @params[:max_features] = [@params[:max_features], n_features].min
+        build_tree(x, y)
+        @leaf_values = @leaf_values[true] if single_target
+        eval_importance(n_samples, n_features)
+        self
+      end
+
+      # Predict values for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+      def predict(x)
+        check_sample_array(x)
+        @leaf_values.shape[1].nil? ? @leaf_values[apply(x)] : @leaf_values[apply(x), true]
+      end
+
+      # Return the index of the leaf that each sample reached.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
+      # @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
+      def apply(x)
+        check_sample_array(x)
+        Numo::Int32[*(Array.new(x.shape[0]) { |n| apply_at_node(@tree, x[n, true]) })]
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about DecisionTreeRegressor.
+      def marshal_dump
+        { params: @params,
+          criterion: @criterion,
+          tree: @tree,
+          feature_importances: @feature_importances,
+          leaf_values: @leaf_values,
+          rng: @rng }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @criterion = obj[:criterion]
+        @tree = obj[:tree]
+        @feature_importances = obj[:feature_importances]
+        @leaf_values = obj[:leaf_values]
+        @rng = obj[:rng]
+        nil
+      end
+
+      private
+
+      def apply_at_node(node, sample)
+        return node.leaf_id if node.leaf
+        return apply_at_node(node.left, sample) if node.right.nil?
+        return apply_at_node(node.right, sample) if node.left.nil?
+        if sample[node.feature_id] <= node.threshold
+          apply_at_node(node.left, sample)
+        else
+          apply_at_node(node.right, sample)
+        end
+      end
+
+      def build_tree(x, y)
+        @n_leaves = 0
+        @leaf_values = []
+        @tree = grow_node(0, x, y)
+        @leaf_values = Numo::DFloat.cast(@leaf_values)
+        nil
+      end
+
+      def grow_node(depth, x, y)
+        unless @params[:max_leaf_nodes].nil?
+          return nil if @n_leaves >= @params[:max_leaf_nodes]
+        end
+
+        n_samples, n_features = x.shape
+        return nil if n_samples <= @params[:min_samples_leaf]
+
+        node = Node.new(depth: depth, impurity: impurity(y), n_samples: n_samples)
+
+        return put_leaf(node, y) if (y - y.mean(0)).sum.abs.zero?
+
+        unless @params[:max_depth].nil?
+          return put_leaf(node, y) if depth == @params[:max_depth]
+        end
+
+        feature_id, threshold, left_ids, right_ids, max_gain =
+          rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y)] }.max_by(&:last)
+        return put_leaf(node, y) if max_gain.nil? || max_gain.zero?
+
+        node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true])
+        node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true])
+        return put_leaf(node, y) if node.left.nil? && node.right.nil?
+
+        node.feature_id = feature_id
+        node.threshold = threshold
+        node.leaf = false
+        node
+      end
+
+      def put_leaf(node, values)
+        node.probs = nil
+        node.leaf = true
+        node.leaf_id = @n_leaves
+        @n_leaves += 1
+        @leaf_values.push(values.mean(0))
+        node
+      end
+
+      def rand_ids(n)
+        [*0...n].sample(@params[:max_features], random: @rng)
+      end
+
+      def best_split(features, values)
+        features.to_a.uniq.sort.each_cons(2).map do |l, r|
+          threshold = 0.5 * (l + r)
+          left_ids, right_ids = splited_ids(features, threshold)
+          [threshold, left_ids, right_ids, gain(values, values[left_ids], values[right_ids])]
+        end.max_by(&:last)
+      end
+
+      def splited_ids(features, threshold)
+        [features.le(threshold).where.to_a, features.gt(threshold).where.to_a]
+      end
+
+      def gain(values, values_left, values_right)
+        prob_left = values_left.shape[0].fdiv(values.shape[0])
+        prob_right = values_right.shape[0].fdiv(values.shape[0])
+        impurity(values) - prob_left * impurity(values_left) - prob_right * impurity(values_right)
+      end
+
+      def impurity(values)
+        send(@criterion, values)
+      end
+
+      def mse(values)
+        ((values - values.mean(0))**2).mean
+      end
+
+      def mae(values)
+        (values - values.mean(0)).abs.mean
+      end
+
+      def eval_importance(n_samples, n_features)
+        @feature_importances = Numo::DFloat.zeros(n_features)
+        eval_importance_at_node(@tree)
+        @feature_importances /= n_samples
+        normalizer = @feature_importances.sum
+        @feature_importances /= normalizer if normalizer > 0.0
+        nil
+      end
+
+      def eval_importance_at_node(node)
+        return nil if node.leaf
+        return nil if node.left.nil? || node.right.nil?
+        gain = node.n_samples * node.impurity -
+               node.left.n_samples * node.left.impurity - node.right.n_samples * node.right.impurity
+        @feature_importances[node.feature_id] += gain
+        eval_importance_at_node(node.left)
+        eval_importance_at_node(node.right)
+      end
+    end
+  end
+end
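
Prediction here is a lookup: `apply` routes each sample down the tree to a leaf, and `predict` returns the mean target value stored for that leaf in `leaf_values`. A small illustrative sketch on toy data:

```ruby
require 'svmkit'

x = Numo::DFloat.new(50, 2).rand
y = x[true, 0] * 2.0 # single-target toy values

tree = SVMKit::Tree::DecisionTreeRegressor.new(max_depth: 2, random_seed: 1)
tree.fit(x, y)

tree.apply(x[0...3, true])   # leaf index reached by each sample
tree.predict(x[0...3, true]) # mean of the training targets in that leaf
```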
data/lib/svmkit/tree/node.rb ADDED
@@ -0,0 +1,70 @@
+# frozen_string_literal: true
+
+module SVMKit
+  module Tree
+    # Node is a class that implements node used for construction of decision tree.
+    # This class is used for internal data structures.
+    class Node
+      # @!visibility private
+      attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
+
+      # Create a new node for decision tree.
+      #
+      # @param depth [Integer] The depth of the node in tree.
+      # @param impurity [Float] The impurity of the node.
+      # @param n_samples [Integer] The number of the samples in the node.
+      # @param probs [Float] The probability of the node.
+      # @param leaf [Boolean] The flag indicating whether the node is a leaf.
+      # @param leaf_id [Integer] The leaf index of the node.
+      # @param left [Node] The left node.
+      # @param right [Node] The right node.
+      # @param feature_id [Integer] The feature index used for evaluation.
+      # @param threshold [Float] The threshold value of the feature for splitting the node.
+      def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
+                     leaf: true, leaf_id: 0,
+                     left: nil, right: nil, feature_id: 0, threshold: 0.0)
+        @depth = depth
+        @impurity = impurity
+        @n_samples = n_samples
+        @probs = probs
+        @leaf = leaf
+        @leaf_id = leaf_id
+        @left = left
+        @right = right
+        @feature_id = feature_id
+        @threshold = threshold
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about Node
+      def marshal_dump
+        { depth: @depth,
+          impurity: @impurity,
+          n_samples: @n_samples,
+          probs: @probs,
+          leaf: @leaf,
+          leaf_id: @leaf_id,
+          left: @left,
+          right: @right,
+          feature_id: @feature_id,
+          threshold: @threshold }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @depth = obj[:depth]
+        @impurity = obj[:impurity]
+        @n_samples = obj[:n_samples]
+        @probs = obj[:probs]
+        @leaf = obj[:leaf]
+        @leaf_id = obj[:leaf_id]
+        @left = obj[:left]
+        @right = obj[:right]
+        @feature_id = obj[:feature_id]
+        @threshold = obj[:threshold]
+        nil
+      end
+    end
+  end
+end
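
Because Node and the estimators that contain it implement `marshal_dump`/`marshal_load`, a trained model can be persisted with Ruby's built-in Marshal. A sketch, continuing from the decision tree example above:

```ruby
File.binwrite('tree.model', Marshal.dump(tree))
restored = Marshal.load(File.binread('tree.model'))
restored.predict(x[0...3, true]) # same predictions as the original tree
```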
data/lib/svmkit/version.rb CHANGED
@@ -3,5 +3,5 @@
 # SVMKit is a machine learning library in Ruby.
 module SVMKit
   # @!visibility private
-  VERSION = '0.3.1'.freeze
+  VERSION = '0.3.2'.freeze
 end
data/lib/svmkit.rb CHANGED
@@ -19,12 +19,16 @@ require 'svmkit/linear_model/svr'
 require 'svmkit/linear_model/logistic_regression'
 require 'svmkit/kernel_machine/kernel_svc'
 require 'svmkit/polynomial_model/factorization_machine_classifier'
+require 'svmkit/polynomial_model/factorization_machine_regressor'
 require 'svmkit/multiclass/one_vs_rest_classifier'
 require 'svmkit/nearest_neighbors/k_neighbors_classifier'
 require 'svmkit/nearest_neighbors/k_neighbors_regressor'
 require 'svmkit/naive_bayes/naive_bayes'
+require 'svmkit/tree/node'
 require 'svmkit/tree/decision_tree_classifier'
+require 'svmkit/tree/decision_tree_regressor'
 require 'svmkit/ensemble/random_forest_classifier'
+require 'svmkit/ensemble/random_forest_regressor'
 require 'svmkit/preprocessing/l2_normalizer'
 require 'svmkit/preprocessing/min_max_scaler'
 require 'svmkit/preprocessing/standard_scaler'
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: svmkit
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.3.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-05-16 00:00:00.000000000 Z
+date: 2018-05-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -115,6 +115,7 @@ files:
 - lib/svmkit/base/transformer.rb
 - lib/svmkit/dataset.rb
 - lib/svmkit/ensemble/random_forest_classifier.rb
+- lib/svmkit/ensemble/random_forest_regressor.rb
 - lib/svmkit/evaluation_measure/accuracy.rb
 - lib/svmkit/evaluation_measure/f_score.rb
 - lib/svmkit/evaluation_measure/log_loss.rb
@@ -138,6 +139,7 @@ files:
 - lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb
 - lib/svmkit/pairwise_metric.rb
 - lib/svmkit/polynomial_model/factorization_machine_classifier.rb
+- lib/svmkit/polynomial_model/factorization_machine_regressor.rb
 - lib/svmkit/preprocessing/l2_normalizer.rb
 - lib/svmkit/preprocessing/label_encoder.rb
 - lib/svmkit/preprocessing/min_max_scaler.rb
@@ -145,6 +147,8 @@ files:
 - lib/svmkit/preprocessing/standard_scaler.rb
 - lib/svmkit/probabilistic_output.rb
 - lib/svmkit/tree/decision_tree_classifier.rb
+- lib/svmkit/tree/decision_tree_regressor.rb
+- lib/svmkit/tree/node.rb
 - lib/svmkit/validation.rb
 - lib/svmkit/version.rb
 - svmkit.gemspec