svmkit 0.2.8 → 0.2.9

lib/svmkit/probabilistic_output.rb ADDED
@@ -0,0 +1,112 @@
+# frozen_string_literal: true
+
+module SVMKit
+  # Module for calculating posterior class probabilities with SVM outputs.
+  # This module is used for internal processes.
+  #
+  # @example
+  #   estimator = SVMKit::LinearModel::SVC.new
+  #   estimator.fit(x, bin_y)
+  #   df = estimator.decision_function(x)
+  #   params = SVMKit::ProbabilisticOutput.fit_sigmoid(df, bin_y)
+  #   probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
+  #
+  # *Reference*
+  # 1. J C. Platt, "Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods," Adv. Large Margin Classifiers, pp. 61--74, 2000.
+  # 1. H-T Lin, C-J Lin, and R C. Weng, "A Note on Platt's Probabilistic Outputs for Support Vector Machines," J. Machine Learning, Vol. 63 (3), pp. 267--276, 2007.
+  module ProbabilisticOutput
+    class << self
+      # Fit the probabilistic model for binary SVM outputs.
+      #
+      # @param df [Numo::DFloat] (shape: [n_samples]) The outputs of decision function to be used for fitting the model.
+      # @param bin_y [Numo::Int32] (shape: [n_samples]) The binary labels to be used for fitting the model.
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param min_step [Float] The minimum step of Newton's method.
+      # @param sigma [Float] The parameter that prevents the Hessian matrix from becoming singular.
+      # @return [Numo::DFloat] (shape: 2) The parameters of the model.
+      def fit_sigmoid(df, bin_y, max_iter = 100, min_step = 1e-10, sigma = 1e-12)
+        # Initialize some variables.
+        n_samples = bin_y.size
+        negative_label = bin_y.to_a.uniq.sort.first
+        pos = bin_y.ne(negative_label)
+        neg = bin_y.eq(negative_label)
+        n_pos_samples = pos.count
+        n_neg_samples = neg.count
+        target_probs = Numo::DFloat.zeros(n_samples)
+        target_probs[pos] = (n_pos_samples + 1) / (n_pos_samples + 2.0)
+        target_probs[neg] = 1 / (n_neg_samples + 2.0)
+        alpha = 0.0
+        beta = Math.log((n_neg_samples + 1) / (n_pos_samples + 1.0))
+        err = error_function(target_probs, df, alpha, beta)
+        # Optimize parameters for class probability calculation.
+        old_grad_vec = Numo::DFloat.zeros(2)
+        max_iter.times do
+          # Calculate gradient and hessian matrix.
+          probs = predicted_probs(df, alpha, beta)
+          grad_vec = gradient(target_probs, probs, df)
+          hess_mat = hessian_matrix(probs, df, sigma)
+          break if grad_vec.abs.lt(1e-5).count == 2
+          break if (old_grad_vec - grad_vec).abs.sum < 1e-5
+          old_grad_vec = grad_vec
+          # Calculate Newton directions.
+          dirs_vec = directions(grad_vec, hess_mat)
+          grad_dir = grad_vec.dot(dirs_vec)
+          stepsize = 2.0
+          while stepsize >= min_step
+            stepsize *= 0.5
+            new_alpha = alpha + stepsize * dirs_vec[0]
+            new_beta = beta + stepsize * dirs_vec[1]
+            new_err = error_function(target_probs, df, new_alpha, new_beta)
+            next unless new_err < err + 0.0001 * stepsize * grad_dir
+            alpha = new_alpha
+            beta = new_beta
+            err = new_err
+            break
+          end
+        end
+        Numo::DFloat[alpha, beta]
+      end
+
+      private
+
+      def error_function(target_probs, df, alpha, beta)
+        fn = alpha * df + beta
+        pos = fn.ge(0.0)
+        neg = fn.lt(0.0)
+        err = 0.0
+        err += (target_probs[pos] * fn[pos] + Numo::NMath.log(1 + Numo::NMath.exp(-fn[pos]))).sum if pos.count > 0
+        err += ((target_probs[neg] - 1) * fn[neg] + Numo::NMath.log(1 + Numo::NMath.exp(fn[neg]))).sum if neg.count > 0
+        err
+      end
+
+      def predicted_probs(df, alpha, beta)
+        fn = alpha * df + beta
+        pos = fn.ge(0.0)
+        neg = fn.lt(0.0)
+        probs = Numo::DFloat.zeros(df.shape[0])
+        probs[pos] = Numo::NMath.exp(-fn[pos]) / (1 + Numo::NMath.exp(-fn[pos])) if pos.count > 0
+        probs[neg] = 1 / (1 + Numo::NMath.exp(fn[neg])) if neg.count > 0
+        probs
+      end
+
+      def gradient(target_probs, probs, df)
+        sub = target_probs - probs
+        Numo::DFloat[(df * sub).sum, sub.sum]
+      end
+
+      def hessian_matrix(probs, df, sigma)
+        sub = probs * (1 - probs)
+        h11 = (df * df * sub).sum + sigma
+        h22 = sub.sum + sigma
+        h21 = (df * sub).sum
+        Numo::DFloat[[h11, h21], [h21, h22]]
+      end
+
+      def directions(grad_vec, hess_mat)
+        det = hess_mat[0, 0] * hess_mat[1, 1] - hess_mat[0, 1] * hess_mat[1, 0]
+        inv_hess_mat = Numo::DFloat[[hess_mat[1, 1], -hess_mat[0, 1]], [-hess_mat[1, 0], hess_mat[0, 0]]] / det
+        -inv_hess_mat.dot(grad_vec)
+      end
+    end
+  end
+end
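
The ProbabilisticOutput module added above implements Platt scaling: fit_sigmoid runs Newton's method to find parameters [alpha, beta] such that 1 / (1 + exp(alpha * f + beta)) approximates the positive-class probability for a decision value f, using the regularized targets (N+ + 1) / (N+ + 2) and 1 / (N- + 2) from Platt's paper. A minimal usage sketch, assuming svmkit 0.2.9 and numo-narray are installed; the decision values below are hypothetical stand-ins for SVC#decision_function output:

require 'svmkit'

# Hypothetical decision-function values with their binary labels.
df    = Numo::DFloat[-2.1, -1.3, -0.4, 0.2, 0.9, 1.8]
bin_y = Numo::Int32[-1, -1, -1, 1, 1, 1]

# Fit the sigmoid parameters [alpha, beta] by Newton's method.
params = SVMKit::ProbabilisticOutput.fit_sigmoid(df, bin_y)

# Convert decision values into positive-class probabilities,
# exactly as in the module's @example above.
probs = 1 / (Numo::NMath.exp(params[0] * df + params[1]) + 1)
puts probs.to_a.inspect
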
lib/svmkit/tree/decision_tree_classifier.rb CHANGED
@@ -7,6 +7,70 @@ require 'ostruct'
 module SVMKit
   # This module consists of the classes that implement tree models.
   module Tree
+    # Node is a class that implements a node used to construct a decision tree.
+    # This class is used for internal data structures.
+    class Node
+      # @!visibility private
+      attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
+
+      # Create a new node for decision tree.
+      #
+      # @param depth [Integer] The depth of the node in tree.
+      # @param impurity [Float] The impurity of the node.
+      # @param n_samples [Integer] The number of the samples in the node.
+      # @param probs [Float] The probability of the node.
+      # @param leaf [Boolean] The flag indicating whether the node is a leaf.
+      # @param leaf_id [Integer] The leaf index of the node.
+      # @param left [Node] The left node.
+      # @param right [Node] The right node.
+      # @param feature_id [Integer] The feature index used for evaluation.
+      # @param threshold [Float] The threshold value of the feature for splitting the node.
+      def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
+                     leaf: true, leaf_id: 0,
+                     left: nil, right: nil, feature_id: 0, threshold: 0.0)
+        @depth = depth
+        @impurity = impurity
+        @n_samples = n_samples
+        @probs = probs
+        @leaf = leaf
+        @leaf_id = leaf_id
+        @left = left
+        @right = right
+        @feature_id = feature_id
+        @threshold = threshold
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about Node
+      def marshal_dump
+        { depth: @depth,
+          impurity: @impurity,
+          n_samples: @n_samples,
+          probs: @probs,
+          leaf: @leaf,
+          leaf_id: @leaf_id,
+          left: @left,
+          right: @right,
+          feature_id: @feature_id,
+          threshold: @threshold }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @depth = obj[:depth]
+        @impurity = obj[:impurity]
+        @n_samples = obj[:n_samples]
+        @probs = obj[:probs]
+        @leaf = obj[:leaf]
+        @leaf_id = obj[:leaf_id]
+        @left = obj[:left]
+        @right = obj[:right]
+        @feature_id = obj[:feature_id]
+        @threshold = obj[:threshold]
+      end
+    end
+
     # DecisionTreeClassifier is a class that implements decision tree for classification.
     #
     # @example
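
The Node class above replaces the OpenStruct records that DecisionTreeClassifier used for tree nodes in earlier versions (see the grow_node change further below). A small hand-built sketch of how nodes link into a tree and round-trip through Marshal; the left-if-at-or-below-threshold traversal rule is an assumption for illustration only, since the split logic is not part of this hunk:

require 'svmkit'

# Two leaves and a root that splits on feature 0 at threshold 0.5
# (hand-built here; DecisionTreeClassifier constructs these internally).
left  = SVMKit::Tree::Node.new(depth: 1, leaf: true, leaf_id: 0, n_samples: 3)
right = SVMKit::Tree::Node.new(depth: 1, leaf: true, leaf_id: 1, n_samples: 2)
root  = SVMKit::Tree::Node.new(depth: 0, leaf: false, n_samples: 5,
                               left: left, right: right,
                               feature_id: 0, threshold: 0.5)

# Assumed traversal rule for this sketch: go left when the feature value
# is at or below the threshold.
feature_value = 0.3
node = root
node = feature_value <= node.threshold ? node.left : node.right until node.leaf
puts node.leaf_id # => 0

# marshal_dump / marshal_load make the whole tree serializable with Marshal.
restored = Marshal.load(Marshal.dump(root))
puts restored.right.leaf_id # => 1
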
@@ -29,7 +93,7 @@ module SVMKit
       attr_reader :feature_importances
 
       # Return the learned tree.
-      # @return [OpenStruct]
+      # @return [Node]
       attr_reader :tree
 
       # Return the random generator for performing random sampling in the Pegasos algorithm.
@@ -55,10 +119,11 @@ module SVMKit
       def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
                      random_seed: nil)
         SVMKit::Validation.check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                                                      max_features: max_features, random_seed: random_seed)
+                                                    max_features: max_features, random_seed: random_seed)
         SVMKit::Validation.check_params_integer(min_samples_leaf: min_samples_leaf)
         SVMKit::Validation.check_params_string(criterion: criterion)
-
+        SVMKit::Validation.check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                                 min_samples_leaf: min_samples_leaf, max_features: max_features)
         @params = {}
         @params[:criterion] = criterion
         @params[:max_depth] = max_depth
@@ -67,6 +132,8 @@ module SVMKit
         @params[:max_features] = max_features
         @params[:random_seed] = random_seed
         @params[:random_seed] ||= srand
+        @criterion = :gini
+        @criterion = :entropy if @params[:criterion] == 'entropy'
         @tree = nil
         @classes = nil
         @feature_importances = nil
@@ -83,9 +150,10 @@ module SVMKit
       def fit(x, y)
         SVMKit::Validation.check_sample_array(x)
         SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)
         n_samples, n_features = x.shape
-        @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
-        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
+        @params[:max_features] = n_features if @params[:max_features].nil?
+        @params[:max_features] = [@params[:max_features], n_features].min
         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
         build_tree(x, y)
         eval_importance(n_samples, n_features)
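
With the added check_sample_label_size call, fit now rejects sample and label arrays of different lengths before building the tree, and a nil max_features simply falls back to the number of features. A short sketch of both behaviours on toy data; predict is assumed to follow the usual classifier API (it is not part of this hunk):

require 'svmkit'

samples = Numo::DFloat[[0.0, 0.0], [0.1, 0.2], [0.9, 1.0], [1.0, 0.8]]
labels  = Numo::Int32[0, 0, 1, 1]

tree = SVMKit::Tree::DecisionTreeClassifier.new(criterion: 'gini', max_depth: 2, random_seed: 1)
tree.fit(samples, labels)
puts tree.predict(samples).to_a.inspect # assumed API; [0, 0, 1, 1] expected on this separable toy set

# Mismatched sizes are now rejected up front.
begin
  tree.fit(samples, Numo::Int32[0, 1])
rescue ArgumentError => e
  puts e.message
end
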
@@ -125,6 +193,7 @@ module SVMKit
       def marshal_dump
         { params: @params,
           classes: @classes,
+          criterion: @criterion,
           tree: @tree,
           feature_importances: @feature_importances,
           leaf_labels: @leaf_labels,
@@ -136,6 +205,7 @@ module SVMKit
       def marshal_load(obj)
         @params = obj[:params]
         @classes = obj[:classes]
+        @criterion = obj[:criterion]
         @tree = obj[:tree]
         @feature_importances = obj[:feature_importances]
         @leaf_labels = obj[:leaf_labels]
@@ -183,7 +253,7 @@ module SVMKit
           return nil if n_samples <= @params[:min_samples_leaf]
         end
 
-        node = OpenStruct.new(depth: depth, impurity: impurity(y), n_samples: n_samples)
+        node = Node.new(depth: depth, impurity: impurity(y), n_samples: n_samples)
 
         return put_leaf(node, y) if y.to_a.uniq.size == 1
 
@@ -238,16 +308,16 @@ module SVMKit
       end
 
       def impurity(labels)
-        posterior_probs = labels.to_a.uniq.sort.map { |c| labels.eq(c).count / labels.size.to_f }
-        @params[:criterion] == 'entropy' ? entropy(posterior_probs) : gini(posterior_probs)
+        posterior_probs = Numo::DFloat[*(labels.to_a.uniq.sort.map { |c| labels.eq(c).count })] / labels.size.to_f
+        send(@criterion, posterior_probs)
       end
 
       def gini(posterior_probs)
-        1.0 - posterior_probs.map { |p| p**2 }.inject(:+)
+        1.0 - (posterior_probs * posterior_probs).sum
       end
 
       def entropy(posterior_probs)
-        -posterior_probs.map { |p| p * Math.log(p) }.inject(:+)
+        -(posterior_probs * Numo::NMath.log(posterior_probs)).sum
       end
 
       def eval_importance(n_samples, n_features)
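
The impurity criteria now operate on a Numo::DFloat of class ratios and are dispatched via send(@criterion) instead of a string comparison on every call. A small worked example of the two measures, recomputed standalone for a three-class distribution to mirror the private methods above:

require 'numo/narray'

# Class frequencies for labels [0, 0, 1, 2]: p = [0.5, 0.25, 0.25]
posterior_probs = Numo::DFloat[2, 1, 1] / 4.0

gini    = 1.0 - (posterior_probs * posterior_probs).sum
entropy = -(posterior_probs * Numo::NMath.log(posterior_probs)).sum

puts gini    # => 0.625
puts entropy # => about 1.0397 (natural logarithm, as in the code above)
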
lib/svmkit/validation.rb CHANGED
@@ -19,6 +19,12 @@ module SVMKit
       nil
     end
 
+    # @!visibility private
+    def check_sample_label_size(x, y)
+      raise ArgumentError, 'Expect to have the same number of samples for sample matrix and label vector' unless x.shape[0] == y.shape[0]
+      nil
+    end
+
     # @!visibility private
     def check_params_type(type, params = {})
       params.each { |k, v| raise TypeError, "Expect class of #{k} to be #{type}" unless v.is_a?(type) }
@@ -51,5 +57,11 @@ module SVMKit
       params.each { |k, v| raise TypeError, "Expect class of #{k} to be Boolean" unless v.is_a?(FalseClass) || v.is_a?(TrueClass) }
       nil
     end
+
+    # @!visibility private
+    def check_params_positive(params = {})
+      params.reject { |_, v| v.nil? }.each { |k, v| raise ArgumentError, "Expect #{k} to be positive value" if v < 0 }
+      nil
+    end
   end
 end
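
The two validation helpers added above back the new guards in DecisionTreeClassifier: check_sample_label_size rejects mismatched sample and label arrays, and check_params_positive skips nil hyperparameters but rejects negative ones. A minimal sketch of calling them directly (they are internal module functions):

require 'svmkit'

x = Numo::DFloat.new(5, 2).rand
y = Numo::Int32[0, 1, 0, 1] # only 4 labels for 5 samples

begin
  SVMKit::Validation.check_sample_label_size(x, y)
rescue ArgumentError => e
  puts e.message
end

SVMKit::Validation.check_params_positive(max_depth: nil, min_samples_leaf: 1) # nil is skipped, passes
begin
  SVMKit::Validation.check_params_positive(max_depth: -1)
rescue ArgumentError => e
  puts e.message # "Expect max_depth to be positive value"
end
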
lib/svmkit/version.rb CHANGED
@@ -3,5 +3,5 @@
 # SVMKit is a machine learning library in Ruby.
 module SVMKit
   # @!visibility private
-  VERSION = '0.2.8'
+  VERSION = '0.2.9'.freeze
 end
svmkit.gemspec CHANGED
@@ -1,9 +1,8 @@
-# coding: utf-8
-lib = File.expand_path('../lib', __FILE__)
+
+lib = File.expand_path('lib', __dir__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'svmkit/version'
 
-
 Gem::Specification.new do |spec|
   spec.name = 'svmkit'
   spec.version = SVMKit::VERSION
@@ -33,12 +32,12 @@ MSG
 
   spec.required_ruby_version = '>= 2.1'
 
-  spec.add_runtime_dependency 'numo-narray', '~> 0.9.0'
+  spec.add_runtime_dependency 'numo-narray', '>= 0.9.0'
 
   spec.add_development_dependency 'bundler', '~> 1.16'
+  spec.add_development_dependency 'coveralls', '~> 0.8'
   spec.add_development_dependency 'rake', '~> 12.0'
   spec.add_development_dependency 'rspec', '~> 3.0'
-  spec.add_development_dependency 'coveralls', '~> 0.8'
 
   spec.post_install_message = <<MSG
 *************************************************************************
@@ -48,5 +47,4 @@ Note that the SVMKit has been changed to use Numo::NArray for
 linear algebra library from version 0.2.0.
 *************************************************************************
 MSG
-
 end
metadata CHANGED
@@ -1,27 +1,27 @@
 --- !ruby/object:Gem::Specification
 name: svmkit
 version: !ruby/object:Gem::Version
-  version: 0.2.8
+  version: 0.2.9
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-04-05 00:00:00.000000000 Z
+date: 2018-05-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
         version: 0.9.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
         version: 0.9.0
 - !ruby/object:Gem::Dependency
@@ -39,47 +39,47 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '1.16'
 - !ruby/object:Gem::Dependency
-  name: rake
+  name: coveralls
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '12.0'
+        version: '0.8'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '12.0'
+        version: '0.8'
 - !ruby/object:Gem::Dependency
-  name: rspec
+  name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '3.0'
+        version: '12.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '3.0'
+        version: '12.0'
 - !ruby/object:Gem::Dependency
-  name: coveralls
+  name: rspec
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.8'
+        version: '3.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.8'
+        version: '3.0'
 description: |
   SVMKit is a machine learning library in Ruby.
   SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
@@ -116,6 +116,7 @@ files:
 - lib/svmkit/ensemble/random_forest_classifier.rb
 - lib/svmkit/evaluation_measure/accuracy.rb
 - lib/svmkit/evaluation_measure/f_score.rb
+- lib/svmkit/evaluation_measure/log_loss.rb
 - lib/svmkit/evaluation_measure/precision.rb
 - lib/svmkit/evaluation_measure/precision_recall.rb
 - lib/svmkit/evaluation_measure/recall.rb
@@ -132,8 +133,11 @@ files:
 - lib/svmkit/pairwise_metric.rb
 - lib/svmkit/polynomial_model/factorization_machine_classifier.rb
 - lib/svmkit/preprocessing/l2_normalizer.rb
+- lib/svmkit/preprocessing/label_encoder.rb
 - lib/svmkit/preprocessing/min_max_scaler.rb
+- lib/svmkit/preprocessing/one_hot_encoder.rb
 - lib/svmkit/preprocessing/standard_scaler.rb
+- lib/svmkit/probabilistic_output.rb
 - lib/svmkit/tree/decision_tree_classifier.rb
 - lib/svmkit/validation.rb
 - lib/svmkit/version.rb
@@ -164,7 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.5.4
+rubygems_version: 2.7.6
 signing_key:
 specification_version: 4
 summary: SVMKit is a machine learning library in Ruby. SVMKit provides machine learning