svmkit 0.2.8 → 0.2.9
This diff compares the publicly released contents of the two package versions as they appear in their public registries, and is provided for informational purposes only.
- checksums.yaml +5 -5
- data/.gitignore +4 -0
- data/.rubocop.yml +10 -1
- data/.rubocop_todo.yml +51 -10
- data/Gemfile +1 -1
- data/HISTORY.md +43 -33
- data/lib/svmkit.rb +4 -0
- data/lib/svmkit/base/classifier.rb +1 -0
- data/lib/svmkit/ensemble/random_forest_classifier.rb +5 -2
- data/lib/svmkit/evaluation_measure/log_loss.rb +44 -0
- data/lib/svmkit/kernel_approximation/rbf.rb +1 -1
- data/lib/svmkit/kernel_machine/kernel_svc.rb +40 -2
- data/lib/svmkit/linear_model/logistic_regression.rb +3 -1
- data/lib/svmkit/linear_model/svc.rb +46 -7
- data/lib/svmkit/model_selection/cross_validation.rb +9 -1
- data/lib/svmkit/model_selection/k_fold.rb +1 -1
- data/lib/svmkit/model_selection/stratified_k_fold.rb +3 -2
- data/lib/svmkit/multiclass/one_vs_rest_classifier.rb +1 -0
- data/lib/svmkit/naive_bayes/naive_bayes.rb +5 -0
- data/lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb +2 -0
- data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +4 -1
- data/lib/svmkit/preprocessing/label_encoder.rb +94 -0
- data/lib/svmkit/preprocessing/one_hot_encoder.rb +98 -0
- data/lib/svmkit/probabilistic_output.rb +112 -0
- data/lib/svmkit/tree/decision_tree_classifier.rb +80 -10
- data/lib/svmkit/validation.rb +12 -0
- data/lib/svmkit/version.rb +1 -1
- data/svmkit.gemspec +4 -6
- metadata +18 -14
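
At a glance, 0.2.9 adds probability estimation to the SVM classifiers (a new `probability:` option and `predict_proba` method on `SVMKit::LinearModel::SVC`, with matching growth in `kernel_machine/kernel_svc.rb`, both backed by the new `SVMKit::ProbabilisticOutput` sigmoid-fitting module), a log-loss metric (`evaluation_measure/log_loss.rb`) wired into cross validation, new `LabelEncoder` and `OneHotEncoder` preprocessors, and stricter argument validation (`check_params_positive`, `check_sample_label_size`) across estimators and splitters. The hunks below cover these changes file by file.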
data/lib/svmkit/linear_model/svc.rb

```diff
@@ -45,26 +45,30 @@ module SVMKit
       # @param bias_scale [Float] The scale of the bias term.
       # @param max_iter [Integer] The maximum number of iterations.
       # @param batch_size [Integer] The size of the mini batches.
+      # @param probability [Boolean] The flag indicating whether to perform probability estimation.
       # @param normalize [Boolean] The flag indicating whether to normalize the weight vector.
       # @param random_seed [Integer] The seed value using to initialize the random generator.
       def initialize(reg_param: 1.0, fit_bias: false, bias_scale: 1.0,
-                     max_iter: 100, batch_size: 50, normalize: true, random_seed: nil)
+                     max_iter: 100, batch_size: 50, probability: false, normalize: true, random_seed: nil)
         SVMKit::Validation.check_params_float(reg_param: reg_param, bias_scale: bias_scale)
         SVMKit::Validation.check_params_integer(max_iter: max_iter, batch_size: batch_size)
-        SVMKit::Validation.check_params_boolean(fit_bias: fit_bias, normalize: normalize)
+        SVMKit::Validation.check_params_boolean(fit_bias: fit_bias, probability: probability, normalize: normalize)
         SVMKit::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
-
+        SVMKit::Validation.check_params_positive(reg_param: reg_param, bias_scale: bias_scale, max_iter: max_iter,
+                                                 batch_size: batch_size)
         @params = {}
         @params[:reg_param] = reg_param
         @params[:fit_bias] = fit_bias
         @params[:bias_scale] = bias_scale
         @params[:max_iter] = max_iter
         @params[:batch_size] = batch_size
+        @params[:probability] = probability
         @params[:normalize] = normalize
         @params[:random_seed] = random_seed
         @params[:random_seed] ||= srand
         @weight_vec = nil
         @bias_term = nil
+        @prob_param = nil
         @classes = nil
         @rng = Random.new(@params[:random_seed])
       end
```
```diff
@@ -77,6 +81,7 @@ module SVMKit
       def fit(x, y)
         SVMKit::Validation.check_sample_array(x)
         SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)

         @classes = Numo::Int32[*y.to_a.uniq.sort]
         n_classes = @classes.size
```
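
Still in svc.rb: the new `check_sample_label_size` guard rejects mismatched sample and label arrays before any training work happens. A minimal sketch of the new failure mode (assuming the validator raises `ArgumentError` like svmkit's other checks; the `validation.rb` hunk itself is not shown in this diff):

```ruby
require 'svmkit'

samples = Numo::DFloat.new(5, 2).rand # 5 samples...
labels  = Numo::Int32[1, -1, 1]       # ...but only 3 labels

# Assumed behavior: fit now fails fast instead of training on inconsistent data.
SVMKit::LinearModel::SVC.new.fit(samples, labels) # => raises ArgumentError
```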
```diff
@@ -85,16 +90,27 @@
         if n_classes > 2
           @weight_vec = Numo::DFloat.zeros(n_classes, n_features)
           @bias_term = Numo::DFloat.zeros(n_classes)
+          @prob_param = Numo::DFloat.zeros(n_classes, 2)
           n_classes.times do |n|
             bin_y = Numo::Int32.cast(y.eq(@classes[n])) * 2 - 1
             weight, bias = binary_fit(x, bin_y)
             @weight_vec[n, true] = weight
             @bias_term[n] = bias
+            @prob_param[n, true] = if @params[:probability]
+                                     SVMKit::ProbabilisticOutput.fit_sigmoid(x.dot(weight.transpose) + bias, bin_y)
+                                   else
+                                     Numo::DFloat[1, 0]
+                                   end
           end
         else
           negative_label = y.to_a.uniq.sort.first
           bin_y = Numo::Int32.cast(y.ne(negative_label)) * 2 - 1
           @weight_vec, @bias_term = binary_fit(x, bin_y)
+          @prob_param = if @params[:probability]
+                          SVMKit::ProbabilisticOutput.fit_sigmoid(x.dot(@weight_vec.transpose) + @bias_term, bin_y)
+                        else
+                          Numo::DFloat[1, 0]
+                        end
         end

         self
```
```diff
@@ -124,12 +140,32 @@
         Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[decision_values[n, true].max_index] })
       end

+      # Predict probability for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
+      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
+      def predict_proba(x)
+        SVMKit::Validation.check_sample_array(x)
+
+        if @classes.size > 2
+          probs = 1.0 / (Numo::NMath.exp(@prob_param[true, 0] * decision_function(x) + @prob_param[true, 1]) + 1.0)
+          return (probs.transpose / probs.sum(axis: 1)).transpose
+        end
+
+        n_samples, = x.shape
+        probs = Numo::DFloat.zeros(n_samples, 2)
+        probs[true, 1] = 1.0 / (Numo::NMath.exp(@prob_param[0] * decision_function(x) + @prob_param[1]) + 1.0)
+        probs[true, 0] = 1.0 - probs[true, 1]
+        probs
+      end
+
       # Dump marshal data.
       # @return [Hash] The marshal data about SVC.
       def marshal_dump
         { params: @params,
           weight_vec: @weight_vec,
           bias_term: @bias_term,
+          prob_param: @prob_param,
           classes: @classes,
           rng: @rng }
       end
```
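
Taken together, the constructor flag and the two fit branches give SVC a scikit-learn-style `predict_proba`. A minimal usage sketch on toy data (the option and method names come straight from the hunks above):

```ruby
require 'svmkit'

# Two tiny, linearly separable classes.
x = Numo::DFloat[[-2.0, -1.9], [-1.6, -2.1], [1.8, 2.2], [2.1, 1.7]]
y = Numo::Int32[-1, -1, 1, 1]

# probability: true makes fit also run Platt scaling
# (SVMKit::ProbabilisticOutput.fit_sigmoid) on the decision values.
svc = SVMKit::LinearModel::SVC.new(probability: true, random_seed: 1)
svc.fit(x, y)

svc.predict_proba(x) # Numo::DFloat of shape [4, 2]; columns follow the sorted class labels
```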
```diff
@@ -140,6 +176,7 @@ module SVMKit
         @params = obj[:params]
         @weight_vec = obj[:weight_vec]
         @bias_term = obj[:bias_term]
+        @prob_param = obj[:prob_param]
         @classes = obj[:classes]
         @rng = obj[:rng]
         nil
```
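
Because `prob_param` is included in `marshal_dump` and restored in `marshal_load`, a probability-enabled model survives serialization (continuing the sketch above):

```ruby
restored = Marshal.load(Marshal.dump(svc)) # round-trips @prob_param along with the weights
restored.predict_proba(x)                  # same probabilities as before dumping
```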
```diff
@@ -159,11 +196,13 @@ module SVMKit
           # random sampling
           subset_ids = rand_ids.shift(@params[:batch_size])
           rand_ids.concat(subset_ids)
-
-
-
+          sub_samples = samples[subset_ids, true]
+          sub_bin_y = bin_y[subset_ids]
+          target_ids = (sub_samples.dot(weight_vec.transpose) * sub_bin_y).lt(1.0).where
+          n_targets = target_ids.size
+          next if n_targets.zero?
           # update the weight vector.
-          mean_vec =
+          mean_vec = sub_samples[target_ids, true].transpose.dot(sub_bin_y[target_ids]) / n_targets
           weight_vec -= learning_rate(t) * (@params[:reg_param] * weight_vec - mean_vec)
           # scale the weight vector.
           normalize_weight_vec(weight_vec) if @params[:normalize]
```
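
The rewritten mini-batch step is the usual Pegasos-style subgradient update, now vectorized over the whole batch. With the margin violators $V = \{i : y_i\, w^\top x_i < 1\}$, the update reads

$$
w \leftarrow w - \eta_t \Bigl( \lambda w - \frac{1}{|V|} \sum_{i \in V} y_i x_i \Bigr),
$$

which is exactly `weight_vec -= learning_rate(t) * (@params[:reg_param] * weight_vec - mean_vec)`, with `mean_vec` the violator mean of `y_i * x_i` computed by the new vectorized lines.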
data/lib/svmkit/model_selection/cross_validation.rb

```diff
@@ -62,6 +62,7 @@ module SVMKit
       def perform(x, y)
         SVMKit::Validation.check_sample_array(x)
         SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)
         # Initialize the report of cross validation.
         report = { test_score: [], train_score: nil, fit_time: [] }
         report[:train_score] = [] if @return_train_score
@@ -81,9 +82,12 @@ module SVMKit
           if @evaluator.nil?
             report[:test_score].push(@estimator.score(test_x, test_y))
             report[:train_score].push(@estimator.score(train_x, train_y)) if @return_train_score
+          elsif log_loss?
+            report[:test_score].push(@evaluator.score(test_y, @estimator.predict_proba(test_x)))
+            report[:train_score].push(@evaluator.score(train_y, @estimator.predict_proba(train_x))) if @return_train_score
           else
             report[:test_score].push(@evaluator.score(test_y, @estimator.predict(test_x)))
-            report[:train_score].push(@
+            report[:train_score].push(@evaluator.score(train_y, @estimator.predict(train_x))) if @return_train_score
           end
         end
         report
@@ -96,6 +100,10 @@ module SVMKit
         class_name = @estimator.params[:estimator].class.to_s if class_name.include?('Multiclass')
         class_name.include?('KernelMachine')
       end
+
+      def log_loss?
+        @evaluator.is_a?(SVMKit::EvaluationMeasure::LogLoss)
+      end
     end
   end
 end
```
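
With the `log_loss?` branch in place, any estimator exposing `predict_proba` can be cross-validated on log loss directly. A sketch reusing the toy `x` and `y` from the SVC example above (the `CrossValidation` keyword arguments are inferred from the instance variables in the hunks, so treat the exact constructor signature as an assumption):

```ruby
svc = SVMKit::LinearModel::SVC.new(probability: true, random_seed: 1)
skf = SVMKit::ModelSelection::StratifiedKFold.new(n_splits: 2, shuffle: true, random_seed: 1)

# Assumed keyword names: estimator, splitter, evaluator.
cv = SVMKit::ModelSelection::CrossValidation.new(estimator: svc, splitter: skf,
                                                 evaluator: SVMKit::EvaluationMeasure::LogLoss.new)
report = cv.perform(x, y) # test folds are now scored via predict_proba
report[:test_score].inject(:+) / report[:test_score].size # mean log loss
```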
data/lib/svmkit/model_selection/k_fold.rb

```diff
@@ -35,7 +35,7 @@ module SVMKit
         SVMKit::Validation.check_params_integer(n_splits: n_splits)
         SVMKit::Validation.check_params_boolean(shuffle: shuffle)
         SVMKit::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
-
+        SVMKit::Validation.check_params_positive(n_splits: n_splits)
         @n_splits = n_splits
         @shuffle = shuffle
         @random_seed = random_seed
```
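
The practical effect of this one-liner (and the identical one in stratified_k_fold.rb next): a non-positive `n_splits` now fails at construction time instead of silently yielding no folds. Assuming `check_params_positive` raises `ArgumentError` (its implementation in validation.rb is not shown here):

```ruby
SVMKit::ModelSelection::KFold.new(n_splits: 0) # => assumed to raise ArgumentError up front
```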
data/lib/svmkit/model_selection/stratified_k_fold.rb

```diff
@@ -35,7 +35,7 @@ module SVMKit
         SVMKit::Validation.check_params_integer(n_splits: n_splits)
         SVMKit::Validation.check_params_boolean(shuffle: shuffle)
         SVMKit::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
-
+        SVMKit::Validation.check_params_positive(n_splits: n_splits)
         @n_splits = n_splits
         @shuffle = shuffle
         @random_seed = random_seed
@@ -51,9 +51,10 @@ module SVMKit
       # @param y [Numo::Int32] (shape: [n_samples])
       #   The labels to be used to generate data indices for stratified K-fold cross validation.
       # @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
-      def split(x, y)
+      def split(x, y)
         SVMKit::Validation.check_sample_array(x)
         SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)
         # Check the number of samples in each class.
         unless valid_n_splits?(y)
           raise ArgumentError,
```
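
For orientation, `split` returns one `[train_ids, test_ids]` pair per fold (per its docstring), so the newly guarded method is typically driven like this (sketch, again with the toy data from the SVC example):

```ruby
skf = SVMKit::ModelSelection::StratifiedKFold.new(n_splits: 2, shuffle: true, random_seed: 1)
skf.split(x, y).each do |train_ids, test_ids|
  model = SVMKit::LinearModel::SVC.new(random_seed: 1)
  model.fit(x[train_ids, true], y[train_ids])
  puts model.score(x[test_ids, true], y[test_ids])
end
```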
data/lib/svmkit/multiclass/one_vs_rest_classifier.rb

```diff
@@ -48,6 +48,7 @@ module SVMKit
       def fit(x, y)
         SVMKit::Validation.check_sample_array(x)
         SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)
         y_arr = y.to_a
         @classes = Numo::Int32.asarray(y_arr.uniq.sort)
         @estimators = @classes.to_a.map do |label|
```
data/lib/svmkit/naive_bayes/naive_bayes.rb

```diff
@@ -80,6 +80,7 @@ module SVMKit
       def fit(x, y)
         SVMKit::Validation.check_sample_array(x)
         SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)
         n_samples, = x.shape
         @classes = Numo::Int32[*y.to_a.uniq.sort]
         @class_priors = Numo::DFloat[*@classes.to_a.map { |l| y.eq(l).count / n_samples.to_f }]
@@ -154,6 +155,7 @@ module SVMKit
       # @param smoothing_param [Float] The Laplace smoothing parameter.
       def initialize(smoothing_param: 1.0)
         SVMKit::Validation.check_params_float(smoothing_param: smoothing_param)
+        SVMKit::Validation.check_params_positive(smoothing_param: smoothing_param)
         @params = {}
         @params[:smoothing_param] = smoothing_param
       end
@@ -167,6 +169,7 @@ module SVMKit
       def fit(x, y)
         SVMKit::Validation.check_sample_array(x)
         SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)
         n_samples, = x.shape
         @classes = Numo::Int32[*y.to_a.uniq.sort]
         @class_priors = Numo::DFloat[*@classes.to_a.map { |l| y.eq(l).count / n_samples.to_f }]
@@ -241,6 +244,7 @@ module SVMKit
       # @param bin_threshold [Float] The threshold for binarizing of features.
       def initialize(smoothing_param: 1.0, bin_threshold: 0.0)
         SVMKit::Validation.check_params_float(smoothing_param: smoothing_param, bin_threshold: bin_threshold)
+        SVMKit::Validation.check_params_positive(smoothing_param: smoothing_param)
         @params = {}
         @params[:smoothing_param] = smoothing_param
         @params[:bin_threshold] = bin_threshold
@@ -255,6 +259,7 @@ module SVMKit
       def fit(x, y)
         SVMKit::Validation.check_sample_array(x)
         SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)
         n_samples, = x.shape
         bin_x = Numo::DFloat[*x.gt(@params[:bin_threshold])]
         @classes = Numo::Int32[*y.to_a.uniq.sort]
```
data/lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb

```diff
@@ -36,6 +36,7 @@ module SVMKit
       # @param n_neighbors [Integer] The number of neighbors.
       def initialize(n_neighbors: 5)
         SVMKit::Validation.check_params_integer(n_neighbors: n_neighbors)
+        SVMKit::Validation.check_params_positive(n_neighbors: n_neighbors)
         @params = {}
         @params[:n_neighbors] = n_neighbors
         @prototypes = nil
@@ -51,6 +52,7 @@ module SVMKit
       def fit(x, y)
         SVMKit::Validation.check_sample_array(x)
         SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)
         @prototypes = Numo::DFloat.asarray(x.to_a)
         @labels = Numo::Int32.asarray(y.to_a)
         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
```
data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb

```diff
@@ -63,7 +63,9 @@ module SVMKit
         SVMKit::Validation.check_params_integer(n_factors: n_factors, max_iter: max_iter, batch_size: batch_size)
         SVMKit::Validation.check_params_string(loss: loss)
         SVMKit::Validation.check_params_type_or_nil(Integer, random_seed: random_seed)
-
+        SVMKit::Validation.check_params_positive(n_factors: n_factors, reg_param_bias: reg_param_bias,
+                                                 reg_param_weight: reg_param_weight, reg_param_factor: reg_param_factor,
+                                                 max_iter: max_iter, batch_size: batch_size)
         @params = {}
         @params[:n_factors] = n_factors
         @params[:loss] = loss
@@ -90,6 +92,7 @@ module SVMKit
       def fit(x, y)
         SVMKit::Validation.check_sample_array(x)
         SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)

         @classes = Numo::Int32[*y.to_a.uniq.sort]
         n_classes = @classes.size
```
data/lib/svmkit/preprocessing/label_encoder.rb (new file)

```diff
@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+
+require 'svmkit/base/base_estimator'
+require 'svmkit/base/transformer'
+
+module SVMKit
+  module Preprocessing
+    # Encode labels to values between 0 and n_classes - 1.
+    #
+    # @example
+    #   encoder = SVMKit::Preprocessing::LabelEncoder.new
+    #   labels = Numo::Int32[1, 8, 8, 15, 0]
+    #   encoded_labels = encoder.fit_transform(labels)
+    #   # > pp encoded_labels
+    #   # Numo::Int32#shape=[5]
+    #   # [1, 2, 2, 3, 0]
+    #   decoded_labels = encoder.inverse_transform(encoded_labels)
+    #   # > pp decoded_labels
+    #   # [1, 8, 8, 15, 0]
+    class LabelEncoder
+      include Base::BaseEstimator
+      include Base::Transformer
+
+      # Return the class labels.
+      # @return [Array] (size: [n_classes])
+      attr_reader :classes
+
+      # Create a new encoder for encoding labels to values between 0 and n_classes - 1.
+      def initialize
+        @params = {}
+        @classes = nil
+      end
+
+      # Fit label-encoder to labels.
+      #
+      # @overload fit(x) -> LabelEncoder
+      #
+      # @param x [Array] (shape: [n_samples]) The labels to fit label-encoder.
+      # @return [LabelEncoder]
+      def fit(x, _y = nil)
+        x = x.to_a if x.is_a?(Numo::NArray)
+        SVMKit::Validation.check_params_type(Array, x: x)
+        @classes = x.sort.uniq
+        self
+      end
+
+      # Fit label-encoder to labels, then return encoded labels.
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Array] (shape: [n_samples]) The labels to fit label-encoder.
+      # @return [Numo::Int32] The encoded labels.
+      def fit_transform(x, _y = nil)
+        x = x.to_a if x.is_a?(Numo::NArray)
+        SVMKit::Validation.check_params_type(Array, x: x)
+        fit(x).transform(x)
+      end
+
+      # Encode labels.
+      #
+      # @param x [Array] (shape: [n_samples]) The labels to be encoded.
+      # @return [Numo::Int32] The encoded labels.
+      def transform(x)
+        x = x.to_a if x.is_a?(Numo::NArray)
+        SVMKit::Validation.check_params_type(Array, x: x)
+        Numo::Int32[*(x.map { |v| @classes.index(v) })]
+      end
+
+      # Decode encoded labels.
+      #
+      # @param x [Numo::Int32] (shape: [n_samples]) The labels to be decoded.
+      # @return [Array] The decoded labels.
+      def inverse_transform(x)
+        SVMKit::Validation.check_label_array(x)
+        x.to_a.map { |n| @classes[n] }
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about LabelEncoder
+      def marshal_dump
+        { params: @params,
+          classes: @classes }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @classes = obj[:classes]
+        nil
+      end
+    end
+  end
+end
```
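
Beyond the inline `@example`, the typical pattern is to encode labels before training and decode predictions afterwards. A sketch (values arbitrary; the results below follow directly from the code above):

```ruby
encoder = SVMKit::Preprocessing::LabelEncoder.new
encoded = encoder.fit_transform([10, 40, 40, 20]) # => Numo::Int32[0, 2, 2, 1]
# ...train any classifier on `encoded`, then map its predictions back:
encoder.inverse_transform(Numo::Int32[2, 0])      # => [40, 10]
```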
data/lib/svmkit/preprocessing/one_hot_encoder.rb (new file)

```diff
@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+
+require 'svmkit/base/base_estimator'
+require 'svmkit/base/transformer'
+
+module SVMKit
+  module Preprocessing
+    # Encode categorical integer features to one-hot-vectors.
+    #
+    # @example
+    #   encoder = SVMKit::Preprocessing::OneHotEncoder.new
+    #   labels = Numo::Int32[0, 0, 2, 3, 2, 1]
+    #   one_hot_vectors = encoder.fit_transform(labels)
+    #   # > pp one_hot_vectors
+    #   # Numo::DFloat#shape[6, 4]
+    #   # [[1, 0, 0, 0],
+    #   #  [1, 0, 0, 0],
+    #   #  [0, 0, 1, 0],
+    #   #  [0, 0, 0, 1],
+    #   #  [0, 0, 1, 0],
+    #   #  [0, 1, 0, 0]]
+    class OneHotEncoder
+      include Base::BaseEstimator
+      include Base::Transformer
+
+      # Return the maximum values for each feature.
+      # @return [Numo::Int32] (shape: [n_features])
+      attr_reader :n_values
+
+      # Return the indices to feature ranges.
+      # @return [Numo::Int32] (shape: [n_features + 1])
+      attr_reader :feature_indices
+
+      # Create a new encoder for encoding categorical integer features to one-hot-vectors
+      def initialize
+        @params = {}
+        @n_values = nil
+        @feature_indices = nil
+      end
+
+      # Fit one-hot-encoder to samples.
+      #
+      # @overload fit(x) -> OneHotEncoder
+      #
+      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to fit one-hot-encoder.
+      # @return [OneHotEncoder]
+      def fit(x, _y = nil)
+        SVMKit::Validation.check_params_type(Numo::Int32, x: x)
+        @n_values = x.max(0) + 1
+        @feature_indices = Numo::Int32.hstack([[0], @n_values]).cumsum
+        self
+      end
+
+      # Fit one-hot-encoder to samples, then encode samples into one-hot-vectors
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
+      # @return [Numo::DFloat] The one-hot-vectors.
+      def fit_transform(x, _y = nil)
+        SVMKit::Validation.check_params_type(Numo::Int32, x: x)
+        fit(x).transform(x)
+      end
+
+      # Encode samples into one-hot-vectors.
+      #
+      # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
+      # @return [Numo::DFloat] The one-hot-vectors.
+      def transform(x)
+        SVMKit::Validation.check_params_type(Numo::Int32, x: x)
+        n_samples, n_features = x.shape
+        n_features = 1 if n_features.nil?
+        column_indices = (x + @feature_indices[0...-1]).flatten.to_a
+        row_indices = Numo::Int32.new(n_samples).seq.repeat(n_features).to_a
+        codes = Numo::DFloat.zeros(n_samples, @feature_indices[-1])
+        row_indices.zip(column_indices).each { |r, c| codes[r, c] = 1.0 }
+        codes
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data about OneHotEncoder.
+      def marshal_dump
+        { params: @params,
+          n_values: @n_values,
+          feature_indices: @feature_indices }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @n_values = obj[:n_values]
+        @feature_indices = obj[:feature_indices]
+        nil
+      end
+    end
+  end
+end
```
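
The two new preprocessors also compose: `LabelEncoder` maps arbitrary category values onto 0-based integers, exactly the input `OneHotEncoder` expects. A closing sketch:

```ruby
raw     = [3, 3, 9, 7]
ints    = SVMKit::Preprocessing::LabelEncoder.new.fit_transform(raw) # Numo::Int32[0, 0, 2, 1]
one_hot = SVMKit::Preprocessing::OneHotEncoder.new.fit_transform(ints)
# => 4x3 Numo::DFloat: [[1,0,0], [1,0,0], [0,0,1], [0,1,0]]
```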