svmkit 0.7.3 → 0.8.1

This diff shows the changes between package versions as published to their public registries and is provided for informational purposes only.
Files changed (78)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -9
  3. data/.rspec +1 -0
  4. data/.travis.yml +4 -12
  5. data/LICENSE.txt +1 -1
  6. data/README.md +11 -13
  7. data/lib/svmkit.rb +3 -66
  8. data/svmkit.gemspec +12 -7
  9. metadata +16 -81
  10. data/.coveralls.yml +0 -1
  11. data/.rubocop.yml +0 -47
  12. data/.rubocop_todo.yml +0 -58
  13. data/HISTORY.md +0 -168
  14. data/lib/svmkit/base/base_estimator.rb +0 -13
  15. data/lib/svmkit/base/classifier.rb +0 -34
  16. data/lib/svmkit/base/cluster_analyzer.rb +0 -29
  17. data/lib/svmkit/base/evaluator.rb +0 -13
  18. data/lib/svmkit/base/regressor.rb +0 -34
  19. data/lib/svmkit/base/splitter.rb +0 -17
  20. data/lib/svmkit/base/transformer.rb +0 -18
  21. data/lib/svmkit/clustering/dbscan.rb +0 -127
  22. data/lib/svmkit/clustering/k_means.rb +0 -140
  23. data/lib/svmkit/dataset.rb +0 -109
  24. data/lib/svmkit/decomposition/nmf.rb +0 -147
  25. data/lib/svmkit/decomposition/pca.rb +0 -150
  26. data/lib/svmkit/ensemble/ada_boost_classifier.rb +0 -198
  27. data/lib/svmkit/ensemble/ada_boost_regressor.rb +0 -180
  28. data/lib/svmkit/ensemble/random_forest_classifier.rb +0 -182
  29. data/lib/svmkit/ensemble/random_forest_regressor.rb +0 -143
  30. data/lib/svmkit/evaluation_measure/accuracy.rb +0 -30
  31. data/lib/svmkit/evaluation_measure/f_score.rb +0 -51
  32. data/lib/svmkit/evaluation_measure/log_loss.rb +0 -46
  33. data/lib/svmkit/evaluation_measure/mean_absolute_error.rb +0 -30
  34. data/lib/svmkit/evaluation_measure/mean_squared_error.rb +0 -30
  35. data/lib/svmkit/evaluation_measure/normalized_mutual_information.rb +0 -63
  36. data/lib/svmkit/evaluation_measure/precision.rb +0 -51
  37. data/lib/svmkit/evaluation_measure/precision_recall.rb +0 -91
  38. data/lib/svmkit/evaluation_measure/purity.rb +0 -41
  39. data/lib/svmkit/evaluation_measure/r2_score.rb +0 -44
  40. data/lib/svmkit/evaluation_measure/recall.rb +0 -51
  41. data/lib/svmkit/kernel_approximation/rbf.rb +0 -136
  42. data/lib/svmkit/kernel_machine/kernel_svc.rb +0 -194
  43. data/lib/svmkit/linear_model/lasso.rb +0 -138
  44. data/lib/svmkit/linear_model/linear_regression.rb +0 -112
  45. data/lib/svmkit/linear_model/logistic_regression.rb +0 -161
  46. data/lib/svmkit/linear_model/ridge.rb +0 -112
  47. data/lib/svmkit/linear_model/sgd_linear_estimator.rb +0 -89
  48. data/lib/svmkit/linear_model/svc.rb +0 -184
  49. data/lib/svmkit/linear_model/svr.rb +0 -123
  50. data/lib/svmkit/model_selection/cross_validation.rb +0 -121
  51. data/lib/svmkit/model_selection/grid_search_cv.rb +0 -247
  52. data/lib/svmkit/model_selection/k_fold.rb +0 -77
  53. data/lib/svmkit/model_selection/stratified_k_fold.rb +0 -95
  54. data/lib/svmkit/multiclass/one_vs_rest_classifier.rb +0 -101
  55. data/lib/svmkit/naive_bayes/naive_bayes.rb +0 -316
  56. data/lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb +0 -112
  57. data/lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb +0 -94
  58. data/lib/svmkit/optimizer/nadam.rb +0 -90
  59. data/lib/svmkit/optimizer/rmsprop.rb +0 -69
  60. data/lib/svmkit/optimizer/sgd.rb +0 -65
  61. data/lib/svmkit/optimizer/yellow_fin.rb +0 -144
  62. data/lib/svmkit/pairwise_metric.rb +0 -91
  63. data/lib/svmkit/pipeline/pipeline.rb +0 -197
  64. data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +0 -262
  65. data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb +0 -194
  66. data/lib/svmkit/preprocessing/l2_normalizer.rb +0 -63
  67. data/lib/svmkit/preprocessing/label_encoder.rb +0 -95
  68. data/lib/svmkit/preprocessing/min_max_scaler.rb +0 -93
  69. data/lib/svmkit/preprocessing/one_hot_encoder.rb +0 -99
  70. data/lib/svmkit/preprocessing/standard_scaler.rb +0 -87
  71. data/lib/svmkit/probabilistic_output.rb +0 -112
  72. data/lib/svmkit/tree/decision_tree_classifier.rb +0 -276
  73. data/lib/svmkit/tree/decision_tree_regressor.rb +0 -251
  74. data/lib/svmkit/tree/node.rb +0 -70
  75. data/lib/svmkit/utils.rb +0 -22
  76. data/lib/svmkit/validation.rb +0 -79
  77. data/lib/svmkit/values.rb +0 -13
  78. data/lib/svmkit/version.rb +0 -7
data/lib/svmkit/tree/decision_tree_regressor.rb
@@ -1,251 +0,0 @@
- # frozen_string_literal: true
-
- require 'svmkit/validation'
- require 'svmkit/base/base_estimator'
- require 'svmkit/base/regressor'
- require 'svmkit/tree/node'
-
- module SVMKit
-   module Tree
-     # DecisionTreeRegressor is a class that implements decision tree for regression.
-     #
-     # @example
-     #   estimator =
-     #     SVMKit::Tree::DecisionTreeRegressor.new(
-     #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
-     #   estimator.fit(training_samples, training_values)
-     #   results = estimator.predict(testing_samples)
-     #
-     class DecisionTreeRegressor
-       include Base::BaseEstimator
-       include Base::Regressor
-       include Validation
-
-       # Return the importance for each feature.
-       # @return [Numo::DFloat] (size: n_features)
-       attr_reader :feature_importances
-
-       # Return the learned tree.
-       # @return [Node]
-       attr_reader :tree
-
-       # Return the random generator for random selection of feature index.
-       # @return [Random]
-       attr_reader :rng
-
-       # Return the values assigned to each leaf.
-       # @return [Numo::DFloat] (shape: [n_leaves, n_outputs])
-       attr_reader :leaf_values
-
-       # Create a new regressor with decision tree algorithm.
-       #
-       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
-       # @param max_depth [Integer] The maximum depth of the tree.
-       #   If nil is given, decision tree grows without concern for depth.
-       # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
-       #   If nil is given, number of leaves is not limited.
-       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
-       # @param max_features [Integer] The number of features to consider when searching optimal split point.
-       #   If nil is given, split process considers all features.
-       # @param random_seed [Integer] The seed value used to initialize the random generator.
-       #   It is used to randomly determine the order of features when deciding the splitting point.
-       def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
-                      random_seed: nil)
-         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                                           max_features: max_features, random_seed: random_seed)
-         check_params_integer(min_samples_leaf: min_samples_leaf)
-         check_params_string(criterion: criterion)
-         check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                               min_samples_leaf: min_samples_leaf, max_features: max_features)
-         @params = {}
-         @params[:criterion] = criterion
-         @params[:max_depth] = max_depth
-         @params[:max_leaf_nodes] = max_leaf_nodes
-         @params[:min_samples_leaf] = min_samples_leaf
-         @params[:max_features] = max_features
-         @params[:random_seed] = random_seed
-         @params[:random_seed] ||= srand
-         @criterion = :mse
-         @criterion = :mae if @params[:criterion] == 'mae'
-         @tree = nil
-         @feature_importances = nil
-         @n_leaves = nil
-         @leaf_values = nil
-         @rng = Random.new(@params[:random_seed])
-       end
-
-       # Fit the model with given training data.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
-       # @return [DecisionTreeRegressor] The learned regressor itself.
-       def fit(x, y)
-         check_sample_array(x)
-         check_tvalue_array(y)
-         check_sample_tvalue_size(x, y)
-         single_target = y.shape[1].nil?
-         y = y.expand_dims(1) if single_target
-         n_samples, n_features = x.shape
-         @params[:max_features] = n_features if @params[:max_features].nil?
-         @params[:max_features] = [@params[:max_features], n_features].min
-         build_tree(x, y)
-         @leaf_values = @leaf_values[true] if single_target
-         eval_importance(n_samples, n_features)
-         self
-       end
-
-       # Predict values for samples.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
-       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
-       def predict(x)
-         check_sample_array(x)
-         @leaf_values.shape[1].nil? ? @leaf_values[apply(x)] : @leaf_values[apply(x), true]
-       end
-
-       # Return the index of the leaf that each sample reached.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
-       # @return [Numo::Int32] (shape: [n_samples]) Leaf index for each sample.
-       def apply(x)
-         check_sample_array(x)
-         Numo::Int32[*(Array.new(x.shape[0]) { |n| apply_at_node(@tree, x[n, true]) })]
-       end
-
-       # Dump marshal data.
-       # @return [Hash] The marshal data about DecisionTreeRegressor
-       def marshal_dump
-         { params: @params,
-           criterion: @criterion,
-           tree: @tree,
-           feature_importances: @feature_importances,
-           leaf_values: @leaf_values,
-           rng: @rng }
-       end
-
-       # Load marshal data.
-       # @return [nil]
-       def marshal_load(obj)
-         @params = obj[:params]
-         @criterion = obj[:criterion]
-         @tree = obj[:tree]
-         @feature_importances = obj[:feature_importances]
-         @leaf_values = obj[:leaf_values]
-         @rng = obj[:rng]
-         nil
-       end
-
-       private
-
-       def apply_at_node(node, sample)
-         return node.leaf_id if node.leaf
-         return apply_at_node(node.left, sample) if node.right.nil?
-         return apply_at_node(node.right, sample) if node.left.nil?
-         if sample[node.feature_id] <= node.threshold
-           apply_at_node(node.left, sample)
-         else
-           apply_at_node(node.right, sample)
-         end
-       end
-
-       def build_tree(x, y)
-         @n_leaves = 0
-         @leaf_values = []
-         @tree = grow_node(0, x, y, impurity(y))
-         @leaf_values = Numo::DFloat.cast(@leaf_values)
-         nil
-       end
-
-       def grow_node(depth, x, y, whole_impurity)
-         unless @params[:max_leaf_nodes].nil?
-           return nil if @n_leaves >= @params[:max_leaf_nodes]
-         end
-
-         n_samples, n_features = x.shape
-         return nil if n_samples <= @params[:min_samples_leaf]
-
-         node = Node.new(depth: depth, impurity: whole_impurity, n_samples: n_samples)
-
-         return put_leaf(node, y) if (y - y.mean(0)).sum.abs.zero?
-
-         unless @params[:max_depth].nil?
-           return put_leaf(node, y) if depth == @params[:max_depth]
-         end
-
-         feature_id, threshold, left_ids, right_ids, left_impurity, right_impurity, gain =
-           rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y, whole_impurity)] }.max_by(&:last)
-
-         return put_leaf(node, y) if gain.nil? || gain.zero?
-
-         node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_impurity)
-         node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_impurity)
-
-         return put_leaf(node, y) if node.left.nil? && node.right.nil?
-
-         node.feature_id = feature_id
-         node.threshold = threshold
-         node.leaf = false
-         node
-       end
-
-       def put_leaf(node, values)
-         node.probs = nil
-         node.leaf = true
-         node.leaf_id = @n_leaves
-         @n_leaves += 1
-         @leaf_values.push(values.mean(0))
-         node
-       end
-
-       def rand_ids(n)
-         [*0...n].sample(@params[:max_features], random: @rng)
-       end
-
-       def best_split(features, values, whole_impurity)
-         n_samples = values.shape[0]
-         features.to_a.uniq.sort.each_cons(2).map do |l, r|
-           threshold = 0.5 * (l + r)
-           left_ids = features.le(threshold).where
-           right_ids = features.gt(threshold).where
-           left_impurity = impurity(values[left_ids, true])
-           right_impurity = impurity(values[right_ids, true])
-           gain = whole_impurity -
-                  left_impurity * left_ids.size.fdiv(n_samples) -
-                  right_impurity * right_ids.size.fdiv(n_samples)
-           [threshold, left_ids, right_ids, left_impurity, right_impurity, gain]
-         end.max_by(&:last)
-       end
-
-       def impurity(values)
-         send(@criterion, values)
-       end
-
-       def mse(values)
-         ((values - values.mean(0))**2).mean
-       end
-
-       def mae(values)
-         (values - values.mean(0)).abs.mean
-       end
-
-       def eval_importance(n_samples, n_features)
-         @feature_importances = Numo::DFloat.zeros(n_features)
-         eval_importance_at_node(@tree)
-         @feature_importances /= n_samples
-         normalizer = @feature_importances.sum
-         @feature_importances /= normalizer if normalizer > 0.0
-         nil
-       end
-
-       def eval_importance_at_node(node)
-         return nil if node.leaf
-         return nil if node.left.nil? || node.right.nil?
-         gain = node.n_samples * node.impurity -
-                node.left.n_samples * node.left.impurity - node.right.n_samples * node.right.impurity
-         @feature_importances[node.feature_id] += gain
-         eval_importance_at_node(node.left)
-         eval_importance_at_node(node.right)
-       end
-     end
-   end
- end
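The removed `best_split` above enumerates midpoints between consecutive unique feature values and keeps the threshold that maximizes the impurity gain (variance reduction under the default `'mse'` criterion). The following is a minimal plain-Ruby sketch of that computation, using Arrays instead of `Numo::DFloat`; the helper names mirror the private methods above but the data is made up:

```ruby
# Variance of a list of target values (the 'mse' criterion above).
def mse(values)
  mean = values.sum.fdiv(values.size)
  values.sum { |v| (v - mean)**2 }.fdiv(values.size)
end

# Try every midpoint between consecutive unique feature values and
# return [threshold, gain] for the split with the largest impurity gain.
def best_split(features, values)
  n = values.size
  whole = mse(values)
  features.uniq.sort.each_cons(2).map do |l, r|
    threshold = 0.5 * (l + r)
    left  = values.each_index.select { |i| features[i] <= threshold }.map { |i| values[i] }
    right = values.each_index.select { |i| features[i] >  threshold }.map { |i| values[i] }
    gain  = whole - mse(left) * left.size.fdiv(n) - mse(right) * right.size.fdiv(n)
    [threshold, gain]
  end.max_by(&:last)
end

p best_split([1.0, 2.0, 3.0, 10.0], [1.0, 1.1, 0.9, 5.0])
# => threshold 6.5 with gain ~3.0: splitting between 3.0 and 10.0 reduces variance the most
```

In `grow_node`, the same search is repeated per feature (in a randomized order from `rand_ids`) and the best feature/threshold pair is kept.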
data/lib/svmkit/tree/node.rb
@@ -1,70 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   module Tree
-     # Node is a class that implements node used for construction of decision tree.
-     # This class is used for internal data structures.
-     class Node
-       # @!visibility private
-       attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
-
-       # Create a new node for decision tree.
-       #
-       # @param depth [Integer] The depth of the node in tree.
-       # @param impurity [Float] The impurity of the node.
-       # @param n_samples [Integer] The number of the samples in the node.
-       # @param probs [Float] The probability of the node.
-       # @param leaf [Boolean] The flag indicating whether the node is a leaf.
-       # @param leaf_id [Integer] The leaf index of the node.
-       # @param left [Node] The left node.
-       # @param right [Node] The right node.
-       # @param feature_id [Integer] The feature index used for evaluation.
-       # @param threshold [Float] The threshold value of the feature for splitting the node.
-       def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
-                      leaf: true, leaf_id: 0,
-                      left: nil, right: nil, feature_id: 0, threshold: 0.0)
-         @depth = depth
-         @impurity = impurity
-         @n_samples = n_samples
-         @probs = probs
-         @leaf = leaf
-         @leaf_id = leaf_id
-         @left = left
-         @right = right
-         @feature_id = feature_id
-         @threshold = threshold
-       end
-
-       # Dump marshal data.
-       # @return [Hash] The marshal data about Node
-       def marshal_dump
-         { depth: @depth,
-           impurity: @impurity,
-           n_samples: @n_samples,
-           probs: @probs,
-           leaf: @leaf,
-           leaf_id: @leaf_id,
-           left: @left,
-           right: @right,
-           feature_id: @feature_id,
-           threshold: @threshold }
-       end
-
-       # Load marshal data.
-       # @return [nil]
-       def marshal_load(obj)
-         @depth = obj[:depth]
-         @impurity = obj[:impurity]
-         @n_samples = obj[:n_samples]
-         @probs = obj[:probs]
-         @leaf = obj[:leaf]
-         @leaf_id = obj[:leaf_id]
-         @left = obj[:left]
-         @right = obj[:right]
-         @feature_id = obj[:feature_id]
-         @threshold = obj[:threshold]
-         nil
-       end
-     end
-   end
- end
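Because `Node` defines `marshal_dump` and `marshal_load`, trees built from these nodes serialize with Ruby's `Marshal`, which is what the regressor's own dump/load hooks above rely on when they include `tree: @tree`. A small round-trip sketch against the 0.7.3 code shown above (the attribute values here are arbitrary, for illustration only):

```ruby
require 'svmkit' # assumes svmkit 0.7.3 is installed

# Build a tiny two-node tree by hand and round-trip it through Marshal,
# which invokes the marshal_dump / marshal_load hooks defined above.
leaf = SVMKit::Tree::Node.new(depth: 1, leaf: true, leaf_id: 0)
root = SVMKit::Tree::Node.new(leaf: false, feature_id: 2, threshold: 0.5, left: leaf)

copy = Marshal.load(Marshal.dump(root))
puts copy.threshold    # => 0.5
puts copy.left.leaf_id # => 0
```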
data/lib/svmkit/utils.rb
@@ -1,22 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   # @!visibility private
-   module Utils
-     module_function
-
-     # @!visibility private
-     def choice_ids(size, probs, rng = nil)
-       rng ||= Random.new
-       Array.new(size) do
-         target = rng.rand
-         chosen = 0
-         probs.each_with_index do |p, idx|
-           break (chosen = idx) if target <= p
-           target -= p
-         end
-         chosen
-       end
-     end
-   end
- end
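`choice_ids` draws `size` indices with probability proportional to `probs`: for each draw it subtracts entries from a uniform random number until the remainder falls at or below the current entry, which is weighted sampling provided the probabilities sum to roughly one. A hedged usage sketch against the 0.7.3 code shown above (the weights are made up):

```ruby
require 'svmkit' # assumes svmkit 0.7.3, where SVMKit::Utils still exists

# Sample 10,000 indices from the weights below; index 2 should be drawn
# roughly 60% of the time because its weight is 0.6.
probs = [0.1, 0.3, 0.6]
ids = SVMKit::Utils.choice_ids(10_000, probs, Random.new(1))
p ids.tally.sort.to_h # => approximately {0=>1000, 1=>3000, 2=>6000}
```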
data/lib/svmkit/validation.rb
@@ -1,79 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   # @!visibility private
-   module Validation
-     module_function
-
-     # @!visibility private
-     def check_sample_array(x)
-       raise TypeError, 'Expect class of sample matrix to be Numo::DFloat' unless x.is_a?(Numo::DFloat)
-       raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
-       nil
-     end
-
-     # @!visibility private
-     def check_label_array(y)
-       raise TypeError, 'Expect class of label vector to be Numo::Int32' unless y.is_a?(Numo::Int32)
-       raise ArgumentError, 'Expect label vector to be 1-D array' unless y.shape.size == 1
-       nil
-     end
-
-     # @!visibility private
-     def check_tvalue_array(y)
-       raise TypeError, 'Expect class of target value vector to be Numo::DFloat' unless y.is_a?(Numo::DFloat)
-       nil
-     end
-
-     # @!visibility private
-     def check_sample_label_size(x, y)
-       raise ArgumentError, 'Expect to have the same number of samples for sample matrix and label vector' unless x.shape[0] == y.shape[0]
-       nil
-     end
-
-     # @!visibility private
-     def check_sample_tvalue_size(x, y)
-       raise ArgumentError, 'Expect to have the same number of samples for sample matrix and target value vector' unless x.shape[0] == y.shape[0]
-       nil
-     end
-
-     # @!visibility private
-     def check_params_type(type, params = {})
-       params.each { |k, v| raise TypeError, "Expect class of #{k} to be #{type}" unless v.is_a?(type) }
-       nil
-     end
-
-     # @!visibility private
-     def check_params_type_or_nil(type, params = {})
-       params.each { |k, v| raise TypeError, "Expect class of #{k} to be #{type} or nil" unless v.is_a?(type) || v.is_a?(NilClass) }
-       nil
-     end
-
-     # @!visibility private
-     def check_params_float(params = {})
-       check_params_type(Float, params)
-     end
-
-     # @!visibility private
-     def check_params_integer(params = {})
-       check_params_type(Integer, params)
-     end
-
-     # @!visibility private
-     def check_params_string(params = {})
-       check_params_type(String, params)
-     end
-
-     # @!visibility private
-     def check_params_boolean(params = {})
-       params.each { |k, v| raise TypeError, "Expect class of #{k} to be Boolean" unless v.is_a?(FalseClass) || v.is_a?(TrueClass) }
-       nil
-     end
-
-     # @!visibility private
-     def check_params_positive(params = {})
-       params.reject { |_, v| v.nil? }.each { |k, v| raise ArgumentError, "Expect #{k} to be positive value" if v < 0 }
-       nil
-     end
-   end
- end
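These are the helpers the estimators above mix in (`include Validation`) to type-check constructor arguments and input arrays before fitting. A short sketch of how they behave, again against the 0.7.3 code shown above:

```ruby
require 'svmkit' # assumes svmkit 0.7.3
include SVMKit::Validation

check_params_integer(min_samples_leaf: 1)               # passes: value is an Integer
check_params_positive(max_depth: 3, max_features: nil)  # passes: nil entries are skipped

begin
  check_params_positive(min_samples_leaf: -1)
rescue ArgumentError => e
  puts e.message # => Expect min_samples_leaf to be positive value
end
```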
data/lib/svmkit/values.rb
@@ -1,13 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   # @!visibility private
-   module Values
-     module_function
-
-     # @!visibility private
-     def int_max
-       @int_max ||= 2**([42].pack('i').size * 16 - 2) - 1
-     end
-   end
- end
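`int_max` derives a platform-dependent maximum integer from the byte size of a C `int`: `[42].pack('i').size` is 4 on common platforms, so the memoized value is `2**62 - 1`, matching the old Fixnum maximum on 64-bit MRI. A quick check (output assumes a typical 64-bit build):

```ruby
require 'svmkit' # assumes svmkit 0.7.3

p [42].pack('i').size                 # => 4 (bytes in a C int on common platforms)
p SVMKit::Values.int_max              # => 4611686018427387903
p SVMKit::Values.int_max == 2**62 - 1 # => true
```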
data/lib/svmkit/version.rb
@@ -1,7 +0,0 @@
- # frozen_string_literal: true
-
- # SVMKit is a machine learning library in Ruby.
- module SVMKit
-   # @!visibility private
-   VERSION = '0.7.3'.freeze
- end