RubyGems - rumale - Versions diffs - 0.8.0 - Mend

rumale 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

checksums.yaml +7 -0
data/.coveralls.yml +1 -0
data/.gitignore +20 -0
data/.rspec +3 -0
data/.rubocop.yml +47 -0
data/.rubocop_todo.yml +58 -0
data/.travis.yml +13 -0
data/CHANGELOG.md +2 -0
data/CODE_OF_CONDUCT.md +74 -0
data/Gemfile +4 -0
data/LICENSE.txt +23 -0
data/README.md +175 -0
data/Rakefile +6 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/lib/rumale.rb +70 -0
data/lib/rumale/base/base_estimator.rb +13 -0
data/lib/rumale/base/classifier.rb +36 -0
data/lib/rumale/base/cluster_analyzer.rb +31 -0
data/lib/rumale/base/evaluator.rb +17 -0
data/lib/rumale/base/regressor.rb +36 -0
data/lib/rumale/base/splitter.rb +21 -0
data/lib/rumale/base/transformer.rb +22 -0
data/lib/rumale/clustering/dbscan.rb +125 -0
data/lib/rumale/clustering/k_means.rb +138 -0
data/lib/rumale/dataset.rb +110 -0
data/lib/rumale/decomposition/nmf.rb +141 -0
data/lib/rumale/decomposition/pca.rb +148 -0
data/lib/rumale/ensemble/ada_boost_classifier.rb +196 -0
data/lib/rumale/ensemble/ada_boost_regressor.rb +178 -0
data/lib/rumale/ensemble/random_forest_classifier.rb +180 -0
data/lib/rumale/ensemble/random_forest_regressor.rb +141 -0
data/lib/rumale/evaluation_measure/accuracy.rb +29 -0
data/lib/rumale/evaluation_measure/f_score.rb +50 -0
data/lib/rumale/evaluation_measure/log_loss.rb +45 -0
data/lib/rumale/evaluation_measure/mean_absolute_error.rb +29 -0
data/lib/rumale/evaluation_measure/mean_squared_error.rb +29 -0
data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +62 -0
data/lib/rumale/evaluation_measure/precision.rb +50 -0
data/lib/rumale/evaluation_measure/precision_recall.rb +91 -0
data/lib/rumale/evaluation_measure/purity.rb +40 -0
data/lib/rumale/evaluation_measure/r2_score.rb +43 -0
data/lib/rumale/evaluation_measure/recall.rb +50 -0
data/lib/rumale/kernel_approximation/rbf.rb +121 -0
data/lib/rumale/kernel_machine/kernel_svc.rb +193 -0
data/lib/rumale/linear_model/base_linear_model.rb +89 -0
data/lib/rumale/linear_model/lasso.rb +136 -0
data/lib/rumale/linear_model/linear_regression.rb +110 -0
data/lib/rumale/linear_model/logistic_regression.rb +159 -0
data/lib/rumale/linear_model/ridge.rb +110 -0
data/lib/rumale/linear_model/svc.rb +183 -0
data/lib/rumale/linear_model/svr.rb +122 -0
data/lib/rumale/model_selection/cross_validation.rb +123 -0
data/lib/rumale/model_selection/grid_search_cv.rb +247 -0
data/lib/rumale/model_selection/k_fold.rb +76 -0
data/lib/rumale/model_selection/stratified_k_fold.rb +94 -0
data/lib/rumale/multiclass/one_vs_rest_classifier.rb +100 -0
data/lib/rumale/naive_bayes/naive_bayes.rb +315 -0
data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +111 -0
data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +93 -0
data/lib/rumale/optimizer/nadam.rb +90 -0
data/lib/rumale/optimizer/rmsprop.rb +69 -0
data/lib/rumale/optimizer/sgd.rb +65 -0
data/lib/rumale/optimizer/yellow_fin.rb +144 -0
data/lib/rumale/pairwise_metric.rb +91 -0
data/lib/rumale/pipeline/pipeline.rb +197 -0
data/lib/rumale/polynomial_model/base_factorization_machine.rb +99 -0
data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +197 -0
data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +131 -0
data/lib/rumale/preprocessing/l2_normalizer.rb +62 -0
data/lib/rumale/preprocessing/label_encoder.rb +94 -0
data/lib/rumale/preprocessing/min_max_scaler.rb +92 -0
data/lib/rumale/preprocessing/one_hot_encoder.rb +98 -0
data/lib/rumale/preprocessing/standard_scaler.rb +86 -0
data/lib/rumale/probabilistic_output.rb +112 -0
data/lib/rumale/tree/base_decision_tree.rb +153 -0
data/lib/rumale/tree/decision_tree_classifier.rb +163 -0
data/lib/rumale/tree/decision_tree_regressor.rb +135 -0
data/lib/rumale/tree/node.rb +70 -0
data/lib/rumale/utils.rb +37 -0
data/lib/rumale/validation.rb +79 -0
data/lib/rumale/values.rb +13 -0
data/lib/rumale/version.rb +6 -0
data/rumale.gemspec +41 -0
metadata +204 -0

data/lib/rumale/evaluation_measure/accuracy.rb ADDED

@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+require 'rumale/base/evaluator'
+module Rumale
+  # This module consists of the classes for model evaluation.
+  module EvaluationMeasure
+    # Accuracy is a class that calculates the accuracy of classifier from the predicted labels.
+    #
+    # @example
+    #   evaluator = Rumale::EvaluationMeasure::Accuracy.new
+    #   puts evaluator.score(ground_truth, predicted)
+    class Accuracy
+      include Base::Evaluator
+      # Calculate mean accuracy.
+      #
+      # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
+      # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted labels.
+      # @return [Float] Mean accuracy
+      def score(y_true, y_pred)
+        check_label_array(y_true)
+        check_label_array(y_pred)
+        (y_true.to_a.map.with_index { |label, n| label == y_pred[n] ? 1 : 0 }).inject(:+) / y_true.size.to_f
+      end
+    end
+  end
+end

data/lib/rumale/evaluation_measure/f_score.rb ADDED

@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+require 'rumale/base/evaluator'
+require 'rumale/evaluation_measure/precision_recall'
+module Rumale
+  # This module consists of the classes for model evaluation.
+  module EvaluationMeasure
+    # FScore is a class that calculates the F1-score of the predicted labels.
+    #
+    # @example
+    #   evaluator = Rumale::EvaluationMeasure::FScore.new
+    #   puts evaluator.score(ground_truth, predicted)
+    class FScore
+      include Base::Evaluator
+      include EvaluationMeasure::PrecisionRecall
+      # Return the average type for calculation of F1-score.
+      # @return [String] ('binary', 'micro', 'macro')
+      attr_reader :average
+      # Create a new evaluation measure calculater for F1-score.
+      #
+      # @param average [String] The average type ('binary', 'micro', 'macro')
+      def initialize(average: 'binary')
+        check_params_string(average: average)
+        @average = average
+      end
+      # Calculate average F1-score
+      #
+      # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
+      # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted labels.
+      # @return [Float] Average F1-score
+      def score(y_true, y_pred)
+        check_label_array(y_true)
+        check_label_array(y_pred)
+        case @average
+        when 'binary'
+          f_score_each_class(y_true, y_pred).last
+        when 'micro'
+          micro_average_f_score(y_true, y_pred)
+        when 'macro'
+          macro_average_f_score(y_true, y_pred)
+        end
+      end
+    end
+  end
+end

data/lib/rumale/evaluation_measure/log_loss.rb ADDED

@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+require 'rumale/base/evaluator'
+require 'rumale/preprocessing/one_hot_encoder'
+module Rumale
+  module EvaluationMeasure
+    # LogLoss is a class that calculates the logarithmic loss of predicted class probability.
+    #
+    # @example
+    #   evaluator = Rumale::EvaluationMeasure::LogLoss.new
+    #   puts evaluator.score(ground_truth, predicted)
+    class LogLoss
+      include Base::Evaluator
+      # Calculate mean logarithmic loss.
+      # If both y_true and y_pred are array (both shapes are [n_samples]), this method calculates
+      # mean logarithmic loss for binary classification.
+      #
+      # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
+      # @param y_pred [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted class probability.
+      # @param eps [Float] A small value close to zero to avoid outputting infinity in logarithmic calcuation.
+      # @return [Float] mean logarithmic loss
+      def score(y_true, y_pred, eps = 1e-15)
+        check_params_type(Numo::Int32, y_true: y_true)
+        check_params_type(Numo::DFloat, y_pred: y_pred)
+        n_samples, n_classes = y_pred.shape
+        clipped_p = y_pred.clip(eps, 1 - eps)
+        log_loss = if n_classes.nil?
+                     negative_label = y_true.to_a.uniq.min
+                     bin_y_true = Numo::DFloat.cast(y_true.ne(negative_label))
+                     -(bin_y_true * Numo::NMath.log(clipped_p) + (1 - bin_y_true) * Numo::NMath.log(1 - clipped_p))
+                   else
+                     encoder = Rumale::Preprocessing::OneHotEncoder.new
+                     encoded_y_true = encoder.fit_transform(y_true)
+                     clipped_p /= clipped_p.sum(1).expand_dims(1)
+                     -(encoded_y_true * Numo::NMath.log(clipped_p)).sum(1)
+                   end
+        log_loss.sum / n_samples
+      end
+    end
+  end
+end

data/lib/rumale/evaluation_measure/mean_absolute_error.rb ADDED

@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+require 'rumale/base/evaluator'
+module Rumale
+  module EvaluationMeasure
+    # MeanAbsoluteError is a class that calculates the mean absolute error.
+    #
+    # @example
+    #   evaluator = Rumale::EvaluationMeasure::MeanAbsoluteError.new
+    #   puts evaluator.score(ground_truth, predicted)
+    class MeanAbsoluteError
+      include Base::Evaluator
+      # Calculate mean absolute error.
+      #
+      # @param y_true [Numo::DFloat] (shape: [n_samples, n_outputs]) Ground truth target values.
+      # @param y_pred [Numo::DFloat] (shape: [n_samples, n_outputs]) Estimated target values.
+      # @raise [ArgumentError] If the shapes of y_true and y_pred differ.
+      # @return [Float] Mean absolute error
+      def score(y_true, y_pred)
+        check_tvalue_array(y_true)
+        check_tvalue_array(y_pred)
+        if y_true.shape != y_pred.shape
+          raise ArgumentError, 'Expect to have the same size both y_true and y_pred.'
+        end
+        residuals = y_true - y_pred
+        residuals.abs.mean
+      end
+    end
+  end
+end

data/lib/rumale/evaluation_measure/mean_squared_error.rb ADDED

@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+require 'rumale/base/evaluator'
+module Rumale
+  module EvaluationMeasure
+    # MeanSquaredError is a class that calculates the mean squared error.
+    #
+    # @example
+    #   evaluator = Rumale::EvaluationMeasure::MeanSquaredError.new
+    #   puts evaluator.score(ground_truth, predicted)
+    class MeanSquaredError
+      include Base::Evaluator
+      # Calculate mean squared error.
+      #
+      # @param y_true [Numo::DFloat] (shape: [n_samples, n_outputs]) Ground truth target values.
+      # @param y_pred [Numo::DFloat] (shape: [n_samples, n_outputs]) Estimated target values.
+      # @raise [ArgumentError] If the shapes of y_true and y_pred differ.
+      # @return [Float] Mean squared error
+      def score(y_true, y_pred)
+        check_tvalue_array(y_true)
+        check_tvalue_array(y_pred)
+        if y_true.shape != y_pred.shape
+          raise ArgumentError, 'Expect to have the same size both y_true and y_pred.'
+        end
+        residuals = y_true - y_pred
+        (residuals**2).mean
+      end
+    end
+  end
+end

data/lib/rumale/evaluation_measure/normalized_mutual_information.rb ADDED

@@ -0,0 +1,62 @@
+# frozen_string_literal: true
+require 'rumale/base/evaluator'
+module Rumale
+  module EvaluationMeasure
+    # NormalizedMutualInformation is a class that calculates the normalized mutual information of cluatering results.
+    #
+    # @example
+    #   evaluator = Rumale::EvaluationMeasure::NormalizedMutualInformation.new
+    #   puts evaluator.score(ground_truth, predicted)
+    #
+    # *Reference*
+    # - C D. Manning, P. Raghavan, and H. Schutze, "Introduction to Information Retrieval," Cambridge University Press., 2008.
+    # - N X. Vinh, J. Epps, and J. Bailey, "Information Theoretic Measures for Clusterings Comparison: Variants, Properties, Normalization and Correction for Chance," J. Machine Learning Research, vol. 11, pp. 2837--1854, 2010.
+    class NormalizedMutualInformation
+      include Base::Evaluator
+      # Calculate noramlzied mutual information
+      #
+      # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
+      # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
+      # @return [Float] Normalized mutual information
+      def score(y_true, y_pred)
+        check_label_array(y_true)
+        check_label_array(y_pred)
+        # initiazlie some variables.
+        mutual_information = 0.0
+        n_samples = y_pred.size
+        class_ids = y_true.to_a.uniq
+        cluster_ids = y_pred.to_a.uniq
+        # calculate entropy.
+        class_entropy = -1.0 * class_ids.map do |k|
+          ratio = y_true.eq(k).count.fdiv(n_samples)
+          ratio * Math.log(ratio)
+        end.reduce(:+)
+        return 0.0 if class_entropy.zero?
+        cluster_entropy = -1.0 * cluster_ids.map do |k|
+          ratio = y_pred.eq(k).count.fdiv(n_samples)
+          ratio * Math.log(ratio)
+        end.reduce(:+)
+        return 0.0 if cluster_entropy.zero?
+        # calculate mutual information.
+        cluster_ids.map do |k|
+          pr_sample_ids = y_pred.eq(k).where.to_a
+          n_pr_samples = pr_sample_ids.size
+          class_ids.map do |j|
+            tr_sample_ids = y_true.eq(j).where.to_a
+            n_tr_samples = tr_sample_ids.size
+            n_intr_samples = (pr_sample_ids & tr_sample_ids).size
+            if n_intr_samples.positive?
+              mutual_information +=
+                n_intr_samples.fdiv(n_samples) * Math.log((n_samples * n_intr_samples).fdiv(n_pr_samples * n_tr_samples))
+            end
+          end
+        end
+        # return normalized mutual information.
+        mutual_information / Math.sqrt(class_entropy * cluster_entropy)
+      end
+    end
+  end
+end

data/lib/rumale/evaluation_measure/precision.rb ADDED

@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+require 'rumale/base/evaluator'
+require 'rumale/evaluation_measure/precision_recall'
+module Rumale
+  # This module consists of the classes for model evaluation.
+  module EvaluationMeasure
+    # Precision is a class that calculates the preicision of the predicted labels.
+    #
+    # @example
+    #   evaluator = Rumale::EvaluationMeasure::Precision.new
+    #   puts evaluator.score(ground_truth, predicted)
+    class Precision
+      include Base::Evaluator
+      include EvaluationMeasure::PrecisionRecall
+      # Return the average type for calculation of precision.
+      # @return [String] ('binary', 'micro', 'macro')
+      attr_reader :average
+      # Create a new evaluation measure calculater for precision score.
+      #
+      # @param average [String] The average type ('binary', 'micro', 'macro')
+      def initialize(average: 'binary')
+        check_params_string(average: average)
+        @average = average
+      end
+      # Calculate average precision.
+      #
+      # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
+      # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted labels.
+      # @return [Float] Average precision
+      def score(y_true, y_pred)
+        check_label_array(y_true)
+        check_label_array(y_pred)
+        case @average
+        when 'binary'
+          precision_each_class(y_true, y_pred).last
+        when 'micro'
+          micro_average_precision(y_true, y_pred)
+        when 'macro'
+          macro_average_precision(y_true, y_pred)
+        end
+      end
+    end
+  end
+end

data/lib/rumale/evaluation_measure/precision_recall.rb ADDED

@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+require 'rumale/base/evaluator'
+module Rumale
+  # This module consists of the classes for model evaluation.
+  module EvaluationMeasure
+    # @!visibility private
+    module PrecisionRecall
+      module_function
+      # @!visibility private
+      def precision_each_class(y_true, y_pred)
+        y_true.sort.to_a.uniq.map do |label|
+          target_positions = y_pred.eq(label)
+          next 0.0 if y_pred[target_positions].empty?
+          n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
+          n_false_positives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
+          n_true_positives / (n_true_positives + n_false_positives)
+        end
+      end
+      # @!visibility private
+      def recall_each_class(y_true, y_pred)
+        y_true.sort.to_a.uniq.map do |label|
+          target_positions = y_true.eq(label)
+          next 0.0 if y_pred[target_positions].empty?
+          n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
+          n_false_negatives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
+          n_true_positives / (n_true_positives + n_false_negatives)
+        end
+      end
+      # @!visibility private
+      def f_score_each_class(y_true, y_pred)
+        precision_each_class(y_true, y_pred).zip(recall_each_class(y_true, y_pred)).map do |p, r|
+          next 0.0 if p.zero? && r.zero?
+          (2.0 * p * r) / (p + r)
+        end
+      end
+      # @!visibility private
+      def micro_average_precision(y_true, y_pred)
+        evaluated_values = y_true.sort.to_a.uniq.map do |label|
+          target_positions = y_pred.eq(label)
+          next [0.0, 0.0] if y_pred[target_positions].empty?
+          n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
+          n_false_positives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
+          [n_true_positives, n_true_positives + n_false_positives]
+        end
+        res = evaluated_values.transpose.map { |v| v.inject(:+) }
+        res.first / res.last
+      end
+      # @!visibility private
+      def micro_average_recall(y_true, y_pred)
+        evaluated_values = y_true.sort.to_a.uniq.map do |label|
+          target_positions = y_true.eq(label)
+          next 0.0 if y_pred[target_positions].empty?
+          n_true_positives = Numo::Int32.cast(y_true[target_positions].eq(y_pred[target_positions])).sum.to_f
+          n_false_negatives = Numo::Int32.cast(y_true[target_positions].ne(y_pred[target_positions])).sum.to_f
+          [n_true_positives, n_true_positives + n_false_negatives]
+        end
+        res = evaluated_values.transpose.map { |v| v.inject(:+) }
+        res.first / res.last
+      end
+      # @!visibility private
+      def micro_average_f_score(y_true, y_pred)
+        p = micro_average_precision(y_true, y_pred)
+        r = micro_average_recall(y_true, y_pred)
+        (2.0 * p * r) / (p + r)
+      end
+      # @!visibility private
+      def macro_average_precision(y_true, y_pred)
+        precision_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
+      end
+      # @!visibility private
+      def macro_average_recall(y_true, y_pred)
+        recall_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
+      end
+      # @!visibility private
+      def macro_average_f_score(y_true, y_pred)
+        f_score_each_class(y_true, y_pred).inject(:+) / y_true.to_a.uniq.size
+      end
+    end
+  end
+end

data/lib/rumale/evaluation_measure/purity.rb ADDED

@@ -0,0 +1,40 @@
+# frozen_string_literal: true
+require 'rumale/base/evaluator'
+module Rumale
+  module EvaluationMeasure
+    # Purity is a class that calculates the purity of cluatering results.
+    #
+    # @example
+    #   evaluator = Rumale::EvaluationMeasure::Purity.new
+    #   puts evaluator.score(ground_truth, predicted)
+    #
+    # *Reference*
+    # - C D. Manning, P. Raghavan, and H. Schutze, "Introduction to Information Retrieval," Cambridge University Press., 2008.
+    class Purity
+      include Base::Evaluator
+      # Calculate purity
+      #
+      # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
+      # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
+      # @return [Float] Purity
+      def score(y_true, y_pred)
+        check_label_array(y_true)
+        check_label_array(y_pred)
+        # initiazlie some variables.
+        purity = 0
+        n_samples = y_pred.size
+        class_ids = y_true.to_a.uniq
+        cluster_ids = y_pred.to_a.uniq
+        # calculate purity.
+        cluster_ids.each do |k|
+          pr_sample_ids = y_pred.eq(k).where.to_a
+          purity += class_ids.map { |j| (pr_sample_ids & y_true.eq(j).where.to_a).size }.max
+        end
+        purity.fdiv(n_samples)
+      end
+    end
+  end
+end

data/lib/rumale/evaluation_measure/r2_score.rb ADDED

@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+require 'rumale/base/evaluator'
+require 'rumale/evaluation_measure/precision_recall'
+module Rumale
+  module EvaluationMeasure
+    # R2Score is a class that calculates the coefficient of determination for the predicted values.
+    #
+    # @example
+    #   evaluator = Rumale::EvaluationMeasure::R2Score.new
+    #   puts evaluator.score(ground_truth, predicted)
+    class R2Score
+      include Base::Evaluator
+      # Create a new evaluation measure calculater for coefficient of determination.
+      def initialize; end
+      # Calculate the coefficient of determination.
+      #
+      # @param y_true [Numo::DFloat] (shape: [n_samples, n_outputs]) Ground truth target values.
+      # @param y_pred [Numo::DFloat] (shape: [n_samples, n_outputs]) Estimated taget values.
+      # @return [Float] Coefficient of determination
+      def score(y_true, y_pred)
+        check_tvalue_array(y_true)
+        check_tvalue_array(y_pred)
+        raise ArgumentError, 'Expect to have the same size both y_true and y_pred.' unless y_true.shape == y_pred.shape
+        n_samples, n_outputs = y_true.shape
+        numerator = ((y_true - y_pred)**2).sum(0)
+        yt_mean = y_true.sum(0) / n_samples
+        denominator = ((y_true - yt_mean)**2).sum(0)
+        if n_outputs.nil?
+          denominator.zero? ? 0.0 : 1.0 - numerator / denominator
+        else
+          scores = 1 - numerator / denominator
+          scores[denominator.eq(0)] = 0.0
+          scores.sum / scores.size
+        end
+      end
+    end
+  end
+end