RubyGems - svmkit - Versions diffs - 0.4.1 → 0.5.0 - Mend

svmkit 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

checksums.yaml +4 -4
data/.gitignore +4 -0
data/.rubocop_todo.yml +15 -16
data/Gemfile +1 -1
data/HISTORY.md +5 -1
data/README.md +2 -1
data/Rakefile +3 -3
data/lib/svmkit.rb +4 -0
data/lib/svmkit/base/cluster_analyzer.rb +29 -0
data/lib/svmkit/clustering/k_means.rb +138 -0
data/lib/svmkit/evaluation_measure/normalized_mutual_information.rb +63 -0
data/lib/svmkit/evaluation_measure/purity.rb +41 -0
data/lib/svmkit/version.rb +1 -1
data/svmkit.gemspec +2 -2
metadata +8 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: af30c20b06fec51d531364ad9ca1414ce2fe36cdbe61fd8a1a7128c793d67304
-  data.tar.gz: ba87c535aa723ec17334fd6819577dcb51d2d11ccef6adb967f73de1702522f5
+  metadata.gz: 3be3dae5adddfa8bf3655f983082f64601056ce2097671f97873f36f062eea15
+  data.tar.gz: 44bb40d0ec91975d6e4948567f95103434f5792fb4a2be2b87b18079b0b7bb00
 SHA512:
-  metadata.gz: b32efe1dcd924c3e31ad0dc26dfbdcc86b0154b8b8591e58db5364103526b7dc828c46462b5f2dfe81c7c8ee23836ae8d4b81061cdf1ceb4f023c48cc78dd110
-  data.tar.gz: 6f38f301d23b3abc1037e1b0fe620e687da1fe44216a49707b2192d30fd8f2a7cb7690d6365580dda470e6852200db20b540c35947e3b1c54d8f8b5b599b2dc0
+  metadata.gz: a009b9403935760033ea14c2e7a3027953d28f38f27c3952f49ed69c035eea94ab7305dce4c4a9b3e688f9894eeb3f8511863c1f71640735d16f73e3a1afafe6
+  data.tar.gz: ad9e8198c88047aad39e4caf95872c1616d1cdb94272f8044af621f7eb4990378693a7e0f3073ed0a7dbad3e2e22d7d46055fdb2795c3287ef23fa7efc7ea9d1

data/.gitignore CHANGED Viewed

@@ -14,3 +14,7 @@
 *.swp
 .DS_Store
 .ruby-version
+/spec/dump_dbl.t
+/spec/dump_int.t
+/spec/dump_mult_dbl.t
+/spec/dump_zb.t

data/.rubocop_todo.yml CHANGED Viewed

@@ -1,19 +1,18 @@
 # This configuration was generated by
 # `rubocop --auto-gen-config`
-# on 2018-04-14 20:44:19 +0900 using RuboCop version 0.54.0.
+# on 2018-06-10 12:21:53 +0900 using RuboCop version 0.57.1.
 # The point is for the user to remove these configuration records
 # one by one as the offenses are removed from the code base.
 # Note that changes in the inspected code, or installation of new
 # versions of RuboCop, may require this file to be generated again.
-# Offense count: 1
-# Configuration parameters: Include.
-# Include: **/*.gemspec
-Gemspec/RequiredRubyVersion:
+# Offense count: 2
+# Cop supports --auto-correct.
+Layout/ClosingHeredocIndentation:
   Exclude:
     - 'svmkit.gemspec'
-# Offense count: 3
+# Offense count: 2
 # Cop supports --auto-correct.
 # Configuration parameters: EnforcedStyle.
 # SupportedStyles: auto_detection, squiggly, active_support, powerpack, unindent
@@ -21,24 +20,24 @@ Layout/IndentHeredoc:
   Exclude:
     - 'svmkit.gemspec'
+# Offense count: 1
+# Cop supports --auto-correct.
+Layout/LeadingBlankLines:
+  Exclude:
+    - 'svmkit.gemspec'
 # Offense count: 1
 # Configuration parameters: CountComments, ExcludedMethods.
 Metrics/BlockLength:
-  Max: 30
+  Max: 29
-# Offense count: 1
+# Offense count: 3
 Metrics/CyclomaticComplexity:
   Max: 12
-# Offense count: 1
+# Offense count: 3
 Metrics/PerceivedComplexity:
-  Max: 12
-# Offense count: 1
-# Cop supports --auto-correct.
-Style/Encoding:
-  Exclude:
-    - 'svmkit.gemspec'
+  Max: 13
 # Offense count: 1
 # Cop supports --auto-correct.

data/Gemfile CHANGED Viewed

@@ -1,4 +1,4 @@
-source "https://rubygems.org"
+source 'https://rubygems.org'
 git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }

data/HISTORY.md CHANGED Viewed

@@ -1,3 +1,8 @@
+# 0.5.0
+- Add class for K-Means clustering.
+- Add class for evaluating purity.
+- Add class for evaluating normalized mutual information.
 # 0.4.1
 - Add class for linear regressor.
 - Add class for SGD optimizer.
@@ -26,7 +31,6 @@ SVMKit plans to add other optimizer algorithms sequentially, so that users can s
   - Remove learning_rate, decay, and momentum parameters on Ridge, Lasso, and FactorizationMachineRegressor.
   - Remove normalize parameter on SVC, SVR, and LogisticRegression.
 # 0.3.3
 - Add class for Ridge regressor.
 - Add class for Lasso regressor.

data/README.md CHANGED Viewed

@@ -9,7 +9,8 @@ SVMKit is a machine learninig library in Ruby.
 SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 SVMKit currently supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor classifier, and cross-validation.
+Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor classifier,
+K-Means and cross-validation.
 ## Installation

data/Rakefile CHANGED Viewed

@@ -1,6 +1,6 @@
-require "bundler/gem_tasks"
-require "rspec/core/rake_task"
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new(:spec)
-task :default => :spec
+task default: :spec

data/lib/svmkit.rb CHANGED Viewed

@@ -10,6 +10,7 @@ require 'svmkit/probabilistic_output'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/classifier'
 require 'svmkit/base/regressor'
+require 'svmkit/base/cluster_analyzer'
 require 'svmkit/base/transformer'
 require 'svmkit/base/splitter'
 require 'svmkit/base/evaluator'
@@ -36,6 +37,7 @@ require 'svmkit/tree/decision_tree_classifier'
 require 'svmkit/tree/decision_tree_regressor'
 require 'svmkit/ensemble/random_forest_classifier'
 require 'svmkit/ensemble/random_forest_regressor'
+require 'svmkit/clustering/k_means'
 require 'svmkit/preprocessing/l2_normalizer'
 require 'svmkit/preprocessing/min_max_scaler'
 require 'svmkit/preprocessing/standard_scaler'
@@ -52,3 +54,5 @@ require 'svmkit/evaluation_measure/log_loss'
 require 'svmkit/evaluation_measure/r2_score'
 require 'svmkit/evaluation_measure/mean_squared_error'
 require 'svmkit/evaluation_measure/mean_absolute_error'
+require 'svmkit/evaluation_measure/purity'
+require 'svmkit/evaluation_measure/normalized_mutual_information'

data/lib/svmkit/base/cluster_analyzer.rb ADDED Viewed

@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+require 'svmkit/validation'
+require 'svmkit/evaluation_measure/purity'
+module SVMKit
+  module Base
+    # Module for all clustering algorithms in SVMKit.
+    module ClusterAnalyzer
+      # An abstract method for analyzing clusters and predicting cluster indices.
+      def fit_predict
+        raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
+      end
+      # Calculate purity of clustering result.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
+      # @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
+      # @return [Float] Purity
+      def score(x, y)
+        SVMKit::Validation.check_sample_array(x)
+        SVMKit::Validation.check_label_array(y)
+        SVMKit::Validation.check_sample_label_size(x, y)
+        evaluator = SVMKit::EvaluationMeasure::Purity.new
+        evaluator.score(y, fit_predict(x))
+      end
+    end
+  end
+end

data/lib/svmkit/clustering/k_means.rb ADDED Viewed

@@ -0,0 +1,138 @@
+# frozen_string_literal: true
+require 'svmkit/validation'
+require 'svmkit/base/base_estimator'
+require 'svmkit/base/cluster_analyzer'
+require 'svmkit/pairwise_metric'
+module SVMKit
+  # This module consists of classes that implement cluster analysis methods.
+  module Clustering
+    # KMeans is a class that implements K-Means cluster analysis.
+    #
+    # @example
+    #   analyzer = SVMKit::Clustering::KMeans.new(n_clusters: 10, max_iter: 50)
+    #   cluster_ids = analyzer.fit_predict(samples)
+    #
+    # *Reference*
+    # - D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
+    class KMeans
+      include Base::BaseEstimator
+      include Base::ClusterAnalyzer
+      include Validation
+      # Return the centroids.
+      # @return [Numo::DFloat] (shape: [n_clusters, n_features])
+      attr_reader :cluster_centers
+      # Return the random generator.
+      # @return [Random]
+      attr_reader :rng
+      # Create a new cluster analyzer with K-Means method.
+      #
+      # @param n_clusters [Integer] The number of clusters.
+      # @param init [String] The initialization method for centroids ('random' or 'k-means++').
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param tol [Float] The tolerance of termination criterion.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil)
+        check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
+        check_params_string(init: init)
+        check_params_type_or_nil(Integer, random_seed: random_seed)
+        check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
+        @params = {}
+        @params[:n_clusters] = n_clusters
+        @params[:init] = init == 'random' ? 'random' : 'k-means++'
+        @params[:max_iter] = max_iter
+        @params[:tol] = tol
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @cluster_centers = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+      # Analyze clusters with the given training data.
+      #
+      # @overload fit(x) -> KMeans
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      # @return [KMeans] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        check_sample_array(x)
+        init_cluster_centers(x)
+        @params[:max_iter].times do |_t|
+          cluster_ids = assign_cluster(x)
+          old_centers = @cluster_centers.dup
+          @params[:n_clusters].times do |n|
+            assigned_bits = cluster_ids.eq(n)
+            @cluster_centers[n, true] = x[assigned_bits.where, true].mean(axis: 0) if assigned_bits.count > 0
+          end
+          error = Numo::NMath.sqrt(((old_centers - @cluster_centers)**2).sum(axis: 1)).mean
+          break if error <= @params[:tol]
+        end
+        self
+      end
+      # Predict cluster indices for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster index.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster index per sample.
+      def predict(x)
+        check_sample_array(x)
+        assign_cluster(x)
+      end
+      # Analyze clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster index per sample.
+      def fit_predict(x)
+        check_sample_array(x)
+        fit(x)
+        predict(x)
+      end
+      # Dump marshal data.
+      # @return [Hash] The marshal data.
+      def marshal_dump
+        { params: @params,
+          cluster_centers: @cluster_centers,
+          rng: @rng }
+      end
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @cluster_centers = obj[:cluster_centers]
+        @rng = obj[:rng]
+        nil
+      end
+      private
+      def assign_cluster(x)
+        distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers)
+        distance_matrix.min_index(axis: 1) - Numo::Int32[*0.step(distance_matrix.size - 1, @cluster_centers.shape[0])]
+      end
+      def init_cluster_centers(x)
+        # random initialize
+        n_samples = x.shape[0]
+        rand_id = [*0...n_samples].sample(@params[:n_clusters], random: @rng)
+        @cluster_centers = x[rand_id, true].dup
+        return unless @params[:init] == 'k-means++'
+        # k-means++ initialize
+        (1...@params[:n_clusters]).each do |n|
+          distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers[0...n, true])
+          min_distances = distance_matrix.flatten[distance_matrix.min_index(axis: 1)]
+          probs = min_distances**2 / (min_distances**2).sum
+          cum_probs = probs.cumsum
+          selected_id = cum_probs.gt(@rng.rand).where.to_a.first
+          @cluster_centers[n, true] = x[selected_id, true].dup
+        end
+      end
+    end
+  end
+end

data/lib/svmkit/evaluation_measure/normalized_mutual_information.rb ADDED Viewed

@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+require 'svmkit/validation'
+require 'svmkit/base/evaluator'
+module SVMKit
+  module EvaluationMeasure
+    # NormalizedMutualInformation is a class that calculates the normalized mutual information of clustering results.
+    #
+    # @example
+    #   evaluator = SVMKit::EvaluationMeasure::NormalizedMutualInformation.new
+    #   puts evaluator.score(ground_truth, predicted)
+    #
+    # *Reference*
+    # - C D. Manning, P. Raghavan, and H. Schutze, "Introduction to Information Retrieval," Cambridge University Press., 2008.
+    # - N X. Vinh, J. Epps, and J. Bailey, "Information Theoretic Measures for Clusterings Comparison: Variants, Properties, Normalization and Correction for Chance," J. Machine Learning Research, vol. 11, pp. 2837--2854, 2010.
+    class NormalizedMutualInformation
+      include Base::Evaluator
+      # Calculate normalized mutual information
+      #
+      # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
+      # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
+      # @return [Float] Normalized mutual information
+      def score(y_true, y_pred)
+        SVMKit::Validation.check_label_array(y_true)
+        SVMKit::Validation.check_label_array(y_pred)
+        # initialize some variables.
+        mutual_information = 0.0
+        n_samples = y_pred.size
+        class_ids = y_true.to_a.uniq
+        cluster_ids = y_pred.to_a.uniq
+        # calculate entropy.
+        class_entropy = -1.0 * class_ids.map do |k|
+          ratio = y_true.eq(k).count.fdiv(n_samples)
+          ratio * Math.log(ratio)
+        end.reduce(:+)
+        return 0.0 if class_entropy.zero?
+        cluster_entropy = -1.0 * cluster_ids.map do |k|
+          ratio = y_pred.eq(k).count.fdiv(n_samples)
+          ratio * Math.log(ratio)
+        end.reduce(:+)
+        return 0.0 if cluster_entropy.zero?
+        # calculate mutual information.
+        cluster_ids.map do |k|
+          pr_sample_ids = y_pred.eq(k).where.to_a
+          n_pr_samples = pr_sample_ids.size
+          class_ids.map do |j|
+            tr_sample_ids = y_true.eq(j).where.to_a
+            n_tr_samples = tr_sample_ids.size
+            n_intr_samples = (pr_sample_ids & tr_sample_ids).size
+            if n_intr_samples > 0
+              mutual_information +=
+                n_intr_samples.fdiv(n_samples) * Math.log((n_samples * n_intr_samples).fdiv(n_pr_samples * n_tr_samples))
+            end
+          end
+        end
+        # return normalized mutual information.
+        mutual_information / Math.sqrt(class_entropy * cluster_entropy)
+      end
+    end
+  end
+end

data/lib/svmkit/evaluation_measure/purity.rb ADDED Viewed

@@ -0,0 +1,41 @@
+# frozen_string_literal: true
+require 'svmkit/validation'
+require 'svmkit/base/evaluator'
+module SVMKit
+  module EvaluationMeasure
+    # Purity is a class that calculates the purity of clustering results.
+    #
+    # @example
+    #   evaluator = SVMKit::EvaluationMeasure::Purity.new
+    #   puts evaluator.score(ground_truth, predicted)
+    #
+    # *Reference*
+    # - C D. Manning, P. Raghavan, and H. Schutze, "Introduction to Information Retrieval," Cambridge University Press., 2008.
+    class Purity
+      include Base::Evaluator
+      # Calculate purity
+      #
+      # @param y_true [Numo::Int32] (shape: [n_samples]) Ground truth labels.
+      # @param y_pred [Numo::Int32] (shape: [n_samples]) Predicted cluster labels.
+      # @return [Float] Purity
+      def score(y_true, y_pred)
+        SVMKit::Validation.check_label_array(y_true)
+        SVMKit::Validation.check_label_array(y_pred)
+        # initialize some variables.
+        purity = 0
+        n_samples = y_pred.size
+        class_ids = y_true.to_a.uniq
+        cluster_ids = y_pred.to_a.uniq
+        # calculate purity.
+        cluster_ids.each do |k|
+          pr_sample_ids = y_pred.eq(k).where.to_a
+          purity += class_ids.map { |j| (pr_sample_ids & y_true.eq(j).where.to_a).size }.max
+        end
+        purity.fdiv(n_samples)
+      end
+    end
+  end
+end

data/lib/svmkit/version.rb CHANGED Viewed

@@ -3,5 +3,5 @@
 # SVMKit is a machine learning library in Ruby.
 module SVMKit
   # @!visibility private
-  VERSION = '0.4.1'.freeze
+  VERSION = '0.5.0'.freeze
 end

data/svmkit.gemspec CHANGED Viewed

@@ -1,4 +1,3 @@
 lib = File.expand_path('lib', __dir__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'svmkit/version'
@@ -18,7 +17,8 @@ SVMKit is a machine learninig library in Ruby.
 SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 SVMKit currently supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm, and cross-validation.
+Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
+K-Means and cross-validation.
 MSG
   spec.homepage      = 'https://github.com/yoshoku/svmkit'
   spec.license       = 'BSD-2-Clause'

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: svmkit
 version: !ruby/object:Gem::Version
-  version: 0.4.1
+  version: 0.5.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-06-08 00:00:00.000000000 Z
+date: 2018-06-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -85,7 +85,8 @@ description: |
   SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
   SVMKit currently supports Linear / Kernel Support Vector Machine,
   Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
-  Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm, and cross-validation.
+  Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
+  K-Means and cross-validation.
 email:
 - yoshoku@outlook.com
 executables: []
@@ -109,10 +110,12 @@ files:
 - lib/svmkit.rb
 - lib/svmkit/base/base_estimator.rb
 - lib/svmkit/base/classifier.rb
+- lib/svmkit/base/cluster_analyzer.rb
 - lib/svmkit/base/evaluator.rb
 - lib/svmkit/base/regressor.rb
 - lib/svmkit/base/splitter.rb
 - lib/svmkit/base/transformer.rb
+- lib/svmkit/clustering/k_means.rb
 - lib/svmkit/dataset.rb
 - lib/svmkit/ensemble/random_forest_classifier.rb
 - lib/svmkit/ensemble/random_forest_regressor.rb
@@ -121,8 +124,10 @@ files:
 - lib/svmkit/evaluation_measure/log_loss.rb
 - lib/svmkit/evaluation_measure/mean_absolute_error.rb
 - lib/svmkit/evaluation_measure/mean_squared_error.rb
+- lib/svmkit/evaluation_measure/normalized_mutual_information.rb
 - lib/svmkit/evaluation_measure/precision.rb
 - lib/svmkit/evaluation_measure/precision_recall.rb
+- lib/svmkit/evaluation_measure/purity.rb
 - lib/svmkit/evaluation_measure/r2_score.rb
 - lib/svmkit/evaluation_measure/recall.rb
 - lib/svmkit/kernel_approximation/rbf.rb