RubyGems - svmkit - Versions diffs - 0.5.1 → 0.5.2 - Mend

svmkit 0.5.1 → 0.5.2

Files changed (9) hide show

checksums.yaml +4 -4
data/HISTORY.md +3 -0
data/README.md +1 -1
data/lib/svmkit.rb +1 -0
data/lib/svmkit/clustering/dbscan.rb +127 -0
data/lib/svmkit/clustering/k_means.rb +9 -7
data/lib/svmkit/version.rb +1 -1
data/svmkit.gemspec +1 -1
metadata +4 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8f4ee565e18136b7f40832368ef78df514b7390a20929d40efb623d2ba7c0378
-  data.tar.gz: e05f3ff80b02ee41a7ce4c32cf6bc6cc99f30771ae9f719eb4fea716680da229
+  metadata.gz: 917f85878296b940b497f13253e3d3b03047be8f154d554116c2629aaeea55dd
+  data.tar.gz: 16308e4638b15a55843f15b4e0d97886f27aae0cc236c59c590a8f9fe7f0e5c6
 SHA512:
-  metadata.gz: 48cc0e18b0aa8a5ace9ceb07744249b799f61ffc04b177bbbb229529754c6e138f13dcd9f5ff04c2b61a380abcef01a6d0f4c175a86b7829c00aad9b91181521
-  data.tar.gz: 2b84b8983d392015dc30cb69851e4df11c4d1b30e6e521763179e648e239482c0fb508ccacf46e03cbbe8f2765b123df4a595a3f7f2a2384b0d1c0f085cd78fd
+  metadata.gz: d390d3ef0d7b06676e6d3c34479939b4a99ee01472816eacbe49fd3f40224ef5984620dfe6d335fb5b15e7213d3b0d17ba9441766e7cdd08c8bad9bff669db8d
+  data.tar.gz: ab2239c0d1297e18e31940e763875ac24668d8c4c3f30355f06bc5ed305c247ff0328e1d584c5ab70ce77d4d2f946dcc5f72f1eb4c3a25d9b0dcd38e1d246182

data/HISTORY.md CHANGED

@@ -1,3 +1,6 @@
+# 0.5.2
+- Add class for DBSCAN clustering.
 # 0.5.1
 - Fix bug in class probability calculation of DecisionTreeClassifier.

data/README.md CHANGED

@@ -10,7 +10,7 @@ SVMKit provides machine learning algorithms with interfaces similar to Scikit-Le
 SVMKit currently supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
 Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor classifier,
-K-Means and cross-validation.
+K-Means, DBSCAN and cross-validation.
 ## Installation

data/lib/svmkit.rb CHANGED

@@ -38,6 +38,7 @@ require 'svmkit/tree/decision_tree_regressor'
 require 'svmkit/ensemble/random_forest_classifier'
 require 'svmkit/ensemble/random_forest_regressor'
 require 'svmkit/clustering/k_means'
+require 'svmkit/clustering/dbscan'
 require 'svmkit/preprocessing/l2_normalizer'
 require 'svmkit/preprocessing/min_max_scaler'
 require 'svmkit/preprocessing/standard_scaler'

data/lib/svmkit/clustering/dbscan.rb ADDED

@@ -0,0 +1,127 @@
+# frozen_string_literal: true
+require 'svmkit/validation'
+require 'svmkit/base/base_estimator'
+require 'svmkit/base/cluster_analyzer'
+require 'svmkit/pairwise_metric'
+module SVMKit
+  module Clustering
+    # DBSCAN is a class that implements DBSCAN cluster analysis.
+    # The current implementation uses the Euclidean distance for analyzing the clusters.
+    #
+    # @example
+    #   analyzer = SVMKit::Clustering::DBSCAN.new(eps: 0.5, min_samples: 5)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    # *Reference*
+    # - M. Ester, H-P. Kriegel, J. Sander, and X. Xu, "A density-based algorithm for discovering clusters in large spatial databases with noise," Proc. KDD'96, pp. 226--231, 1996.
+    class DBSCAN
+      include Base::BaseEstimator
+      include Base::ClusterAnalyzer
+      include Validation
+      # Return the core sample indices.
+      # @return [Numo::Int32] (shape: [n_core_samples])
+      attr_reader :core_sample_ids
+      # Return the cluster labels. A negative cluster label indicates that the point is noise.
+      # @return [Numo::Int32] (shape: [n_samples])
+      attr_reader :labels
+      # Create a new cluster analyzer with DBSCAN method.
+      #
+      # @param eps [Float] The radius of neighborhood.
+      # @param min_samples [Integer] The minimum number of neighbor samples required for a point to be treated as a core point.
+      def initialize(eps: 0.5, min_samples: 5)
+        check_params_float(eps: eps)
+        check_params_integer(min_samples: min_samples)
+        @params = {}
+        @params[:eps] = eps
+        @params[:min_samples] = min_samples
+        @core_sample_ids = nil
+        @labels = nil
+      end
+      # Analyze clusters with given training data.
+      #
+      # @overload fit(x) -> DBSCAN
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      # @return [DBSCAN] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        check_sample_array(x)
+        partial_fit(x)
+        self
+      end
+      # Analyze clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        check_sample_array(x)
+        partial_fit(x)
+        labels
+      end
+      # Dump marshal data.
+      # @return [Hash] The marshal data.
+      def marshal_dump
+        { params: @params,
+          core_sample_ids: @core_sample_ids,
+          labels: @labels }
+      end
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @core_sample_ids = obj[:core_sample_ids]
+        @labels = obj[:labels]
+        nil
+      end
+      private
+      def partial_fit(x)
+        cluster_id = 0
+        n_samples  = x.shape[0]
+        @core_sample_ids = []
+        @labels = Numo::Int32.zeros(n_samples) - 2
+        n_samples.times do |q|
+          next if @labels[q] >= -1
+          cluster_id += 1 if expand_cluster(x, q, cluster_id)
+        end
+        @core_sample_ids = Numo::Int32[*@core_sample_ids.flatten]
+        nil
+      end
+      def expand_cluster(x, query_id, cluster_id)
+        target_ids = region_query(x[query_id, true], x)
+        if target_ids.size < @params[:min_samples]
+          @labels[query_id] = -1
+          false
+        else
+          @labels[target_ids] = cluster_id
+          @core_sample_ids.push(target_ids.dup)
+          target_ids.delete(query_id)
+          while (m = target_ids.shift)
+            neighbor_ids = region_query(x[m, true], x)
+            next if neighbor_ids.size < @params[:min_samples]
+            neighbor_ids.each do |n|
+              target_ids.push(n) if @labels[n] < -1
+              @labels[n] = cluster_id if @labels[n] <= -1
+            end
+          end
+          true
+        end
+      end
+      def region_query(query, targets)
+        distance_arr = PairwiseMetric.euclidean_distance(query.expand_dims(0), targets)[0, true]
+        distance_arr.lt(@params[:eps]).where.to_a
+      end
+    end
+  end
+end

data/lib/svmkit/clustering/k_means.rb CHANGED

@@ -9,10 +9,11 @@ module SVMKit
   # This module consists of classes that implement cluster analysis methods.
   module Clustering
     # KMeans is a class that implements K-Means cluster analysis.
+    # The current implementation uses the Euclidean distance for analyzing the clusters.
     #
     # @example
     #   analyzer = SVMKit::Clustering::KMeans.new(n_clusters: 10, max_iter: 50)
-    #   cluster_ids = analyzer.fit_predict(samples)
+    #   cluster_labels = analyzer.fit_predict(samples)
     #
     # *Reference*
     # - D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
@@ -38,6 +39,7 @@ module SVMKit
       # @param random_seed [Integer] The seed value used to initialize the random generator.
       def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil)
         check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
+        check_params_float(tol: tol)
         check_params_string(init: init)
         check_params_type_or_nil(Integer, random_seed: random_seed)
         check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
@@ -62,10 +64,10 @@ module SVMKit
         check_sample_array(x)
         init_cluster_centers(x)
         @params[:max_iter].times do |_t|
-          cluster_ids = assign_cluster(x)
+          cluster_labels = assign_cluster(x)
           old_centers = @cluster_centers.dup
           @params[:n_clusters].times do |n|
-            assigned_bits = cluster_ids.eq(n)
+            assigned_bits = cluster_labels.eq(n)
             @cluster_centers[n, true] = x[assigned_bits.where, true].mean(axis: 0) if assigned_bits.count > 0
           end
           error = Numo::NMath.sqrt(((old_centers - @cluster_centers)**2).sum(axis: 1)).mean
@@ -74,10 +76,10 @@ module SVMKit
         self
       end
-      # Predict cluster indices for samples.
+      # Predict cluster labels for samples.
       #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster index.
-      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster index per sample.
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
       def predict(x)
         check_sample_array(x)
         assign_cluster(x)
@@ -86,7 +88,7 @@ module SVMKit
       # Analyze clusters and assign samples to clusters.
       #
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
-      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster index per sample.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
       def fit_predict(x)
         check_sample_array(x)
         fit(x)

data/lib/svmkit/version.rb CHANGED

@@ -3,5 +3,5 @@
 # SVMKit is a machine learning library in Ruby.
 module SVMKit
   # @!visibility private
-  VERSION = '0.5.1'.freeze
+  VERSION = '0.5.2'.freeze
 end

data/svmkit.gemspec CHANGED

@@ -18,7 +18,7 @@ SVMKit provides machine learning algorithms with interfaces similar to Scikit-Le
 SVMKit currently supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
 Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
-K-Means and cross-validation.
+K-Means, DBSCAN and cross-validation.
 MSG
   spec.homepage      = 'https://github.com/yoshoku/svmkit'
   spec.license       = 'BSD-2-Clause'

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: svmkit
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.5.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-06-16 00:00:00.000000000 Z
+date: 2018-06-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -86,7 +86,7 @@ description: |
   SVMKit currently supports Linear / Kernel Support Vector Machine,
   Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
   Naive Bayes, Decision Tree, Random Forest, K-nearest neighbor algorithm,
-  K-Means and cross-validation.
+  K-Means, DBSCAN and cross-validation.
 email:
 - yoshoku@outlook.com
 executables: []
@@ -115,6 +115,7 @@ files:
 - lib/svmkit/base/regressor.rb
 - lib/svmkit/base/splitter.rb
 - lib/svmkit/base/transformer.rb
+- lib/svmkit/clustering/dbscan.rb
 - lib/svmkit/clustering/k_means.rb
 - lib/svmkit/dataset.rb
 - lib/svmkit/ensemble/random_forest_classifier.rb