rumale 0.13.3 → 0.13.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +1 -1
- data/lib/rumale.rb +3 -0
- data/lib/rumale/clustering/hdbscan.rb +282 -0
- data/lib/rumale/clustering/power_iteration.rb +46 -29
- data/lib/rumale/clustering/single_linkage.rb +200 -0
- data/lib/rumale/clustering/spectral_clustering.rb +134 -0
- data/lib/rumale/kernel_machine/kernel_pca.rb +2 -0
- data/lib/rumale/kernel_machine/kernel_ridge.rb +2 -0
- data/lib/rumale/version.rb +1 -1
- data/rumale.gemspec +1 -1
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 522eaabfd67ced29bf275fb6f5cec019ff60e3d5
+  data.tar.gz: 0eb97f58c3764bdcbf448f9a392f8f5091ce418d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bf5a3caf614b08813aa4b11673da758778191847ba6fe4c4144cae7da1dd8e4b3ec3eac1367d54b78a00a7afd5ae1ae047fa84c58954b0e7d0571a9442a10380
+  data.tar.gz: 8bdb25aaec7304f12595673d3fa915cc1739ef14fedd89210205f53325de5a96076b4cf718f8d7ca15fdec1f55d5f9c65cd256781ec2d60462a48221525ad068
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
+# 0.13.4
+- Add cluster analysis class for [HDBSCAN](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/HDBSCAN.html).
+- Add cluster analysis class for [spectral clustering](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/SpectralClustering.html).
+- Refactor power iteration clustering.
+- Several documentation improvements.
+
 # 0.13.3
 - Add transformer class for [Kernel PCA](https://yoshoku.github.io/rumale/doc/Rumale/KernelMachine/KernelPCA.html).
 - Add regressor class for [Kernel Ridge](https://yoshoku.github.io/rumale/doc/Rumale/KernelMachine/KernelRidge.html).
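Both new analyzers follow the same `fit`/`fit_predict` interface as the existing clustering classes. A minimal usage sketch drawn from the `@example` blocks in the files added below, assuming `samples` is a Numo::DFloat of shape [n_samples, n_features]:

```ruby
require 'rumale'

# HDBSCAN returns one label per sample; negative labels mark noise points.
analyzer = Rumale::Clustering::HDBSCAN.new(min_samples: 5)
cluster_labels = analyzer.fit_predict(samples)

# Spectral clustering performs an eigendecomposition, so Numo::Linalg must be loaded first.
require 'numo/linalg/autoloader'
analyzer = Rumale::Clustering::SpectralClustering.new(n_clusters: 10, gamma: 8.0)
cluster_labels = analyzer.fit_predict(samples)
```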
data/README.md
CHANGED
@@ -13,7 +13,7 @@ Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn
 Rumale supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Kernel Ridge, Factorization Machine,
 Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor classifier,
-K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, SNN, Power Iteration Clustering,
+K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, HDBSCAN, SNN, Spectral Clustering, Power Iteration Clustering,
 Mutidimensional Scaling, t-SNE, Principal Component Analysis, Kernel PCA and Non-negative Matrix Factorization.
 
 This project was formerly known as "SVMKit".
data/lib/rumale.rb
CHANGED
@@ -62,8 +62,11 @@ require 'rumale/clustering/k_means'
 require 'rumale/clustering/k_medoids'
 require 'rumale/clustering/gaussian_mixture'
 require 'rumale/clustering/dbscan'
+require 'rumale/clustering/hdbscan'
 require 'rumale/clustering/snn'
 require 'rumale/clustering/power_iteration'
+require 'rumale/clustering/spectral_clustering'
+require 'rumale/clustering/single_linkage'
 require 'rumale/decomposition/pca'
 require 'rumale/decomposition/nmf'
 require 'rumale/manifold/tsne'
data/lib/rumale/clustering/hdbscan.rb
ADDED
@@ -0,0 +1,282 @@
+# frozen_string_literal: true
+
+require 'ostruct'
+require 'rumale/base/base_estimator'
+require 'rumale/base/cluster_analyzer'
+require 'rumale/pairwise_metric'
+require 'rumale/clustering/single_linkage'
+
+module Rumale
+  module Clustering
+    # HDBSCAN is a class that implements HDBSCAN cluster analysis.
+    #
+    # @example
+    #   analyzer = Rumale::Clustering::HDBSCAN.new(min_samples: 5)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    # *Reference*
+    # - R J. G. B. Campello, D. Moulavi, A. Zimek, and J. Sander, "Hierarchical Density Estimates for Data Clustering, Visualization, and Outlier Detection," TKDD, Vol. 10 (1), pp. 5:1--5:51, 2015.
+    # - R J. G. B. Campello, D. Moulavi, and J Sander, "Density-Based Clustering Based on Hierarchical Density Estimates," Proc. PAKDD'13, pp. 160--172, 2013.
+    # - L. Lelis and J. Sander, "Semi-Supervised Density-Based Clustering," Proc. ICDM'09, pp. 842--847, 2009.
+    class HDBSCAN
+      include Base::BaseEstimator
+      include Base::ClusterAnalyzer
+
+      # Return the cluster labels. The negative cluster label indicates that the point is noise.
+      # @return [Numo::Int32] (shape: [n_samples])
+      attr_reader :labels
+
+      # Create a new cluster analyzer with HDBSCAN algorithm.
+      #
+      # @param min_samples [Integer] The number of neighbor samples to be used for the criterion whether a point is a core point.
+      # @param min_cluster_size [Integer/Nil] The minimum size of cluster. If nil is given, it is set equal to min_samples.
+      # @param metric [String] The metric to calculate the distances.
+      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
+      #   If metric is 'precomputed', the fit and fit_transform methods expect to be given a distance matrix.
+      def initialize(min_samples: 10, min_cluster_size: nil, metric: 'euclidean')
+        check_params_integer(min_samples: min_samples)
+        check_params_type_or_nil(Integer, min_cluster_size: min_cluster_size)
+        check_params_string(metric: metric)
+        check_params_positive(min_samples: min_samples)
+        @params = {}
+        @params[:min_samples] = min_samples
+        @params[:min_cluster_size] = min_cluster_size || min_samples
+        @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
+        @labels = nil
+      end
+
+      # Analysis clusters with given training data.
+      #
+      # @overload fit(x) -> HDBSCAN
+      #
+      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #     If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
+      #   @return [HDBSCAN] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        check_sample_array(x)
+        raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
+        fit_predict(x)
+        self
+      end
+
+      # Analysis clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
+      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        check_sample_array(x)
+        raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
+        distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
+        @labels = partial_fit(distance_mat)
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data.
+      def marshal_dump
+        { params: @params,
+          labels: @labels }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @labels = obj[:labels]
+        nil
+      end
+
+      private
+
+      # @!visibility private
+      class UnionFind
+        def initialize(n)
+          @parent = Numo::Int32.new(n).seq
+          @rank = Numo::Int32.zeros(n)
+        end
+
+        # @!visibility private
+        def union(x, y)
+          x_root = find(x)
+          y_root = find(y)
+
+          return if x_root == y_root
+
+          # :nocov:
+          if @rank[x_root] < @rank[y_root]
+            @parent[x_root] = y_root
+          else
+            @parent[y_root] = x_root
+            @rank[x_root] += 1 if @rank[x_root] == @rank[y_root]
+          end
+          # :nocov:
+
+          nil
+        end
+
+        # @!visibility private
+        def find(x)
+          @parent[x] = find(@parent[x]) if @parent[x] != x
+          @parent[x]
+        end
+      end
+
+      private_constant :UnionFind
+
+      def partial_fit(distance_mat)
+        mr_distance_mat = mutual_reachability_distances(distance_mat, @params[:min_samples])
+        hierarchy = Rumale::Clustering::SingleLinkage.new(n_clusters: 1, metric: 'precomputed').fit(mr_distance_mat).hierarchy
+        tree = condense_tree(hierarchy, @params[:min_cluster_size])
+        stabilities = cluster_stability(tree)
+        flatten(tree, stabilities)
+      end
+
+      def mutual_reachability_distances(distance_mat, min_samples)
+        core_distances = distance_mat.sort(axis: 1)[true, min_samples + 1]
+        Numo::DFloat.maximum(core_distances.expand_dims(1), Numo::DFloat.maximum(core_distances, distance_mat))
+      end
+
+      def breadth_first_search_hierarchy(hierarchy, root)
+        n_edges = hierarchy.size
+        n_points = n_edges + 1
+        to_process = [root]
+        res = []
+        while to_process.any?
+          res.concat(to_process)
+          to_process = to_process.select { |n| n >= n_points }.map { |n| n - n_points }
+          to_process = to_process.map { |n| [hierarchy[n].x, hierarchy[n].y] }.flatten if to_process.any?
+        end
+        res
+      end
+
+      def condense_tree(hierarchy, min_cluster_size)
+        n_edges = hierarchy.size
+        root = 2 * n_edges
+        n_points = n_edges + 1
+        next_label = n_points + 1
+
+        node_ids = breadth_first_search_hierarchy(hierarchy, root)
+
+        relabel = Numo::Int32.zeros(root + 1)
+        relabel[root] = n_points
+        res = []
+        visited = {}
+
+        node_ids.each do |n_id|
+          next if visited[n_id] || n_id < n_points
+
+          edge = hierarchy[n_id - n_points]
+
+          density = edge.weight > 0.0 ? 1.fdiv(edge.weight) : Float::INFINITY
+          n_x_elements = edge.x >= n_points ? hierarchy[edge.x - n_points].n_elements : 1
+          n_y_elements = edge.y >= n_points ? hierarchy[edge.y - n_points].n_elements : 1
+
+          if n_x_elements >= min_cluster_size && n_y_elements >= min_cluster_size
+            relabel[edge.x] = next_label
+            res.push(OpenStruct.new(x: relabel[n_id], y: relabel[edge.x], weight: density, n_elements: n_x_elements))
+            next_label += 1
+            relabel[edge.y] = next_label
+            res.push(OpenStruct.new(x: relabel[n_id], y: relabel[edge.y], weight: density, n_elements: n_y_elements))
+            next_label += 1
+          elsif n_x_elements < min_cluster_size && n_y_elements < min_cluster_size
+            breadth_first_search_hierarchy(hierarchy, edge.x).each do |sn_id|
+              res.push(OpenStruct.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
+              visited[sn_id] = true
+            end
+            breadth_first_search_hierarchy(hierarchy, edge.y).each do |sn_id|
+              res.push(OpenStruct.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
+              visited[sn_id] = true
+            end
+          elsif n_x_elements < min_cluster_size
+            relabel[edge.y] = relabel[n_id]
+            breadth_first_search_hierarchy(hierarchy, edge.x).each do |sn_id|
+              res.push(OpenStruct.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
+              visited[sn_id] = true
+            end
+          elsif n_y_elements < min_cluster_size
+            relabel[edge.x] = relabel[n_id]
+            breadth_first_search_hierarchy(hierarchy, edge.y).each do |sn_id|
+              res.push(OpenStruct.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
+              visited[sn_id] = true
+            end
+          end
+        end
+        res
+      end
+
+      def cluster_stability(tree)
+        tree.sort! { |a, b| a.weight <=> b.weight }
+
+        root = tree.map(&:x).min
+        child_max = tree.map(&:y).max
+        child_max = root if child_max < root
+        densities = Numo::DFloat.zeros(child_max + 1) + Float::INFINITY
+
+        current = tree[0].y
+        density_min = tree[0].weight
+        tree.each do |edge|
+          if edge.x == current
+            density_min = [density_min, edge.weight].min
+          else
+            densities[current] = density_min
+            current = edge.y
+            density_min = edge.weight
+          end
+        end
+
+        densities[current] = density_min if current != tree[0].y
+        densities[root] = 0.0
+
+        tree.each_with_object({}) do |edge, stab|
+          stab[edge.x] ||= 0.0
+          stab[edge.x] += (edge.weight - densities[edge.x]) * edge.n_elements
+        end
+      end
+
+      def breadth_first_search_tree(tree, root)
+        to_process = [root]
+        res = []
+        while to_process.any?
+          res.concat(to_process)
+          to_process = tree.select { |v| to_process.include?(v.x) }.map(&:y)
+        end
+        res
+      end
+
+      def flatten(tree, stabilities)
+        node_ids = stabilities.keys.sort { |a, b| b <=> a }.slice(0, stabilities.size - 1)
+
+        cluster_tree = tree.select { |edge| edge.n_elements > 1 }
+        is_cluster = node_ids.each_with_object({}) { |n_id, h| h[n_id] = true }
+
+        node_ids.each do |n_id|
+          children = cluster_tree.select { |node| node.x == n_id }.map(&:y)
+          subtree_stability = children.inject(0.0) { |sum, c_id| sum + stabilities[c_id] }
+          if subtree_stability > stabilities[n_id]
+            is_cluster[n_id] = false
+            stabilities[n_id] = subtree_stability
+          else
+            breadth_first_search_tree(cluster_tree, n_id).each do |sn_id|
+              is_cluster[sn_id] = false if sn_id != n_id
+            end
+          end
+        end
+
+        cluster_label_map = {}
+        is_cluster.select { |_k, v| v == true }.keys.uniq.sort.each_with_index { |n_idx, c_idx| cluster_label_map[n_idx] = c_idx }
+
+        parent_arr = tree.map(&:x)
+        uf = UnionFind.new(parent_arr.max + 1)
+        tree.each { |edge| uf.union(edge.x, edge.y) if cluster_label_map[edge.y].nil? }
+
+        root = parent_arr.min
+        res = Numo::Int32.zeros(root)
+        root.times do |n|
+          cluster = uf.find(n)
+          res[n] = cluster < root ? -1 : cluster_label_map[cluster] || -1
+        end
+        res
+      end
+    end
+  end
+end
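HDBSCAN builds its single-linkage hierarchy over mutual reachability distances rather than raw distances: mr(a, b) = max(core(a), core(b), d(a, b)), where core(p) is read from column min_samples + 1 of p's sorted distance row (column 0 holds the point itself). A self-contained sketch of that step, mirroring the private `mutual_reachability_distances` helper above; the toy distance values are hypothetical:

```ruby
require 'numo/narray'

# Hypothetical symmetric distance matrix for four points.
d = Numo::DFloat[[0.0, 1.0, 4.0, 5.0],
                 [1.0, 0.0, 3.0, 6.0],
                 [4.0, 3.0, 0.0, 2.0],
                 [5.0, 6.0, 2.0, 0.0]]

min_samples = 1
# Core distance of each point: column (min_samples + 1) of its sorted distance row.
core = d.sort(axis: 1)[true, min_samples + 1]
# Mutual reachability distance: max(core(a), core(b), d(a, b)) for every pair,
# obtained by broadcasting the core distances over the rows and columns of d.
mr = Numo::DFloat.maximum(core.expand_dims(1), Numo::DFloat.maximum(core, d))
```

The resulting matrix is handed to `SingleLinkage` with `metric: 'precomputed'`, and the condensed tree is then flattened by cluster stability as in the Campello et al. references.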
data/lib/rumale/clustering/power_iteration.rb
CHANGED
@@ -22,6 +22,10 @@ module Rumale
       # @return [Numo::DFloat] (shape: [n_samples])
       attr_reader :embedding
 
+      # Return the cluster labels.
+      # @return [Numo::Int32] (shape: [n_samples])
+      attr_reader :labels
+
       # Return the number of iterations run for optimization
       # @return [Integer]
       attr_reader :n_iter
@@ -55,12 +59,13 @@ module Rumale
         @params[:random_seed] = random_seed
         @params[:random_seed] ||= srand
         @embedding = nil
+        @labels = nil
         @n_iter = nil
       end
 
       # Analysis clusters with given training data.
       #
-      # @overload fit(x) ->
+      # @overload fit(x) -> PowerIteration
       #
       #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
       #     If the metric is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
@@ -68,28 +73,7 @@ module Rumale
       def fit(x, _y = nil)
         check_sample_array(x)
         raise ArgumentError, 'Expect the input affinity matrix to be square.' if @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]
-
-        affinity_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.rbf_kernel(x, nil, @params[:gamma])
-        affinity_mat[affinity_mat.diag_indices] = 0.0
-        n_samples = affinity_mat.shape[0]
-        tol = @params[:tol].fdiv(n_samples)
-        # calculate normalized affinity matrix.
-        degrees = affinity_mat.sum(axis: 1)
-        normalized_affinity_mat = (1.0 / degrees).diag.dot(affinity_mat)
-        # initialize embedding space.
-        @embedding = degrees / degrees.sum
-        # optimization
-        @n_iter = 0
-        error = Numo::DFloat.ones(n_samples)
-        @params[:max_iter].times do |t|
-          @n_iter = t + 1
-          new_embedding = normalized_affinity_mat.dot(@embedding)
-          new_embedding /= new_embedding.abs.sum
-          new_error = (new_embedding - @embedding).abs
-          break if (new_error - error).abs.max <= tol
-          @embedding = new_embedding
-          error = new_error
-        end
+        fit_predict(x)
         self
       end
 
@@ -100,12 +84,11 @@ module Rumale
       # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
       def fit_predict(x)
         check_sample_array(x)
-
-
-
-
-        )
-        kmeans.fit_predict(@embedding.expand_dims(1))
+        raise ArgumentError, 'Expect the input affinity matrix to be square.' if @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]
+
+        affinity_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.rbf_kernel(x, nil, @params[:gamma])
+        @embedding, @n_iter = embedded_space(affinity_mat, @params[:max_iter], @params[:tol].fdiv(affinity_mat.shape[0]))
+        @labels = line_kmeans_clustering(@embedding)
       end
 
       # Dump marshal data.
@@ -113,6 +96,7 @@ module Rumale
       def marshal_dump
         { params: @params,
          embedding: @embedding,
+          labels: @labels,
          n_iter: @n_iter }
       end
 
@@ -121,9 +105,42 @@ module Rumale
       def marshal_load(obj)
         @params = obj[:params]
         @embedding = obj[:embedding]
+        @labels = obj[:labels]
         @n_iter = obj[:n_iter]
         nil
       end
+
+      private
+
+      def embedded_space(affinity_mat, max_iter, tol)
+        affinity_mat[affinity_mat.diag_indices] = 0.0
+
+        degrees = affinity_mat.sum(axis: 1)
+        normalized_affinity_mat = (1.0 / degrees).diag.dot(affinity_mat)
+
+        iters = 0
+        embedded_line = degrees / degrees.sum
+        n_samples = embedded_line.shape[0]
+        error = Numo::DFloat.ones(n_samples)
+        max_iter.times do |t|
+          iters = t + 1
+          new_embedded_line = normalized_affinity_mat.dot(embedded_line)
+          new_embedded_line /= new_embedded_line.abs.sum
+          new_error = (new_embedded_line - embedded_line).abs
+          break if (new_error - error).abs.max <= tol
+          embedded_line = new_embedded_line
+          error = new_error
+        end
+
+        [embedded_line, iters]
+      end
+
+      def line_kmeans_clustering(vec)
+        Rumale::Clustering::KMeans.new(
+          n_clusters: @params[:n_clusters], init: @params[:init],
+          max_iter: @params[:max_iter], tol: @params[:tol], random_seed: @params[:random_seed]
+        ).fit_predict(vec.expand_dims(1))
+      end
     end
   end
 end
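The refactor moves the optimization loop into the private `embedded_space` helper, lets `fit` delegate to `fit_predict`, and stores the assignments in the new `labels` reader. A short usage sketch; `samples` is assumed to be a Numo::DFloat, and the keyword arguments simply mirror the `@params` keys visible in the diff:

```ruby
require 'rumale'

analyzer = Rumale::Clustering::PowerIteration.new(n_clusters: 3, gamma: 32.0)
cluster_labels = analyzer.fit_predict(samples)

analyzer.embedding # Numo::DFloat: the one-dimensional embedded space
analyzer.labels    # Numo::Int32: the same assignments returned by fit_predict
```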
data/lib/rumale/clustering/single_linkage.rb
ADDED
@@ -0,0 +1,200 @@
+# frozen_string_literal: true
+
+require 'ostruct'
+require 'rumale/base/base_estimator'
+require 'rumale/base/cluster_analyzer'
+require 'rumale/pairwise_metric'
+
+module Rumale
+  module Clustering
+    # SingleLinkage is a class that implements hierarchical cluster analysis with single linkage method.
+    # This class is used internally for HDBSCAN.
+    #
+    # @example
+    #   analyzer = Rumale::Clustering::SingleLinkage.new(n_clusters: 2)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    # *Reference*
+    # - D. Mullner, "Modern hierarchical, agglomerative clustering algorithms," arXiv:1109.2378, 2011.
+    class SingleLinkage
+      include Base::BaseEstimator
+      include Base::ClusterAnalyzer
+
+      # Return the cluster labels.
+      # @return [Numo::Int32] (shape: [n_samples])
+      attr_reader :labels
+
+      # Return the hierarchical structure.
+      # @return [Array<OpenStruct>] (shape: [n_samples - 1])
+      attr_reader :hierarchy
+
+      # Create a new cluster analyzer with single linkage algorithm.
+      #
+      # @param n_clusters [Integer] The number of clusters.
+      # @param metric [String] The metric to calculate the distances.
+      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
+      #   If metric is 'precomputed', the fit and fit_transform methods expect to be given a distance matrix.
+      def initialize(n_clusters: 2, metric: 'euclidean')
+        check_params_integer(n_clusters: n_clusters)
+        check_params_string(metric: metric)
+        @params = {}
+        @params[:n_clusters] = n_clusters
+        @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
+        @labels = nil
+        @hierarchy = nil
+      end
+
+      # Analysis clusters with given training data.
+      #
+      # @overload fit(x) -> SingleLinkage
+      #
+      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #     If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
+      #   @return [SingleLinkage] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        check_sample_array(x)
+        raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
+        fit_predict(x)
+        self
+      end
+
+      # Analysis clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
+      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        check_sample_array(x)
+        raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
+        distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
+        @labels = partial_fit(distance_mat)
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data.
+      def marshal_dump
+        { params: @params,
+          labels: @labels,
+          hierarchy: @hierarchy }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @labels = obj[:labels]
+        @hierarchy = obj[:hierarchy]
+        nil
+      end
+
+      private
+
+      # @!visibility private
+      class UnionFind
+        def initialize(n)
+          @parent = Numo::Int32.zeros(2 * n - 1) - 1
+          @size = Numo::Int32.hstack([Numo::Int32.ones(n), Numo::Int32.zeros(n - 1)])
+          @next_label = n
+        end
+
+        # @!visibility private
+        def union(x, y)
+          size = @size[x] + @size[y]
+          @parent[x] = @next_label
+          @parent[y] = @next_label
+          @size[@next_label] = size
+          @next_label += 1
+          size
+        end
+
+        # @!visibility private
+        def find(x)
+          p = x
+          x = @parent[x] while @parent[x] != -1
+          while @parent[p] != x
+            p = @parent[p]
+            @parent[p] = x
+          end
+          x
+        end
+      end
+
+      private_constant :UnionFind
+
+      def partial_fit(distance_mat)
+        mst = minimum_spanning_tree(distance_mat)
+        @hierarchy = single_linkage_hierarchy(mst)
+        flatten(@hierarchy, @params[:n_clusters])
+      end
+
+      def minimum_spanning_tree(complete_graph)
+        n_samples = complete_graph.shape[0]
+        n_edges = n_samples - 1
+        curr_weights = Numo::DFloat.zeros(n_samples) + Float::INFINITY
+        curr_labels = Numo::Int32.new(n_samples).seq
+        next_node = 0
+        mst = Array.new(n_edges) do
+          curr_node = next_node
+          target = curr_labels.ne(curr_node)
+          curr_labels = curr_labels[target]
+          curr_weights = Numo::DFloat.minimum(curr_weights[target], complete_graph[curr_node, curr_labels])
+          next_node = curr_labels[curr_weights.min_index]
+          weight = curr_weights.min
+          OpenStruct.new(x: curr_node, y: next_node, weight: weight)
+        end
+        mst.sort! { |a, b| a.weight <=> b.weight }
+      end
+
+      def single_linkage_hierarchy(mst)
+        n_edges = mst.size
+        n_nodes = n_edges + 1
+        uf = UnionFind.new(n_nodes)
+        Array.new(n_edges) do |n|
+          x_root = uf.find(mst[n].x)
+          y_root = uf.find(mst[n].y)
+          x_root, y_root = [y_root, x_root] unless x_root < y_root
+          weight = mst[n].weight
+          n_samples = uf.union(x_root, y_root)
+          OpenStruct.new(x: x_root, y: y_root, weight: weight, n_elements: n_samples)
+        end
+      end
+
+      def descedent_ids(hierarchy_, start_node)
+        n_samples = hierarchy_.size + 1
+        return [start_node] if start_node < n_samples
+
+        res = []
+        indices = [start_node]
+        n_indices = 1
+        while n_indices.positive?
+          idx = indices.pop
+          if idx < n_samples
+            res.push(idx)
+            n_indices -= 1
+          else
+            indices.push(hierarchy_[idx - n_samples].x)
+            indices.push(hierarchy_[idx - n_samples].y)
+            n_indices += 1
+          end
+        end
+        res
+      end
+
+      def flatten(hierarchy_, n_clusters)
+        n_samples = hierarchy_.size + 1
+        return Numo::Int32.zeros(n_samples) if n_clusters < 2
+
+        nodes = [-([hierarchy_[-1].x, hierarchy_[-1].y].max + 1)]
+        (n_clusters - 1).times do
+          children = hierarchy_[-nodes[0] - n_samples]
+          nodes.push(-children.x)
+          nodes.push(-children.y)
+          nodes.sort!.shift
+        end
+        res = Numo::Int32.zeros(n_samples)
+        nodes.each_with_index { |sid, cluster_id| res[descedent_ids(hierarchy_, -sid)] = cluster_id }
+        res
+      end
+    end
+  end
+end
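Besides flat labels, SingleLinkage exposes the dendrogram it builds from the minimum spanning tree. A small sketch based on the `@example` above; `samples` is assumed to be a Numo::DFloat:

```ruby
require 'rumale'

analyzer = Rumale::Clustering::SingleLinkage.new(n_clusters: 2)
cluster_labels = analyzer.fit_predict(samples)

# hierarchy holds n_samples - 1 OpenStruct edges in order of increasing merge distance.
analyzer.hierarchy.each do |edge|
  # edge.x, edge.y  : ids of the merged nodes
  # edge.weight     : distance at which they were merged
  # edge.n_elements : number of points in the merged cluster
  puts format('%d + %d -> %d points at distance %.4f', edge.x, edge.y, edge.n_elements, edge.weight)
end
```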
data/lib/rumale/clustering/spectral_clustering.rb
ADDED
@@ -0,0 +1,134 @@
+# frozen_string_literal: true
+
+require 'rumale/base/base_estimator'
+require 'rumale/base/cluster_analyzer'
+require 'rumale/pairwise_metric'
+require 'rumale/preprocessing/l2_normalizer'
+
+module Rumale
+  module Clustering
+    # SpectralClustering is a class that implements the normalized spectral clustering.
+    #
+    # @example
+    #   require 'numo/linalg/autoloader'
+    #
+    #   analyzer = Rumale::Clustering::SpectralClustering.new(n_clusters: 10, gamma: 8.0)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    # *Reference*
+    # - A Y. Ng, M I. Jordan, and Y. Weiss, "On Spectral Clustering: Analysis and an algorithm," Proc. NIPS'01, pp. 849--856, 2001.
+    # - U von Luxburg, "A tutorial on spectral clustering," Statistics and Computing, Vol. 17 (4), pp. 395--416, 2007.
+    class SpectralClustering
+      include Base::BaseEstimator
+      include Base::ClusterAnalyzer
+
+      # Return the data in embedded space.
+      # @return [Numo::DFloat] (shape: [n_samples, n_clusters])
+      attr_reader :embedding
+
+      # Return the cluster labels.
+      # @return [Numo::Int32] (shape: [n_samples])
+      attr_reader :labels
+
+      # Create a new cluster analyzer with normalized spectral clustering.
+      #
+      # @param n_clusters [Integer] The number of clusters.
+      # @param affinity [String] The representation of affinity matrix ('rbf' or 'precomputed').
+      #   If affinity = 'rbf', the class performs the normalized spectral clustering with the fully connected graph weighted by rbf kernel.
+      # @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
+      #   If affinity = 'precomputed', this parameter is ignored.
+      # @param init [String] The initialization method for centroids of K-Means clustering ('random' or 'k-means++').
+      # @param max_iter [Integer] The maximum number of iterations for K-Means clustering.
+      # @param tol [Float] The tolerance of termination criterion for K-Means clustering.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      def initialize(n_clusters: 2, affinity: 'rbf', gamma: nil, init: 'k-means++', max_iter: 10, tol: 1.0e-8, random_seed: nil)
+        check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
+        check_params_float(tol: tol)
+        check_params_string(affinity: affinity, init: init)
+        check_params_type_or_nil(Float, gamma: gamma)
+        check_params_type_or_nil(Integer, random_seed: random_seed)
+        check_params_positive(n_clusters: n_clusters, max_iter: max_iter, tol: tol)
+        @params = {}
+        @params[:n_clusters] = n_clusters
+        @params[:affinity] = affinity
+        @params[:gamma] = gamma
+        @params[:init] = init == 'random' ? 'random' : 'k-means++'
+        @params[:max_iter] = max_iter
+        @params[:tol] = tol
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @embedding = nil
+        @labels = nil
+      end
+
+      # Analysis clusters with given training data.
+      # To execute this method, Numo::Linalg must be loaded.
+      #
+      # @overload fit(x) -> SpectralClustering
+      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #     If the metric is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
+      # @return [SpectralClustering] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        check_sample_array(x)
+        raise ArgumentError, 'Expect the input affinity matrix to be square.' if @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]
+        raise 'SpectralClustering#fit requires Numo::Linalg but that is not loaded.' unless enable_linalg?
+
+        fit_predict(x)
+        self
+      end
+
+      # Analysis clusters and assign samples to clusters.
+      # To execute this method, Numo::Linalg must be loaded.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #   If the metric is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        check_sample_array(x)
+        raise ArgumentError, 'Expect the input affinity matrix to be square.' if @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]
+        raise 'SpectralClustering#fit_predict requires Numo::Linalg but that is not loaded.' unless enable_linalg?
+
+        affinity_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.rbf_kernel(x, nil, @params[:gamma])
+        @embedding = embedded_space(affinity_mat, @params[:n_clusters])
+        normalized_embedding = Rumale::Preprocessing::L2Normalizer.new.fit_transform(@embedding)
+        @labels = kmeans_clustering(normalized_embedding)
+      end
+
+      # Dump marshal data.
+      # @return [Hash] The marshal data.
+      def marshal_dump
+        { params: @params,
+          embedding: @embedding,
+          labels: @labels }
+      end
+
+      # Load marshal data.
+      # @return [nil]
+      def marshal_load(obj)
+        @params = obj[:params]
+        @embedding = obj[:embedding]
+        @labels = obj[:labels]
+        nil
+      end
+
+      private
+
+      def embedded_space(affinity_mat, n_clusters)
+        affinity_mat[affinity_mat.diag_indices] = 0.0
+        degrees = 1.0 / Numo::NMath.sqrt(affinity_mat.sum(axis: 1))
+        laplacian_mat = degrees.diag.dot(affinity_mat).dot(degrees.diag)
+
+        n_samples = affinity_mat.shape[0]
+        _, eig_vecs = Numo::Linalg.eigh(laplacian_mat, vals_range: (n_samples - n_clusters)...n_samples)
+        eig_vecs.reverse(1).dup
+      end
+
+      def kmeans_clustering(x)
+        Rumale::Clustering::KMeans.new(
+          n_clusters: @params[:n_clusters], init: @params[:init],
+          max_iter: @params[:max_iter], tol: @params[:tol], random_seed: @params[:random_seed]
+        ).fit_predict(x)
+      end
+    end
+  end
+end
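Like the Kernel PCA and Kernel Ridge examples below, the eigendecomposition in `embedded_space` relies on Numo::Linalg, so a backend has to be loaded before calling `fit` or `fit_predict`. A minimal sketch repeating the `@example` above and reading the learned attributes; `samples` is assumed to be a Numo::DFloat of shape [n_samples, n_features]:

```ruby
require 'rumale'
require 'numo/linalg/autoloader'

analyzer = Rumale::Clustering::SpectralClustering.new(n_clusters: 10, gamma: 8.0)
cluster_labels = analyzer.fit_predict(samples)

analyzer.embedding # Numo::DFloat [n_samples, n_clusters]: the spectral embedding
analyzer.labels    # Numo::Int32 [n_samples]: k-means result on the row-normalized embedding
```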
data/lib/rumale/kernel_machine/kernel_pca.rb
CHANGED
@@ -8,6 +8,8 @@ module Rumale
     # KernelPCA is a class that implements Kernel Principal Component Analysis.
     #
     # @example
+    #   require 'numo/linalg/autoloader'
+    #
     #   kernel_mat_train = Rumale::PairwiseMetric::rbf_kernel(training_samples)
     #   kpca = Rumale::KernelMachine::KernelPCA(n_components: 2)
     #   mapped_traininig_samples = kpca.fit_transform(kernel_mat_train)
data/lib/rumale/kernel_machine/kernel_ridge.rb
CHANGED
@@ -8,6 +8,8 @@ module Rumale
     # KernelRidge is a class that implements kernel ridge regression.
     #
     # @example
+    #   require 'numo/linalg/autoloader'
+    #
     #   kernel_mat_train = Rumale::PairwiseMetric::rbf_kernel(training_samples)
     #   kridge = Rumale::KernelMachine::KernelRidge.new(reg_param: 1.0)
     #   kridge.fit(kernel_mat_train, traininig_values)
data/lib/rumale/version.rb
CHANGED
data/rumale.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
     Rumale currently supports Linear / Kernel Support Vector Machine,
     Logistic Regression, Linear Regression, Ridge, Lasso, Kernel Ridge, Factorization Machine,
     Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
-    K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, SNN, Power Iteration Clustering,
+    K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, HDBSCAN, SNN, Spectral Clustering, Power Iteration Clustering,
     Multidimensional Scaling, t-SNE, Principal Component Analysis, Kernel PCA, and Non-negative Matrix Factorization.
   MSG
   spec.homepage = 'https://github.com/yoshoku/rumale'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.13.3
+  version: 0.13.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-09-
+date: 2019-09-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -128,7 +128,7 @@ description: |
   Rumale currently supports Linear / Kernel Support Vector Machine,
   Logistic Regression, Linear Regression, Ridge, Lasso, Kernel Ridge, Factorization Machine,
   Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
-  K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, SNN, Power Iteration Clustering,
+  K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, HDBSCAN, SNN, Spectral Clustering, Power Iteration Clustering,
   Multidimensional Scaling, t-SNE, Principal Component Analysis, Kernel PCA, and Non-negative Matrix Factorization.
 email:
 - yoshoku@outlook.com
@@ -163,10 +163,13 @@ files:
 - lib/rumale/base/transformer.rb
 - lib/rumale/clustering/dbscan.rb
 - lib/rumale/clustering/gaussian_mixture.rb
+- lib/rumale/clustering/hdbscan.rb
 - lib/rumale/clustering/k_means.rb
 - lib/rumale/clustering/k_medoids.rb
 - lib/rumale/clustering/power_iteration.rb
+- lib/rumale/clustering/single_linkage.rb
 - lib/rumale/clustering/snn.rb
+- lib/rumale/clustering/spectral_clustering.rb
 - lib/rumale/dataset.rb
 - lib/rumale/decomposition/nmf.rb
 - lib/rumale/decomposition/pca.rb