RubyGems - rumale-clustering - Versions diffs - 0.24.0 - Mend

rumale-clustering 0.24.0

Files changed (16) hide show

checksums.yaml +7 -0
data/LICENSE.txt +27 -0
data/README.md +34 -0
data/lib/rumale/clustering/dbscan.rb +126 -0
data/lib/rumale/clustering/gaussian_mixture.rb +215 -0
data/lib/rumale/clustering/hdbscan.rb +289 -0
data/lib/rumale/clustering/k_means.rb +120 -0
data/lib/rumale/clustering/k_medoids.rb +143 -0
data/lib/rumale/clustering/mini_batch_k_means.rb +138 -0
data/lib/rumale/clustering/power_iteration.rb +128 -0
data/lib/rumale/clustering/single_linkage.rb +206 -0
data/lib/rumale/clustering/snn.rb +75 -0
data/lib/rumale/clustering/spectral_clustering.rb +120 -0
data/lib/rumale/clustering/version.rb +10 -0
data/lib/rumale/clustering.rb +15 -0
metadata +93 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 26c8d431fa54beb0ef656cb5c058176ed8b777dcd1075d4ff859c37ca458ab98
+  data.tar.gz: e180764368160a0273fc42e92238beaa25e93ebbbee0766dfb9f0efed2bc80fe
+SHA512:
+  metadata.gz: e5386f87dbed2376c712b9f1e74484f757d0bd6e89b8d1c5455865405f4561ae22f4245863ecc06894202e3bea7373f97c767cd2e182172931eb58c18ee47220
+  data.tar.gz: 52e855b335ea4454850ffc2ab18a2c89c34849bb88f0b59af59073071d803e926c69638241f47d320f40aa06d945b105887e4e9d84ef453d404a43c1825470a5

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,27 @@
+Copyright (c) 2022 Atsushi Tatsuma
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

data/README.md ADDED Viewed

@@ -0,0 +1,34 @@
+# Rumale::Clustering
+[![Gem Version](https://badge.fury.io/rb/rumale-clustering.svg)](https://badge.fury.io/rb/rumale-clustering)
+[![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/rumale-clustering/LICENSE.txt)
+[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/Rumale/Clustering.html)
+Rumale is a machine learning library in Ruby.
+Rumale::Clustering provides cluster analysis algorithms,
+such as K-Means, Gaussian Mixture Model, DBSCAN, and Spectral Clustering,
+with the Rumale interface.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'rumale-clustering'
+```
+And then execute:
+    $ bundle install
+Or install it yourself as:
+    $ gem install rumale-clustering
+## Documentation
+- [Rumale API Documentation - Clustering](https://yoshoku.github.io/rumale/doc/Rumale/Clustering.html)
+## License
+The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).

data/lib/rumale/clustering/dbscan.rb ADDED Viewed

@@ -0,0 +1,126 @@
+# frozen_string_literal: true
+require 'rumale/base/estimator'
+require 'rumale/base/cluster_analyzer'
+require 'rumale/pairwise_metric'
+require 'rumale/validation'
+module Rumale
+  module Clustering
+    # DBSCAN is a class that implements DBSCAN cluster analysis.
+    #
+    # @example
+    #   require 'rumale/clustering/dbscan'
+    #
+    #   analyzer = Rumale::Clustering::DBSCAN.new(eps: 0.5, min_samples: 5)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    # *Reference*
+    # - Ester, M., Kriegel, H-P., Sander, J., and Xu, X., "A density-based algorithm for discovering clusters in large spatial databases with noise," Proc. KDD' 96, pp. 266--231, 1996.
+    class DBSCAN < ::Rumale::Base::Estimator
+      include ::Rumale::Base::ClusterAnalyzer
+      # Return the core sample indices.
+      # @return [Numo::Int32] (shape: [n_core_samples])
+      attr_reader :core_sample_ids
+      # Return the cluster labels. The negative cluster label indicates that the point is noise.
+      # @return [Numo::Int32] (shape: [n_samples])
+      attr_reader :labels
+      # Create a new cluster analyzer with DBSCAN method.
+      #
+      # @param eps [Float] The radius of neighborhood.
+      # @param min_samples [Integer] The number of neighbor samples to be used for the criterion whether a point is a core point.
+      # @param metric [String] The metric to calculate the distances.
+      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
+      #   If metric is 'precomputed', the fit and fit_transform methods expect to be given a distance matrix.
+      def initialize(eps: 0.5, min_samples: 5, metric: 'euclidean')
+        super()
+        @params = {
+          eps: eps,
+          min_samples: min_samples,
+          metric: (metric == 'precomputed' ? 'precomputed' : 'euclidean')
+        }
+      end
+      # Analysis clusters with given training data.
+      #
+      # @overload fit(x) -> DBSCAN
+      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #     If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
+      #   @return [DBSCAN] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        raise ArgumentError, 'the input distance matrix should be square' if check_invalid_array_shape(x)
+        partial_fit(x)
+        self
+      end
+      # Analysis clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
+      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        raise ArgumentError, 'the input distance matrix should be square' if check_invalid_array_shape(x)
+        partial_fit(x)
+        labels
+      end
+      private
+      def check_invalid_array_shape(x)
+        @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
+      end
+      def partial_fit(x)
+        cluster_id = 0
+        metric_mat = calc_pairwise_metrics(x)
+        n_samples = metric_mat.shape[0]
+        @core_sample_ids = []
+        @labels = Numo::Int32.zeros(n_samples) - 2
+        n_samples.times do |query_id|
+          next if @labels[query_id] >= -1
+          cluster_id += 1 if expand_cluster(metric_mat, query_id, cluster_id)
+        end
+        @core_sample_ids = Numo::Int32[*@core_sample_ids.flatten]
+        nil
+      end
+      def calc_pairwise_metrics(x)
+        @params[:metric] == 'precomputed' ? x : ::Rumale::PairwiseMetric.euclidean_distance(x)
+      end
+      def expand_cluster(metric_mat, query_id, cluster_id)
+        target_ids = region_query(metric_mat[query_id, true])
+        if target_ids.size < @params[:min_samples]
+          @labels[query_id] = -1
+          false
+        else
+          @labels[target_ids] = cluster_id
+          @core_sample_ids.push(target_ids.dup)
+          target_ids.delete(query_id)
+          while (m = target_ids.shift)
+            neighbor_ids = region_query(metric_mat[m, true])
+            next if neighbor_ids.size < @params[:min_samples]
+            neighbor_ids.each do |n|
+              target_ids.push(n) if @labels[n] < -1
+              @labels[n] = cluster_id if @labels[n] <= -1
+            end
+          end
+          true
+        end
+      end
+      def region_query(metric_arr)
+        metric_arr.lt(@params[:eps]).where.to_a
+      end
+    end
+  end
+end

data/lib/rumale/clustering/gaussian_mixture.rb ADDED Viewed

@@ -0,0 +1,215 @@
+# frozen_string_literal: true
+require 'rumale/base/estimator'
+require 'rumale/base/cluster_analyzer'
+require 'rumale/utils'
+require 'rumale/validation'
+require 'rumale/clustering/k_means'
+module Rumale
+  module Clustering
+    # GaussianMixture is a class that implements cluster analysis with gaussian mixture model.
+    #
+    # @example
+    #   require 'rumale/clustering/gaussian_mixture'
+    #
+    #   analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 10, max_iter: 50)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    #   # If Numo::Linalg is installed, you can specify 'full' for the type of covariance option.
+    #   require 'numo/linalg/autoloader'
+    #   require 'rumale/clustering/gaussian_mixture'
+    #
+    #   analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 10, max_iter: 50, covariance_type: 'full')
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    class GaussianMixture < ::Rumale::Base::Estimator # rubocop:disable Metrics/ClassLength
+      include ::Rumale::Base::ClusterAnalyzer
+      # Return the number of iterations performed until convergence
+      # (equal to max_iter when the tolerance was not reached).
+      # @return [Integer]
+      attr_reader :n_iter
+      # Return the weight of each cluster.
+      # @return [Numo::DFloat] (shape: [n_clusters])
+      attr_reader :weights
+      # Return the mean of each cluster.
+      # @return [Numo::DFloat] (shape: [n_clusters, n_features])
+      attr_reader :means
+      # Return the covariance of each cluster:
+      # the diagonal elements of the covariance matrix if 'diag', the whole covariance matrix if 'full'.
+      # @return [Numo::DFloat] (shape: [n_clusters, n_features] if 'diag', [n_clusters, n_features, n_features] if 'full')
+      attr_reader :covariances
+      # Create a new cluster analyzer with gaussian mixture model.
+      #
+      # @param n_clusters [Integer] The number of clusters.
+      # @param init [String] The initialization method for centroids ('random' or 'k-means++').
+      #   Any value other than 'random' is treated as 'k-means++'.
+      # @param covariance_type [String] The type of covariance parameter to be used ('diag' or 'full').
+      #   Any value other than 'full' is treated as 'diag'.
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param tol [Float] The tolerance of termination criterion.
+      # @param reg_covar [Float] The non-negative regularization to the diagonal of covariance.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      def initialize(n_clusters: 8, init: 'k-means++', covariance_type: 'diag',
+                     max_iter: 50, tol: 1.0e-4, reg_covar: 1.0e-6, random_seed: nil)
+        super()
+        @params = {
+          n_clusters: n_clusters,
+          init: (init == 'random' ? 'random' : 'k-means++'),
+          covariance_type: (covariance_type == 'full' ? 'full' : 'diag'),
+          max_iter: max_iter,
+          tol: tol,
+          reg_covar: reg_covar,
+          # When no seed is given, the value returned by Kernel#srand is used.
+          random_seed: random_seed || srand
+        }
+      end
+      # Analyze clusters with given training data.
+      #
+      # @overload fit(x) -> GaussianMixture
+      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #   @raise [RuntimeError] If covariance_type is 'full' and Numo::Linalg is not loaded.
+      #   @return [GaussianMixture] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        check_enable_linalg('fit')
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        n_samples = x.shape[0]
+        memberships = init_memberships(x)
+        @params[:max_iter].times do |t|
+          # Count from one so that n_iter reports how many iterations were actually run.
+          @n_iter = t + 1
+          # Update the mixture parameters from the current memberships.
+          @weights = calc_weights(n_samples, memberships)
+          @means = calc_means(x, memberships)
+          @covariances = calc_covariances(x, @means, memberships, @params[:reg_covar], @params[:covariance_type])
+          # Re-estimate the memberships and stop once they no longer change by more than tol.
+          new_memberships = calc_memberships(x, @weights, @means, @covariances, @params[:covariance_type])
+          error = (memberships - new_memberships).abs.max
+          break if error <= @params[:tol]
+          memberships = new_memberships
+        end
+        self
+      end
+      # Predict cluster labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
+      # @raise [RuntimeError] If covariance_type is 'full' and Numo::Linalg is not loaded.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def predict(x)
+        check_enable_linalg('predict')
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        memberships = calc_memberships(x, @weights, @means, @covariances, @params[:covariance_type])
+        assign_cluster(memberships)
+      end
+      # Analyze clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      # @raise [RuntimeError] If covariance_type is 'full' and Numo::Linalg is not loaded.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        check_enable_linalg('fit_predict')
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        fit(x).predict(x)
+      end
+      private
+      # Pick, for each sample (row), the cluster (column) with the largest membership.
+      def assign_cluster(memberships)
+        n_clusters = memberships.shape[1]
+        # The row start offsets are subtracted from the result of max_index,
+        # which turns positions in the flattened array into column indices.
+        row_offsets = Numo::Int32[*0.step(memberships.size - 1, n_clusters)]
+        memberships.max_index(axis: 1) - row_offsets
+      end
+      # Build the initial membership matrix from a KMeans assignment.
+      # NOTE(review): max_iter: 0 presumably limits KMeans to its centroid initialization,
+      # and binarize_labels presumably returns a one-hot matrix -- confirm against those classes.
+      def init_memberships(x)
+        kmeans = ::Rumale::Clustering::KMeans.new(
+          n_clusters: @params[:n_clusters], init: @params[:init], max_iter: 0, random_seed: @params[:random_seed]
+        )
+        cluster_ids = kmeans.fit_predict(x)
+        Numo::DFloat.cast(::Rumale::Utils.binarize_labels(cluster_ids))
+      end
+      # Compute the membership of every sample to every cluster (shape: [n_samples, n_clusters]),
+      # normalized so that each row sums to one.
+      def calc_memberships(x, weights, means, covars, covar_type)
+        n_samples = x.shape[0]
+        n_clusters = means.shape[0]
+        memberships = Numo::DFloat.zeros(n_samples, n_clusters)
+        n_clusters.times do |n|
+          centered = x - means[n, true]
+          covar = covar_type == 'full' ? covars[n, true, true] : covars[n, true]
+          memberships[true, n] = calc_unnormalized_membership(centered, weights[n], covar, covar_type)
+        end
+        memberships / memberships.sum(axis: 1).expand_dims(1)
+      end
+      # Compute the cluster weights as the mean membership of each cluster.
+      def calc_weights(n_samples, memberships)
+        memberships.sum(axis: 0) / n_samples
+      end
+      # Compute the membership-weighted mean vector of each cluster.
+      def calc_means(x, memberships)
+        memberships.transpose.dot(x) / memberships.sum(axis: 0).expand_dims(1)
+      end
+      # Dispatch to the covariance estimator matching covar_type.
+      def calc_covariances(x, means, memberships, reg_covar, covar_type)
+        if covar_type == 'full'
+          calc_full_covariances(x, means, reg_covar, memberships)
+        else
+          calc_diag_covariances(x, means, reg_covar, memberships)
+        end
+      end
+      # Compute the membership-weighted variance per feature for each cluster,
+      # with reg_covar added to every element.
+      def calc_diag_covariances(x, means, reg_covar, memberships)
+        n_clusters = means.shape[0]
+        diag_cov = Array.new(n_clusters) do |n|
+          centered = x - means[n, true]
+          members = memberships[true, n]
+          members.dot(centered**2) / members.sum
+        end
+        Numo::DFloat.asarray(diag_cov) + reg_covar
+      end
+      # Compute the membership-weighted covariance matrix for each cluster,
+      # with reg_covar added to the diagonal.
+      def calc_full_covariances(x, means, reg_covar, memberships)
+        n_features = x.shape[1]
+        n_clusters = means.shape[0]
+        cov_mats = Numo::DFloat.zeros(n_clusters, n_features, n_features)
+        reg_mat = Numo::DFloat.eye(n_features) * reg_covar
+        n_clusters.times do |n|
+          centered = x - means[n, true]
+          members = memberships[true, n]
+          cov_mats[n, true, true] = reg_mat + (centered.transpose * members).dot(centered) / members.sum
+        end
+        cov_mats
+      end
+      # Compute weight * det(covar)^(-1/2) * exp(-0.5 * squared Mahalanobis distance) for each sample.
+      # No (2 * pi) factor is applied here; the result is normalized per sample in calc_memberships.
+      def calc_unnormalized_membership(centered, weight, covar, covar_type)
+        inv_covar = calc_inv_covariance(covar, covar_type)
+        inv_sqrt_det_covar = calc_inv_sqrt_det_covariance(covar, covar_type)
+        distances = if covar_type == 'full'
+                      (centered.dot(inv_covar) * centered).sum(axis: 1)
+                    else
+                      (centered * inv_covar * centered).sum(axis: 1)
+                    end
+        weight * inv_sqrt_det_covar * Numo::NMath.exp(-0.5 * distances)
+      end
+      # Invert the covariance: matrix inverse for 'full', element-wise reciprocal for 'diag'.
+      def calc_inv_covariance(covar, covar_type)
+        if covar_type == 'full'
+          Numo::Linalg.inv(covar)
+        else
+          1.0 / covar
+        end
+      end
+      # Compute 1 / sqrt(det(covar)); for 'diag' the determinant is the product of the diagonal elements.
+      def calc_inv_sqrt_det_covariance(covar, covar_type)
+        if covar_type == 'full'
+          1.0 / Math.sqrt(Numo::Linalg.det(covar))
+        else
+          1.0 / Math.sqrt(covar.prod)
+        end
+      end
+      # Raise a RuntimeError when 'full' covariance is requested but enable_linalg? is false.
+      def check_enable_linalg(method_name)
+        return unless @params[:covariance_type] == 'full' && !enable_linalg?
+        raise "GaussianMixture##{method_name} requires Numo::Linalg when covariance_type is 'full' but that is not loaded."
+      end
+    end
+  end
+end