RubyGems - rumale-clustering - Versions diffs - 0.24.0 - Mend

rumale-clustering 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +7 -0
data/LICENSE.txt +27 -0
data/README.md +34 -0
data/lib/rumale/clustering/dbscan.rb +126 -0
data/lib/rumale/clustering/gaussian_mixture.rb +215 -0
data/lib/rumale/clustering/hdbscan.rb +289 -0
data/lib/rumale/clustering/k_means.rb +120 -0
data/lib/rumale/clustering/k_medoids.rb +143 -0
data/lib/rumale/clustering/mini_batch_k_means.rb +138 -0
data/lib/rumale/clustering/power_iteration.rb +128 -0
data/lib/rumale/clustering/single_linkage.rb +206 -0
data/lib/rumale/clustering/snn.rb +75 -0
data/lib/rumale/clustering/spectral_clustering.rb +120 -0
data/lib/rumale/clustering/version.rb +10 -0
data/lib/rumale/clustering.rb +15 -0
metadata +93 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 26c8d431fa54beb0ef656cb5c058176ed8b777dcd1075d4ff859c37ca458ab98
+  data.tar.gz: e180764368160a0273fc42e92238beaa25e93ebbbee0766dfb9f0efed2bc80fe
+SHA512:
+  metadata.gz: e5386f87dbed2376c712b9f1e74484f757d0bd6e89b8d1c5455865405f4561ae22f4245863ecc06894202e3bea7373f97c767cd2e182172931eb58c18ee47220
+  data.tar.gz: 52e855b335ea4454850ffc2ab18a2c89c34849bb88f0b59af59073071d803e926c69638241f47d320f40aa06d945b105887e4e9d84ef453d404a43c1825470a5

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,27 @@
+Copyright (c) 2022 Atsushi Tatsuma
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

data/README.md ADDED Viewed

@@ -0,0 +1,34 @@
+# Rumale::Clustering
+[![Gem Version](https://badge.fury.io/rb/rumale-clustering.svg)](https://badge.fury.io/rb/rumale-clustering)
+[![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/main/rumale-clustering/LICENSE.txt)
+[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://yoshoku.github.io/rumale/doc/Rumale/Clustering.html)
+Rumale is a machine learning library in Ruby.
+Rumale::Clustering provides cluster analysis algorithms,
+such as K-Means, Gaussian Mixture Model, DBSCAN, and Spectral Clustering,
+with Rumale interface.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'rumale-clustering'
+```
+And then execute:
+    $ bundle install
+Or install it yourself as:
+    $ gem install rumale-clustering
+## Documentation
+- [Rumale API Documentation - Clustering](https://yoshoku.github.io/rumale/doc/Rumale/Clustering.html)
+## License
+The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).

data/lib/rumale/clustering/dbscan.rb ADDED Viewed

@@ -0,0 +1,126 @@
+# frozen_string_literal: true
+require 'rumale/base/estimator'
+require 'rumale/base/cluster_analyzer'
+require 'rumale/pairwise_metric'
+require 'rumale/validation'
+module Rumale
+  module Clustering
+    # DBSCAN is a class that implements DBSCAN cluster analysis.
+    #
+    # @example
+    #   require 'rumale/clustering/dbscan'
+    #
+    #   analyzer = Rumale::Clustering::DBSCAN.new(eps: 0.5, min_samples: 5)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    # *Reference*
+    # - Ester, M., Kriegel, H-P., Sander, J., and Xu, X., "A density-based algorithm for discovering clusters in large spatial databases with noise," Proc. KDD' 96, pp. 266--231, 1996.
+    class DBSCAN < ::Rumale::Base::Estimator
+      include ::Rumale::Base::ClusterAnalyzer
+      # Return the core sample indices.
+      # @return [Numo::Int32] (shape: [n_core_samples])
+      attr_reader :core_sample_ids
+      # Return the cluster labels. The negative cluster label indicates that the point is noise.
+      # @return [Numo::Int32] (shape: [n_samples])
+      attr_reader :labels
+      # Create a new cluster analyzer with DBSCAN method.
+      #
+      # @param eps [Float] The radius of neighborhood.
+      # @param min_samples [Integer] The number of neighbor samples to be used for the criterion whether a point is a core point.
+      # @param metric [String] The metric to calculate the distances.
+      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
+      #   If metric is 'precomputed', the fit and fit_transform methods expect to be given a distance matrix.
+      def initialize(eps: 0.5, min_samples: 5, metric: 'euclidean')
+        super()
+        @params = {
+          eps: eps,
+          min_samples: min_samples,
+          metric: (metric == 'precomputed' ? 'precomputed' : 'euclidean')
+        }
+      end
+      # Analysis clusters with given training data.
+      #
+      # @overload fit(x) -> DBSCAN
+      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #     If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
+      #   @return [DBSCAN] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        raise ArgumentError, 'the input distance matrix should be square' if check_invalid_array_shape(x)
+        partial_fit(x)
+        self
+      end
+      # Analysis clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
+      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        raise ArgumentError, 'the input distance matrix should be square' if check_invalid_array_shape(x)
+        partial_fit(x)
+        labels
+      end
+      private
+      def check_invalid_array_shape(x)
+        @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
+      end
+      def partial_fit(x)
+        cluster_id = 0
+        metric_mat = calc_pairwise_metrics(x)
+        n_samples = metric_mat.shape[0]
+        @core_sample_ids = []
+        @labels = Numo::Int32.zeros(n_samples) - 2
+        n_samples.times do |query_id|
+          next if @labels[query_id] >= -1
+          cluster_id += 1 if expand_cluster(metric_mat, query_id, cluster_id)
+        end
+        @core_sample_ids = Numo::Int32[*@core_sample_ids.flatten]
+        nil
+      end
+      def calc_pairwise_metrics(x)
+        @params[:metric] == 'precomputed' ? x : ::Rumale::PairwiseMetric.euclidean_distance(x)
+      end
+      def expand_cluster(metric_mat, query_id, cluster_id)
+        target_ids = region_query(metric_mat[query_id, true])
+        if target_ids.size < @params[:min_samples]
+          @labels[query_id] = -1
+          false
+        else
+          @labels[target_ids] = cluster_id
+          @core_sample_ids.push(target_ids.dup)
+          target_ids.delete(query_id)
+          while (m = target_ids.shift)
+            neighbor_ids = region_query(metric_mat[m, true])
+            next if neighbor_ids.size < @params[:min_samples]
+            neighbor_ids.each do |n|
+              target_ids.push(n) if @labels[n] < -1
+              @labels[n] = cluster_id if @labels[n] <= -1
+            end
+          end
+          true
+        end
+      end
+      def region_query(metric_arr)
+        metric_arr.lt(@params[:eps]).where.to_a
+      end
+    end
+  end
+end

data/lib/rumale/clustering/gaussian_mixture.rb ADDED Viewed

@@ -0,0 +1,215 @@
+# frozen_string_literal: true
+require 'rumale/base/estimator'
+require 'rumale/base/cluster_analyzer'
+require 'rumale/utils'
+require 'rumale/validation'
+require 'rumale/clustering/k_means'
+module Rumale
+  module Clustering
+    # GaussianMixture is a class that implements cluster analysis with gaussian mixture model.
+    #
+    # @example
+    #   require 'rumale/clustering/gaussian_mixture'
+    #
+    #   analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 10, max_iter: 50)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    #   # If Numo::Linalg is installed, you can specify 'full' for the type of covariance option.
+    #   require 'numo/linalg/autoloader'
+    #   require 'rumale/clustering/gaussian_mixture'
+    #
+    #   analyzer = Rumale::Clustering::GaussianMixture.new(n_clusters: 10, max_iter: 50, covariance_type: 'full')
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    class GaussianMixture < ::Rumale::Base::Estimator # rubocop:disable Metrics/ClassLength
+      include ::Rumale::Base::ClusterAnalyzer
+      # Return the number of iterations performed until convergence (or until max_iter was reached).
+      # @return [Integer]
+      attr_reader :n_iter
+      # Return the weight of each cluster.
+      # @return [Numo::DFloat] (shape: [n_clusters])
+      attr_reader :weights
+      # Return the mean of each cluster.
+      # @return [Numo::DFloat] (shape: [n_clusters, n_features])
+      attr_reader :means
+      # Return the covariance of each cluster: the diagonal elements of the covariance matrix
+      # if covariance_type is 'diag', the whole covariance matrix if it is 'full'.
+      # @return [Numo::DFloat] (shape: [n_clusters, n_features] if 'diag', [n_clusters, n_features, n_features] if 'full')
+      attr_reader :covariances
+      # Create a new cluster analyzer with gaussian mixture model.
+      #
+      # @param n_clusters [Integer] The number of clusters.
+      # @param init [String] The initialization method for centroids ('random' or 'k-means++').
+      #   Any value other than 'random' selects 'k-means++'.
+      # @param covariance_type [String] The type of covariance parameter to be used ('diag' or 'full').
+      #   Any value other than 'full' selects 'diag'.
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param tol [Float] The tolerance of termination criterion.
+      # @param reg_covar [Float] The non-negative regularization to the diagonal of covariance.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      def initialize(n_clusters: 8, init: 'k-means++', covariance_type: 'diag',
+                     max_iter: 50, tol: 1.0e-4, reg_covar: 1.0e-6, random_seed: nil)
+        super()
+        @params = {
+          n_clusters: n_clusters,
+          init: (init == 'random' ? 'random' : 'k-means++'),
+          covariance_type: (covariance_type == 'full' ? 'full' : 'diag'),
+          max_iter: max_iter,
+          tol: tol,
+          reg_covar: reg_covar,
+          random_seed: random_seed || srand
+        }
+      end
+      # Analyze clusters with given training data.
+      #
+      # @overload fit(x) -> GaussianMixture
+      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #   @raise [RuntimeError] If covariance_type is 'full' and Numo::Linalg is not loaded.
+      #   @return [GaussianMixture] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        check_enable_linalg('fit')
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        n_samples = x.shape[0]
+        memberships = init_memberships(x)
+        @params[:max_iter].times do |t|
+          # t is a zero-based index, so t + 1 iterations have been started at this point.
+          @n_iter = t + 1
+          @weights = calc_weights(n_samples, memberships)
+          @means = calc_means(x, memberships)
+          @covariances = calc_covariances(x, @means, memberships, @params[:reg_covar], @params[:covariance_type])
+          new_memberships = calc_memberships(x, @weights, @means, @covariances, @params[:covariance_type])
+          # Stop once no membership value moved by more than the tolerance.
+          error = (memberships - new_memberships).abs.max
+          break if error <= @params[:tol]
+          memberships = new_memberships
+        end
+        self
+      end
+      # Predict cluster labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
+      # @raise [RuntimeError] If covariance_type is 'full' and Numo::Linalg is not loaded.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def predict(x)
+        check_enable_linalg('predict')
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        memberships = calc_memberships(x, @weights, @means, @covariances, @params[:covariance_type])
+        assign_cluster(memberships)
+      end
+      # Analyze clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      # @raise [RuntimeError] If covariance_type is 'full' and Numo::Linalg is not loaded.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        check_enable_linalg('fit_predict')
+        x = ::Rumale::Validation.check_convert_sample_array(x)
+        fit(x).predict(x)
+      end
+      private
+      # Pick, for every sample, the cluster with the largest membership value.
+      def assign_cluster(memberships)
+        n_clusters = memberships.shape[1]
+        # The row offsets (0, n_clusters, 2 * n_clusters, ...) are subtracted from the max_index result
+        # to obtain the column (cluster) index of each row's maximum.
+        memberships.max_index(axis: 1) - Numo::Int32[*0.step(memberships.size - 1, n_clusters)]
+      end
+      # Build the initial membership matrix from the labels of a KMeans analyzer created with max_iter: 0.
+      def init_memberships(x)
+        kmeans = ::Rumale::Clustering::KMeans.new(
+          n_clusters: @params[:n_clusters], init: @params[:init], max_iter: 0, random_seed: @params[:random_seed]
+        )
+        cluster_ids = kmeans.fit_predict(x)
+        # NOTE(review): binarize_labels presumably yields one indicator column per label -- confirm.
+        Numo::DFloat.cast(::Rumale::Utils.binarize_labels(cluster_ids))
+      end
+      # Compute the membership matrix (shape: [n_samples, n_clusters]) whose rows are normalized to sum to one.
+      def calc_memberships(x, weights, means, covars, covar_type)
+        n_samples = x.shape[0]
+        n_clusters = means.shape[0]
+        memberships = Numo::DFloat.zeros(n_samples, n_clusters)
+        n_clusters.times do |n|
+          centered = x - means[n, true]
+          covar = covar_type == 'full' ? covars[n, true, true] : covars[n, true]
+          memberships[true, n] = calc_unnormalized_membership(centered, weights[n], covar, covar_type)
+        end
+        memberships / memberships.sum(axis: 1).expand_dims(1)
+      end
+      # Compute the cluster weights as the column sums of the memberships divided by n_samples.
+      def calc_weights(n_samples, memberships)
+        memberships.sum(axis: 0) / n_samples
+      end
+      # Compute the membership-weighted mean of the samples for each cluster.
+      def calc_means(x, memberships)
+        memberships.transpose.dot(x) / memberships.sum(axis: 0).expand_dims(1)
+      end
+      # Compute the regularized covariances in the representation selected by covar_type.
+      def calc_covariances(x, means, memberships, reg_covar, covar_type)
+        if covar_type == 'full'
+          calc_full_covariances(x, means, reg_covar, memberships)
+        else
+          calc_diag_covariances(x, means, reg_covar, memberships)
+        end
+      end
+      # Compute the membership-weighted variance of each feature for each cluster, plus reg_covar.
+      def calc_diag_covariances(x, means, reg_covar, memberships)
+        n_clusters = means.shape[0]
+        diag_cov = Array.new(n_clusters) do |n|
+          centered = x - means[n, true]
+          memberships[true, n].dot(centered**2) / memberships[true, n].sum
+        end
+        Numo::DFloat.asarray(diag_cov) + reg_covar
+      end
+      # Compute the membership-weighted covariance matrix for each cluster, with reg_covar added to the diagonal.
+      def calc_full_covariances(x, means, reg_covar, memberships)
+        n_features = x.shape[1]
+        n_clusters = means.shape[0]
+        cov_mats = Numo::DFloat.zeros(n_clusters, n_features, n_features)
+        reg_mat = Numo::DFloat.eye(n_features) * reg_covar
+        n_clusters.times do |n|
+          centered = x - means[n, true]
+          members = memberships[true, n]
+          cov_mats[n, true, true] = reg_mat + (centered.transpose * members).dot(centered) / members.sum
+        end
+        cov_mats
+      end
+      # Compute weight * det(covar)**-0.5 * exp(-0.5 * d) per sample, where d is the quadratic form of the
+      # centered sample with the inverse covariance. The result is normalized by the caller.
+      def calc_unnormalized_membership(centered, weight, covar, covar_type)
+        inv_covar = calc_inv_covariance(covar, covar_type)
+        inv_sqrt_det_covar = calc_inv_sqrt_det_covariance(covar, covar_type)
+        distances = if covar_type == 'full'
+                      (centered.dot(inv_covar) * centered).sum(axis: 1)
+                    else
+                      (centered * inv_covar * centered).sum(axis: 1)
+                    end
+        weight * inv_sqrt_det_covar * Numo::NMath.exp(-0.5 * distances)
+      end
+      # Invert the covariance: matrix inverse for 'full', element-wise reciprocal for 'diag'.
+      def calc_inv_covariance(covar, covar_type)
+        if covar_type == 'full'
+          Numo::Linalg.inv(covar)
+        else
+          1.0 / covar
+        end
+      end
+      # Compute 1 / sqrt(det(covar)); for 'diag' the determinant is the product of the diagonal elements.
+      def calc_inv_sqrt_det_covariance(covar, covar_type)
+        if covar_type == 'full'
+          1.0 / Math.sqrt(Numo::Linalg.det(covar))
+        else
+          1.0 / Math.sqrt(covar.prod)
+        end
+      end
+      # Raise an error when the 'full' covariance type is requested but enable_linalg? reports false.
+      def check_enable_linalg(method_name)
+        return unless @params[:covariance_type] == 'full' && !enable_linalg?
+        raise "GaussianMixture##{method_name} requires Numo::Linalg when covariance_type is 'full' but that is not loaded."
+      end
+    end
+  end
+end