rumale-clustering 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +27 -0
- data/README.md +34 -0
- data/lib/rumale/clustering/dbscan.rb +126 -0
- data/lib/rumale/clustering/gaussian_mixture.rb +215 -0
- data/lib/rumale/clustering/hdbscan.rb +289 -0
- data/lib/rumale/clustering/k_means.rb +120 -0
- data/lib/rumale/clustering/k_medoids.rb +143 -0
- data/lib/rumale/clustering/mini_batch_k_means.rb +138 -0
- data/lib/rumale/clustering/power_iteration.rb +128 -0
- data/lib/rumale/clustering/single_linkage.rb +206 -0
- data/lib/rumale/clustering/snn.rb +75 -0
- data/lib/rumale/clustering/spectral_clustering.rb +120 -0
- data/lib/rumale/clustering/version.rb +10 -0
- data/lib/rumale/clustering.rb +15 -0
- metadata +93 -0
@@ -0,0 +1,128 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/cluster_analyzer'
|
5
|
+
require 'rumale/pairwise_metric'
|
6
|
+
require 'rumale/validation'
|
7
|
+
require 'rumale/clustering/k_means'
|
8
|
+
|
9
|
+
module Rumale
|
10
|
+
module Clustering
|
11
|
+
# PowerIteration is a class that implements power iteration clustering.
|
12
|
+
#
|
13
|
+
# @example
|
14
|
+
# require 'rumale/clustering/power_iteration'
|
15
|
+
#
|
16
|
+
# analyzer = Rumale::Clustering::PowerIteration.new(n_clusters: 10, gamma: 8.0, max_iter: 1000)
|
17
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
18
|
+
#
|
19
|
+
# *Reference*
|
20
|
+
# - Lin, F., and Cohen, W W., "Power Iteration Clustering," Proc. ICML'10, pp. 655--662, 2010.
|
21
|
+
class PowerIteration < ::Rumale::Base::Estimator
|
22
|
+
include ::Rumale::Base::ClusterAnalyzer
|
23
|
+
|
24
|
+
# Return the data in embedded space.
|
25
|
+
# @return [Numo::DFloat] (shape: [n_samples])
|
26
|
+
attr_reader :embedding
|
27
|
+
|
28
|
+
# Return the cluster labels.
|
29
|
+
# @return [Numo::Int32] (shape: [n_samples])
|
30
|
+
attr_reader :labels
|
31
|
+
|
32
|
+
# Return the number of iterations run for optimization
|
33
|
+
# @return [Integer]
|
34
|
+
attr_reader :n_iter
|
35
|
+
|
36
|
+
# Create a new cluster analyzer with power iteration clustering.
|
37
|
+
#
|
38
|
+
# @param n_clusters [Integer] The number of clusters.
|
39
|
+
# @param affinity [String] The representation of affinity matrix ('rbf' or 'precomputed').
|
40
|
+
# @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
|
41
|
+
# If affinity = 'precomputed', this parameter is ignored.
|
42
|
+
# @param init [String] The initialization method for centroids of K-Means clustering ('random' or 'k-means++').
|
43
|
+
# @param max_iter [Integer] The maximum number of iterations.
|
44
|
+
# @param tol [Float] The tolerance of termination criterion.
|
45
|
+
# @param eps [Float] A small value close to zero to avoid zero division error.
|
46
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
47
|
+
def initialize(n_clusters: 8, affinity: 'rbf', gamma: nil, init: 'k-means++',
               max_iter: 1000, tol: 1.0e-8, eps: 1.0e-5, random_seed: nil)
  super()
  # Any init value other than 'random' is normalized to 'k-means++'.
  init_method = init == 'random' ? 'random' : 'k-means++'
  # Without an explicit seed, the value returned by Kernel#srand is stored instead.
  seed = random_seed || srand
  @params = { n_clusters: n_clusters, affinity: affinity, gamma: gamma, init: init_method,
              max_iter: max_iter, tol: tol, eps: eps, random_seed: seed }
end
|
61
|
+
|
62
|
+
# Analyze clusters with given training data.
|
63
|
+
#
|
64
|
+
# @overload fit(x) -> PowerIteration
|
65
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
66
|
+
# If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
|
67
|
+
# @return [PowerIteration] The learned cluster analyzer itself.
|
68
|
+
def fit(x, _y = nil)
  samples = ::Rumale::Validation.check_convert_sample_array(x)
  # check_invalid_array_shape is true only for a non-square precomputed matrix.
  if check_invalid_array_shape(samples)
    raise ArgumentError, 'the input affinity matrix should be square'
  end

  # fit_predict does the actual work; its return value is discarded here.
  fit_predict(samples)
  self
end
|
75
|
+
|
76
|
+
# Analyze clusters and assign samples to clusters.
|
77
|
+
#
|
78
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
79
|
+
# If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
|
80
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
81
|
+
def fit_predict(x)
  x = ::Rumale::Validation.check_convert_sample_array(x)
  raise ArgumentError, 'the input affinity matrix should be square' if check_invalid_array_shape(x)

  # embedded_space overwrites the diagonal of the matrix it is given, so a
  # precomputed matrix is copied first to keep the caller's array intact.
  # The rbf kernel matrix is freshly computed and needs no copy.
  affinity_mat = if @params[:affinity] == 'precomputed'
                   x.dup
                 else
                   ::Rumale::PairwiseMetric.rbf_kernel(x, nil, @params[:gamma])
                 end
  # The tolerance handed to the power iteration is tol divided by the number of samples.
  scaled_tol = @params[:tol].fdiv(affinity_mat.shape[0])
  @embedding, @n_iter = embedded_space(affinity_mat, @params[:max_iter], scaled_tol)
  @labels = line_kmeans_clustering(@embedding)
end
|
89
|
+
|
90
|
+
private
|
91
|
+
|
92
|
+
def check_invalid_array_shape(x)
  # Only a precomputed affinity matrix is required to be square.
  return false unless @params[:affinity] == 'precomputed'

  n_rows, n_cols = x.shape
  n_rows != n_cols
end
|
95
|
+
|
96
|
+
def embedded_space(affinity_mat, max_iter, tol)
  # Self-affinities are discarded; note that this writes into the given matrix.
  affinity_mat[affinity_mat.diag_indices] = 0.0

  degrees = affinity_mat.sum(axis: 1)
  # Each row of the affinity matrix is divided by its row sum.
  transition_mat = (1.0 / degrees).diag.dot(affinity_mat)

  current = degrees / degrees.sum
  prev_delta = Numo::DFloat.ones(current.shape[0])
  n_steps = 0
  max_iter.times do |step|
    n_steps = step + 1
    candidate = transition_mat.dot(current)
    candidate /= candidate.abs.sum
    delta = (candidate - current).abs
    # Stop once the per-element change between successive deltas is within tol.
    # The candidate computed in the stopping step is not adopted.
    break if (delta - prev_delta).abs.max <= tol

    current = candidate
    prev_delta = delta
  end

  [current, n_steps]
end
|
119
|
+
|
120
|
+
def line_kmeans_clustering(vec)
  kmeans = ::Rumale::Clustering::KMeans.new(
    n_clusters: @params[:n_clusters],
    init: @params[:init],
    max_iter: @params[:max_iter],
    tol: @params[:tol],
    random_seed: @params[:random_seed]
  )
  # A new axis is inserted at position 1 before the vector is handed to K-Means.
  column = vec.expand_dims(1)
  kmeans.fit_predict(column)
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
@@ -0,0 +1,206 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/cluster_analyzer'
|
5
|
+
require 'rumale/pairwise_metric'
|
6
|
+
require 'rumale/validation'
|
7
|
+
|
8
|
+
module Rumale
|
9
|
+
module Clustering
|
10
|
+
# SingleLinkage is a class that implements hierarchical cluster analysis with single linkage method.
|
11
|
+
# This class is used internally for HDBSCAN.
|
12
|
+
#
|
13
|
+
# @example
|
14
|
+
# require 'rumale/clustering/single_linkage'
|
15
|
+
#
|
16
|
+
# analyzer = Rumale::Clustering::SingleLinkage.new(n_clusters: 2)
|
17
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
18
|
+
#
|
19
|
+
# *Reference*
|
20
|
+
# - Mullner, D., "Modern hierarchical, agglomerative clustering algorithms," arXiv:1109.2378, 2011.
|
21
|
+
class SingleLinkage < ::Rumale::Base::Estimator
|
22
|
+
include ::Rumale::Base::ClusterAnalyzer
|
23
|
+
|
24
|
+
# Return the cluster labels.
|
25
|
+
# @return [Numo::Int32] (shape: [n_samples])
|
26
|
+
attr_reader :labels
|
27
|
+
|
28
|
+
# Return the hierarchical structure.
|
29
|
+
# @return [Array<SingleLinkage::Node>] (shape: [n_samples - 1])
|
30
|
+
attr_reader :hierarchy
|
31
|
+
|
32
|
+
# Create a new cluster analyzer with single linkage algorithm.
|
33
|
+
#
|
34
|
+
# @param n_clusters [Integer] The number of clusters.
|
35
|
+
# @param metric [String] The metric to calculate the distances.
|
36
|
+
# If metric is 'euclidean', Euclidean distance is calculated for distance between points.
|
37
|
+
# If metric is 'precomputed', the fit and fit_predict methods expect to be given a distance matrix.
|
38
|
+
def initialize(n_clusters: 2, metric: 'euclidean')
  super()
  # Every metric name other than 'precomputed' is stored as 'euclidean'.
  metric_name = metric == 'precomputed' ? 'precomputed' : 'euclidean'
  @params = { n_clusters: n_clusters, metric: metric_name }
end
|
45
|
+
|
46
|
+
# Analyze clusters with given training data.
|
47
|
+
#
|
48
|
+
# @overload fit(x) -> SingleLinkage
|
49
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
50
|
+
# If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
|
51
|
+
# @return [SingleLinkage] The learned cluster analyzer itself.
|
52
|
+
def fit(x, _y = nil)
  samples = ::Rumale::Validation.check_convert_sample_array(x)
  # A precomputed distance matrix is rejected unless it is square.
  invalid = check_invalid_array_shape(samples)
  raise ArgumentError, 'the input distance matrix should be square' if invalid

  # The labels returned by fit_predict are dropped; the analyzer itself is returned.
  fit_predict(samples)
  self
end
|
59
|
+
|
60
|
+
# Analyze clusters and assign samples to clusters.
|
61
|
+
#
|
62
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
|
63
|
+
# If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
|
64
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
65
|
+
def fit_predict(x)
  samples = ::Rumale::Validation.check_convert_sample_array(x)
  if check_invalid_array_shape(samples)
    raise ArgumentError, 'the input distance matrix should be square'
  end

  # A precomputed input is used as the distance matrix as is;
  # otherwise pairwise Euclidean distances are computed from the samples.
  distance_mat = if @params[:metric] == 'precomputed'
                   samples
                 else
                   ::Rumale::PairwiseMetric.euclidean_distance(samples)
                 end
  @labels = partial_fit(distance_mat)
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def check_invalid_array_shape(x)
  # The shape is only constrained when a precomputed distance matrix is expected.
  if @params[:metric] == 'precomputed'
    x.shape[0] != x.shape[1]
  else
    false
  end
end
|
78
|
+
|
79
|
+
# @!visibility private
|
80
|
+
class UnionFind
  # Sets up n singleton sets labelled 0...n.
  def initialize(n)
    # Slots 0...n hold the initial elements and slots n...(2n - 1) are reserved
    # for the sets created by successive unions. A parent of -1 marks a root.
    @parent = Numo::Int32.zeros(2 * n - 1) - 1
    @size = Numo::Int32.hstack([Numo::Int32.ones(n), Numo::Int32.zeros(n - 1)])
    @next_label = n
  end

  # @!visibility private
  # Merges the sets rooted at x and y under a newly allocated label
  # and returns the number of elements in the merged set.
  def union(x, y)
    size = @size[x] + @size[y]
    @parent[x] = @next_label
    @parent[y] = @next_label
    @size[@next_label] = size
    @next_label += 1
    size
  end

  # @!visibility private
  # Returns the root label of the set containing x.
  def find(x)
    root = x
    root = @parent[root] while @parent[root] != -1
    # Path compression: re-point every node on the walked path at the root.
    # The walk stops at the root itself, so the root's -1 marker is never
    # followed and no slot outside the path is written.
    node = x
    until node == root
      up = @parent[node]
      @parent[node] = root
      node = up
    end
    root
  end
end
|
108
|
+
|
109
|
+
# @!visibility private
|
110
|
+
class Node
  # @!visibility private
  attr_reader :x, :y, :weight, :n_elements

  # @!visibility private
  # Holds one edge / merge record: two node ids, the weight between them,
  # and the number of elements covered (0 unless given).
  def initialize(x:, y:, weight:, n_elements: 0)
    @x = x
    @y = y
    @weight = weight
    @n_elements = n_elements
  end

  # @!visibility private
  # Compares all four attributes. Objects that do not expose every attribute
  # (nil, numbers, ...) are simply unequal instead of raising NoMethodError.
  def ==(other)
    attrs = %i[x y weight n_elements]
    return false unless attrs.all? { |name| other.respond_to?(name) }

    x == other.x && y == other.y && weight == other.weight && n_elements == other.n_elements
  end
end
|
127
|
+
|
128
|
+
private_constant :UnionFind, :Node
|
129
|
+
|
130
|
+
def partial_fit(distance_mat)
  # The merge hierarchy is kept on the instance; the flat labels are returned.
  @hierarchy = single_linkage_hierarchy(minimum_spanning_tree(distance_mat))
  flatten(@hierarchy, @params[:n_clusters])
end
|
135
|
+
|
136
|
+
def minimum_spanning_tree(complete_graph)
  n_vertices = complete_graph.shape[0]
  # Smallest known weight from the visited vertices to each unvisited vertex.
  best_weights = Numo::DFloat.zeros(n_vertices) + Float::INFINITY
  # Labels of the vertices that have not been visited yet.
  remaining = Numo::Int32.new(n_vertices).seq
  frontier = 0
  edges = Array.new(n_vertices - 1) do
    source = frontier
    keep = remaining.ne(source)
    remaining = remaining[keep]
    best_weights = Numo::DFloat.minimum(best_weights[keep], complete_graph[source, remaining])
    # The unvisited vertex with the smallest weight is visited next.
    frontier = remaining[best_weights.min_index]
    Node.new(x: source, y: frontier, weight: best_weights.min)
  end
  # Edges are returned in ascending order of weight.
  edges.sort! { |lhs, rhs| lhs.weight <=> rhs.weight }
end
|
153
|
+
|
154
|
+
def single_linkage_hierarchy(mst)
  # One more node than edges.
  uf = UnionFind.new(mst.size + 1)
  mst.map do |edge|
    # The two current roots are ordered so that the smaller label is stored as x.
    x_root, y_root = [uf.find(edge.x), uf.find(edge.y)].minmax
    merged_size = uf.union(x_root, y_root)
    Node.new(x: x_root, y: y_root, weight: edge.weight, n_elements: merged_size)
  end
end
|
167
|
+
|
168
|
+
def descedent_ids(hierarchy_, start_node)
  # Ids below n_leaves are samples; larger ids index into hierarchy_ after subtracting n_leaves.
  n_leaves = hierarchy_.size + 1
  return [start_node] if start_node < n_leaves

  leaves = []
  stack = [start_node]
  until stack.empty?
    node_id = stack.pop
    if node_id < n_leaves
      leaves << node_id
    else
      merged = hierarchy_[node_id - n_leaves]
      # y is pushed last and is therefore expanded before x.
      stack.push(merged.x, merged.y)
    end
  end
  leaves
end
|
188
|
+
|
189
|
+
def flatten(hierarchy_, n_clusters)
  n_samples = hierarchy_.size + 1
  # With fewer than two requested clusters, or with an empty hierarchy
  # (a single sample), every sample belongs to cluster 0.
  return Numo::Int32.zeros(n_samples) if n_clusters < 2 || hierarchy_.empty?

  # Node ids are kept negated so that an ascending sort puts the largest id first.
  # The list starts with the id one above the larger child of the last merge.
  nodes = [-([hierarchy_[-1].x, hierarchy_[-1].y].max + 1)]
  (n_clusters - 1).times do
    # When the largest remaining id is a sample id, no merged node is left to
    # split. This happens when more clusters are requested than there are
    # samples; stopping here avoids indexing the hierarchy with a negative offset.
    break if -nodes[0] < n_samples

    children = hierarchy_[-nodes[0] - n_samples]
    nodes.push(-children.x)
    nodes.push(-children.y)
    # After sorting, the first element is the node that was just split; drop it.
    nodes.sort!.shift
  end
  res = Numo::Int32.zeros(n_samples)
  nodes.each_with_index { |sid, cluster_id| res[descedent_ids(hierarchy_, -sid)] = cluster_id }
  res
end
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/pairwise_metric'
|
4
|
+
require 'rumale/clustering/dbscan'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
module Clustering
|
8
|
+
# SNN is a class that implements Shared Nearest Neighbor cluster analysis.
|
9
|
+
# The SNN method is a variation of DBSCAN that uses similarity based on k-nearest neighbors as a metric.
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# require 'rumale/clustering/snn'
|
13
|
+
#
|
14
|
+
# analyzer = Rumale::Clustering::SNN.new(n_neighbors: 10, eps: 5, min_samples: 5)
|
15
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
16
|
+
#
|
17
|
+
# *Reference*
|
18
|
+
# - Ertoz, L., Steinbach, M., and Kumar, V., "Finding Clusters of Different Sizes, Shapes, and Densities in Noisy, High Dimensional Data," Proc. SDM'03, pp. 47--58, 2003.
|
19
|
+
# - Houle, M E., Kriegel, H-P., Kroger, P., Schubert, E., and Zimek, A., "Can Shared-Neighbor Distances Defeat the Curse of Dimensionality?," Proc. SSDBM'10, pp. 482--500, 2010.
|
20
|
+
class SNN < DBSCAN
|
21
|
+
# Create a new cluster analyzer with Shared Nearest Neighbor method.
|
22
|
+
#
|
23
|
+
# @param n_neighbors [Integer] The number of neighbors to be used for finding k-nearest neighbors.
|
24
|
+
# @param eps [Integer] The threshold value for finding connected components based on similarity.
|
25
|
+
# @param min_samples [Integer] The number of neighbor samples to be used for the criterion whether a point is a core point.
|
26
|
+
# @param metric [String] The metric to calculate the distances.
|
27
|
+
# If metric is 'euclidean', Euclidean distance is calculated for distance between points.
|
28
|
+
# If metric is 'precomputed', the fit and fit_predict methods expect to be given a distance matrix.
|
29
|
+
def initialize(n_neighbors: 10, eps: 5, min_samples: 5, metric: 'euclidean') # rubocop:disable Lint/MissingSuper
  # The parent initializer is not called here; @params is built from scratch.
  # Metric names other than 'precomputed' are stored as 'euclidean'.
  metric_name = metric == 'precomputed' ? 'precomputed' : 'euclidean'
  @params = { n_neighbors: n_neighbors, eps: eps, min_samples: min_samples, metric: metric_name }
end
|
37
|
+
|
38
|
+
# Analyze clusters with given training data.
|
39
|
+
#
|
40
|
+
# @overload fit(x) -> SNN
|
41
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
42
|
+
# If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
|
43
|
+
# @return [SNN] The learned cluster analyzer itself.
|
44
|
+
def fit(x, _y = nil)
  # Nothing is customized here: the bare super forwards both arguments to the parent's fit.
  super
end
|
47
|
+
|
48
|
+
# Analyze clusters and assign samples to clusters.
|
49
|
+
#
|
50
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
|
51
|
+
# If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
|
52
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
53
|
+
def fit_predict(x) # rubocop:disable Lint/UselessMethodDefinition
  # Pure delegation to the parent's fit_predict with the same argument.
  super(x)
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
def calc_pairwise_metrics(x)
  distances = if @params[:metric] == 'precomputed'
                x
              else
                ::Rumale::PairwiseMetric.euclidean_distance(x)
              end
  n_points = distances.shape[0]
  # Row n gets a 1 in the columns of the n_neighbors smallest distances of sample n.
  knn_graph = Numo::DFloat.zeros(n_points, n_points)
  n_points.times do |row|
    nearest = distances[row, true].sort_index[0...@params[:n_neighbors]]
    knn_graph[row, nearest] = 1
  end
  # Entry (i, j) of the product is the number of columns marked in both row i and row j.
  knn_graph.dot(knn_graph.transpose)
end
|
69
|
+
|
70
|
+
def region_query(similarity_arr)
  # Positions whose similarity is strictly greater than eps, as a plain Ruby array.
  above_threshold = similarity_arr.gt(@params[:eps])
  above_threshold.where.to_a
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/cluster_analyzer'
|
5
|
+
require 'rumale/pairwise_metric'
|
6
|
+
require 'rumale/utils'
|
7
|
+
require 'rumale/validation'
|
8
|
+
require 'rumale/clustering/k_means'
|
9
|
+
|
10
|
+
module Rumale
|
11
|
+
module Clustering
|
12
|
+
# SpectralClustering is a class that implements the normalized spectral clustering.
|
13
|
+
#
|
14
|
+
# @example
|
15
|
+
# require 'numo/linalg/autoloader'
|
16
|
+
# require 'rumale/clustering/spectral_clustering'
|
17
|
+
#
|
18
|
+
# analyzer = Rumale::Clustering::SpectralClustering.new(n_clusters: 10, gamma: 8.0)
|
19
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
20
|
+
#
|
21
|
+
# *Reference*
|
22
|
+
# - Ng, A Y., Jordan, M I., and Weiss, Y., "On Spectral Clustering: Analysis and an algorithm," Proc. NIPS'01, pp. 849--856, 2001.
|
23
|
+
# - von Luxburg, U., "A tutorial on spectral clustering," Statistics and Computing, Vol. 17 (4), pp. 395--416, 2007.
|
24
|
+
class SpectralClustering < ::Rumale::Base::Estimator
|
25
|
+
include ::Rumale::Base::ClusterAnalyzer
|
26
|
+
|
27
|
+
# Return the data in embedded space.
|
28
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_clusters])
|
29
|
+
attr_reader :embedding
|
30
|
+
|
31
|
+
# Return the cluster labels.
|
32
|
+
# @return [Numo::Int32] (shape: [n_samples])
|
33
|
+
attr_reader :labels
|
34
|
+
|
35
|
+
# Create a new cluster analyzer with normalized spectral clustering.
|
36
|
+
#
|
37
|
+
# @param n_clusters [Integer] The number of clusters.
|
38
|
+
# @param affinity [String] The representation of affinity matrix ('rbf' or 'precomputed').
|
39
|
+
# If affinity = 'rbf', the class performs the normalized spectral clustering with the fully connected graph weighted by rbf kernel.
|
40
|
+
# @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
|
41
|
+
# If affinity = 'precomputed', this parameter is ignored.
|
42
|
+
# @param init [String] The initialization method for centroids of K-Means clustering ('random' or 'k-means++').
|
43
|
+
# @param max_iter [Integer] The maximum number of iterations for K-Means clustering.
|
44
|
+
# @param tol [Float] The tolerance of termination criterion for K-Means clustering.
|
45
|
+
# @param random_seed [Integer] The seed value used to initialize the random generator.
|
46
|
+
def initialize(n_clusters: 2, affinity: 'rbf', gamma: nil, init: 'k-means++', max_iter: 10, tol: 1.0e-8, random_seed: nil)
  super()
  # Only 'random' is kept as is; every other init value becomes 'k-means++'.
  init_method = init == 'random' ? 'random' : 'k-means++'
  # When no seed is given, the value returned by Kernel#srand is stored.
  seed = random_seed || srand
  @params = { n_clusters: n_clusters, affinity: affinity, gamma: gamma, init: init_method,
              max_iter: max_iter, tol: tol, random_seed: seed }
end
|
58
|
+
|
59
|
+
# Analyze clusters with given training data.
|
60
|
+
# To execute this method, Numo::Linalg must be loaded.
|
61
|
+
#
|
62
|
+
# @overload fit(x) -> SpectralClustering
|
63
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
64
|
+
# If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
|
65
|
+
# @return [SpectralClustering] The learned cluster analyzer itself.
|
66
|
+
def fit(x, _y = nil)
  samples = ::Rumale::Validation.check_convert_sample_array(x)
  # The shape check runs before the Numo::Linalg availability check.
  if check_invalid_array_shape(samples)
    raise ArgumentError, 'the input affinity matrix should be square'
  end

  linalg_ready = enable_linalg?(warning: false)
  raise 'SpectralClustering#fit requires Numo::Linalg but that is not loaded' unless linalg_ready

  # The labels returned by fit_predict are dropped; the analyzer itself is returned.
  fit_predict(samples)
  self
end
|
75
|
+
|
76
|
+
# Analyze clusters and assign samples to clusters.
|
77
|
+
# To execute this method, Numo::Linalg must be loaded.
|
78
|
+
#
|
79
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
|
80
|
+
# If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
|
81
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
|
82
|
+
def fit_predict(x)
  x = ::Rumale::Validation.check_convert_sample_array(x)
  raise ArgumentError, 'the input affinity matrix should be square' if check_invalid_array_shape(x)

  unless enable_linalg?(warning: false)
    raise 'SpectralClustering#fit_predict requires Numo::Linalg but that is not loaded'
  end

  # The option is stored under :affinity (there is no :metric key in @params),
  # so that is the key that decides whether x already is the affinity matrix.
  # embedded_space overwrites the diagonal of the matrix it is given, hence a
  # precomputed matrix is copied to keep the caller's array intact.
  affinity_mat = if @params[:affinity] == 'precomputed'
                   x.dup
                 else
                   ::Rumale::PairwiseMetric.rbf_kernel(x, nil, @params[:gamma])
                 end
  @embedding = embedded_space(affinity_mat, @params[:n_clusters])
  # K-Means runs on the l2-normalized embedding; @embedding keeps the raw one.
  normalized_embedding = ::Rumale::Utils.normalize(@embedding, 'l2')
  @labels = kmeans_clustering(normalized_embedding)
end
|
95
|
+
|
96
|
+
private
|
97
|
+
|
98
|
+
def check_invalid_array_shape(x)
  # With any affinity other than 'precomputed' the shape is never rejected.
  precomputed = @params[:affinity] == 'precomputed'
  precomputed && x.shape[0] != x.shape[1]
end
|
101
|
+
|
102
|
+
def embedded_space(affinity_mat, n_clusters)
  # Self-affinities are discarded; note that this writes into the given matrix.
  affinity_mat[affinity_mat.diag_indices] = 0.0
  # Rows and columns are both scaled by the inverse square root of the row sums.
  inv_sqrt_degrees = 1.0 / Numo::NMath.sqrt(affinity_mat.sum(axis: 1))
  scaling = inv_sqrt_degrees.diag
  normalized_mat = scaling.dot(affinity_mat).dot(scaling)

  n_samples = affinity_mat.shape[0]
  # Request the last n_clusters entries of the eigenvalue index range.
  top_range = (n_samples - n_clusters)...n_samples
  _eig_vals, eig_vecs = Numo::Linalg.eigh(normalized_mat, vals_range: top_range)
  # Reverse the column order and return a copy of the reversed result.
  eig_vecs.reverse(1).dup
end
|
111
|
+
|
112
|
+
def kmeans_clustering(x)
  # K-Means is configured from the same parameters that were given to this analyzer.
  kmeans_opts = {
    n_clusters: @params[:n_clusters],
    init: @params[:init],
    max_iter: @params[:max_iter],
    tol: @params[:tol],
    random_seed: @params[:random_seed]
  }
  ::Rumale::Clustering::KMeans.new(**kmeans_opts).fit_predict(x)
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'numo/narray'
|
4
|
+
|
5
|
+
require_relative 'clustering/dbscan'
|
6
|
+
require_relative 'clustering/gaussian_mixture'
|
7
|
+
require_relative 'clustering/hdbscan'
|
8
|
+
require_relative 'clustering/k_means'
|
9
|
+
require_relative 'clustering/k_medoids'
|
10
|
+
require_relative 'clustering/mini_batch_k_means'
|
11
|
+
require_relative 'clustering/power_iteration'
|
12
|
+
require_relative 'clustering/single_linkage'
|
13
|
+
require_relative 'clustering/snn'
|
14
|
+
require_relative 'clustering/spectral_clustering'
|
15
|
+
require_relative 'clustering/version'
|
metadata
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rumale-clustering
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.24.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- yoshoku
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2022-12-31 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: numo-narray
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.9.1
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.9.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rumale-core
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.24.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.24.0
|
41
|
+
description: |
|
42
|
+
Rumale::Clustering provides cluster analysis algorithms,
|
43
|
+
such as K-Means, Gaussian Mixture Model, DBSCAN, and Spectral Clustering,
|
44
|
+
with Rumale interface.
|
45
|
+
email:
|
46
|
+
- yoshoku@outlook.com
|
47
|
+
executables: []
|
48
|
+
extensions: []
|
49
|
+
extra_rdoc_files: []
|
50
|
+
files:
|
51
|
+
- LICENSE.txt
|
52
|
+
- README.md
|
53
|
+
- lib/rumale/clustering.rb
|
54
|
+
- lib/rumale/clustering/dbscan.rb
|
55
|
+
- lib/rumale/clustering/gaussian_mixture.rb
|
56
|
+
- lib/rumale/clustering/hdbscan.rb
|
57
|
+
- lib/rumale/clustering/k_means.rb
|
58
|
+
- lib/rumale/clustering/k_medoids.rb
|
59
|
+
- lib/rumale/clustering/mini_batch_k_means.rb
|
60
|
+
- lib/rumale/clustering/power_iteration.rb
|
61
|
+
- lib/rumale/clustering/single_linkage.rb
|
62
|
+
- lib/rumale/clustering/snn.rb
|
63
|
+
- lib/rumale/clustering/spectral_clustering.rb
|
64
|
+
- lib/rumale/clustering/version.rb
|
65
|
+
homepage: https://github.com/yoshoku/rumale
|
66
|
+
licenses:
|
67
|
+
- BSD-3-Clause
|
68
|
+
metadata:
|
69
|
+
homepage_uri: https://github.com/yoshoku/rumale
|
70
|
+
source_code_uri: https://github.com/yoshoku/rumale/tree/main/rumale-clustering
|
71
|
+
changelog_uri: https://github.com/yoshoku/rumale/blob/main/CHANGELOG.md
|
72
|
+
documentation_uri: https://yoshoku.github.io/rumale/doc/
|
73
|
+
rubygems_mfa_required: 'true'
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options: []
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
requirements: []
|
89
|
+
rubygems_version: 3.3.26
|
90
|
+
signing_key:
|
91
|
+
specification_version: 4
|
92
|
+
summary: Rumale::Clustering provides cluster analysis algorithms with Rumale interface.
|
93
|
+
test_files: []
|