rumale-clustering 0.24.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +27 -0
- data/README.md +34 -0
- data/lib/rumale/clustering/dbscan.rb +126 -0
- data/lib/rumale/clustering/gaussian_mixture.rb +215 -0
- data/lib/rumale/clustering/hdbscan.rb +289 -0
- data/lib/rumale/clustering/k_means.rb +120 -0
- data/lib/rumale/clustering/k_medoids.rb +143 -0
- data/lib/rumale/clustering/mini_batch_k_means.rb +138 -0
- data/lib/rumale/clustering/power_iteration.rb +128 -0
- data/lib/rumale/clustering/single_linkage.rb +206 -0
- data/lib/rumale/clustering/snn.rb +75 -0
- data/lib/rumale/clustering/spectral_clustering.rb +120 -0
- data/lib/rumale/clustering/version.rb +10 -0
- data/lib/rumale/clustering.rb +15 -0
- metadata +93 -0
@@ -0,0 +1,128 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/cluster_analyzer'
|
5
|
+
require 'rumale/pairwise_metric'
|
6
|
+
require 'rumale/validation'
|
7
|
+
require 'rumale/clustering/k_means'
|
8
|
+
|
9
|
+
module Rumale
|
10
|
+
module Clustering
|
11
|
+
# PowerIteration is a class that implements power iteration clustering.
|
12
|
+
#
|
13
|
+
# @example
|
14
|
+
# require 'rumale/clustering/power_iteration'
|
15
|
+
#
|
16
|
+
# analyzer = Rumale::Clustering::PowerIteration.new(n_clusters: 10, gamma: 8.0, max_iter: 1000)
|
17
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
18
|
+
#
|
19
|
+
# *Reference*
|
20
|
+
# - Lin, F., and Cohen, W W., "Power Iteration Clustering," Proc. ICML'10, pp. 655--662, 2010.
|
21
|
+
class PowerIteration < ::Rumale::Base::Estimator
  include ::Rumale::Base::ClusterAnalyzer

  # Return the data in embedded space.
  # @return [Numo::DFloat] (shape: [n_samples])
  attr_reader :embedding

  # Return the cluster labels.
  # @return [Numo::Int32] (shape: [n_samples])
  attr_reader :labels

  # Return the number of iterations run for optimization
  # @return [Integer]
  attr_reader :n_iter

  # Create a new cluster analyzer with power iteration clustering.
  #
  # @param n_clusters [Integer] The number of clusters.
  # @param affinity [String] The representation of affinity matrix ('rbf' or 'precomputed').
  # @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
  #   If affinity = 'precomputed', this parameter is ignored.
  # @param init [String] The initialization method for centroids of K-Means clustering ('random' or 'k-means++').
  #   Any value other than 'random' is treated as 'k-means++'.
  # @param max_iter [Integer] The maximum number of iterations.
  #   It bounds the power iteration and is also passed to the K-Means step.
  # @param tol [Float] The tolerance of termination criterion.
  #   The power iteration uses tol / n_samples; the K-Means step uses tol as given.
  # @param eps [Float] A small value close to zero to avoid zero division error.
  #   NOTE(review): this value is stored in the parameters but is not read anywhere in this class.
  # @param random_seed [Integer] The seed value using to initialize the random generator.
  def initialize(n_clusters: 8, affinity: 'rbf', gamma: nil, init: 'k-means++',
                 max_iter: 1000, tol: 1.0e-8, eps: 1.0e-5, random_seed: nil)
    super()
    @params = {
      n_clusters: n_clusters,
      affinity: affinity,
      gamma: gamma,
      init: (init == 'random' ? 'random' : 'k-means++'),
      max_iter: max_iter,
      tol: tol,
      eps: eps,
      random_seed: (random_seed || srand)
    }
  end

  # Analysis clusters with given training data.
  #
  # @overload fit(x) -> PowerIteration
  #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
  #     If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
  #   @return [PowerIteration] The learned cluster analyzer itself.
  # @raise [ArgumentError] If the affinity is 'precomputed' and x is not square.
  def fit(x, _y = nil)
    # fit_predict performs the input conversion and the shape check itself.
    fit_predict(x)
    self
  end

  # Analysis clusters and assign samples to clusters.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
  #   If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
  # @raise [ArgumentError] If the affinity is 'precomputed' and x is not square.
  def fit_predict(x)
    x = ::Rumale::Validation.check_convert_sample_array(x)
    raise ArgumentError, 'the input affinity matrix should be square' if check_invalid_array_shape(x)

    affinity_mat = if @params[:affinity] == 'precomputed'
                     # embedded_space zeroes the diagonal in place and x may be the caller's own
                     # object, so work on a copy to leave the given matrix untouched.
                     x.dup
                   else
                     ::Rumale::PairwiseMetric.rbf_kernel(x, nil, @params[:gamma])
                   end
    @embedding, @n_iter = embedded_space(affinity_mat, @params[:max_iter], @params[:tol].fdiv(affinity_mat.shape[0]))
    @labels = line_kmeans_clustering(@embedding)
  end

  private

  # Tell whether a precomputed affinity matrix has an invalid (non-square) shape.
  def check_invalid_array_shape(x)
    @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]
  end

  # Run the power iteration on the row-normalized affinity matrix.
  # The diagonal of affinity_mat is overwritten with zeros.
  #
  # @return [Array] The one-dimensional embedding and the number of iterations performed.
  def embedded_space(affinity_mat, max_iter, tol)
    affinity_mat[affinity_mat.diag_indices] = 0.0

    # Divide each row by its sum (the degree of the sample).
    degrees = affinity_mat.sum(axis: 1)
    normalized_affinity_mat = (1.0 / degrees).diag.dot(affinity_mat)

    iters = 0
    # Start from the degree vector scaled to sum to one.
    embedded_line = degrees / degrees.sum
    n_samples = embedded_line.shape[0]
    error = Numo::DFloat.ones(n_samples)
    max_iter.times do |t|
      iters = t + 1
      new_embedded_line = normalized_affinity_mat.dot(embedded_line)
      new_embedded_line /= new_embedded_line.abs.sum
      new_error = (new_embedded_line - embedded_line).abs
      # Stop when the per-sample change itself has stopped changing; the vector
      # from the previous step is the one that gets returned in that case.
      break if (new_error - error).abs.max <= tol

      embedded_line = new_embedded_line
      error = new_error
    end

    [embedded_line, iters]
  end

  # Cluster the one-dimensional embedding with K-Means, treating it as a single-feature data set.
  def line_kmeans_clustering(vec)
    ::Rumale::Clustering::KMeans.new(
      n_clusters: @params[:n_clusters], init: @params[:init],
      max_iter: @params[:max_iter], tol: @params[:tol], random_seed: @params[:random_seed]
    ).fit_predict(vec.expand_dims(1))
  end
end
|
127
|
+
end
|
128
|
+
end
|
@@ -0,0 +1,206 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/cluster_analyzer'
|
5
|
+
require 'rumale/pairwise_metric'
|
6
|
+
require 'rumale/validation'
|
7
|
+
|
8
|
+
module Rumale
|
9
|
+
module Clustering
|
10
|
+
# SingleLinkage is a class that implements hierarchical cluster analysis with single linkage method.
|
11
|
+
# This class is used internally for HDBSCAN.
|
12
|
+
#
|
13
|
+
# @example
|
14
|
+
# require 'rumale/clustering/single_linkage'
|
15
|
+
#
|
16
|
+
# analyzer = Rumale::Clustering::SingleLinkage.new(n_clusters: 2)
|
17
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
18
|
+
#
|
19
|
+
# *Reference*
|
20
|
+
# - Mullner, D., "Modern hierarchical, agglomerative clustering algorithms," arXiv:1109.2378, 2011.
|
21
|
+
class SingleLinkage < ::Rumale::Base::Estimator
  include ::Rumale::Base::ClusterAnalyzer

  # Return the cluster labels.
  # @return [Numo::Int32] (shape: [n_samples])
  attr_reader :labels

  # Return the hierarchical structure.
  # @return [Array<SingleLinkage::Node>] (shape: [n_samples - 1])
  attr_reader :hierarchy

  # Create a new cluster analyzer with single linkage algorithm.
  #
  # @param n_clusters [Integer] The number of clusters.
  # @param metric [String] The metric to calculate the distances.
  #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
  #   If metric is 'precomputed', the fit and fit_predict methods expect to be given a distance matrix.
  #   Any other value is treated as 'euclidean'.
  def initialize(n_clusters: 2, metric: 'euclidean')
    super()
    @params = {
      n_clusters: n_clusters,
      metric: (metric == 'precomputed' ? 'precomputed' : 'euclidean')
    }
  end

  # Analysis clusters with given training data.
  #
  # @overload fit(x) -> SingleLinkage
  #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
  #     If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
  #   @return [SingleLinkage] The learned cluster analyzer itself.
  # @raise [ArgumentError] If the metric is 'precomputed' and x is not square.
  def fit(x, _y = nil)
    # fit_predict performs the input conversion and the shape check itself.
    fit_predict(x)
    self
  end

  # Analysis clusters and assign samples to clusters.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
  #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
  # @raise [ArgumentError] If the metric is 'precomputed' and x is not square.
  def fit_predict(x)
    x = ::Rumale::Validation.check_convert_sample_array(x)
    raise ArgumentError, 'the input distance matrix should be square' if check_invalid_array_shape(x)

    distance_mat = @params[:metric] == 'precomputed' ? x : ::Rumale::PairwiseMetric.euclidean_distance(x)
    @labels = partial_fit(distance_mat)
  end

  private

  # Tell whether a precomputed distance matrix has an invalid (non-square) shape.
  def check_invalid_array_shape(x)
    @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
  end

  # Disjoint-set forest over n leaves in which every union creates a new node.
  # Leaves use ids 0...n; merged clusters receive ids n, n + 1, ... in creation order.
  # @!visibility private
  class UnionFind
    def initialize(n)
      # -1 marks a node without a parent, i.e. a current root.
      @parent = Numo::Int32.zeros(2 * n - 1) - 1
      # Leaves start with size 1; slots of clusters not yet created start with size 0.
      @size = Numo::Int32.hstack([Numo::Int32.ones(n), Numo::Int32.zeros(n - 1)])
      @next_label = n
    end

    # Merge the clusters rooted at x and y under a newly created node.
    # @return [Integer] The number of elements in the merged cluster.
    # @!visibility private
    def union(x, y)
      size = @size[x] + @size[y]
      @parent[x] = @next_label
      @parent[y] = @next_label
      @size[@next_label] = size
      @next_label += 1
      size
    end

    # Return the root of the cluster containing x, compressing the visited path.
    # @!visibility private
    def find(x)
      root = x
      root = @parent[root] while @parent[root] != -1
      # Point every node on the path straight at the root. The walk stops at the
      # root itself, so the -1 marker is never followed as if it were an index.
      while x != root
        parent = @parent[x]
        @parent[x] = root
        x = parent
      end
      root
    end
  end

  # A weighted link between two nodes, optionally annotated with the size of the merged cluster.
  # @!visibility private
  class Node
    # @!visibility private
    attr_reader :x, :y, :weight, :n_elements

    # @!visibility private
    def initialize(x:, y:, weight:, n_elements: 0)
      @x = x
      @y = y
      @weight = weight
      @n_elements = n_elements
    end

    # @!visibility private
    def ==(other)
      x == other.x && y == other.y && weight == other.weight && n_elements == other.n_elements
    end
  end

  private_constant :UnionFind, :Node

  # Build the hierarchy from the distance matrix, keep it in @hierarchy and return flat labels.
  def partial_fit(distance_mat)
    mst = minimum_spanning_tree(distance_mat)
    @hierarchy = single_linkage_hierarchy(mst)
    flatten(@hierarchy, @params[:n_clusters])
  end

  # Grow a spanning tree from node 0, always attaching the closest node not yet in the tree.
  #
  # @param complete_graph [Numo::DFloat] (shape: [n_samples, n_samples]) The pairwise distances.
  # @return [Array<Node>] The n_samples - 1 tree edges sorted by ascending weight.
  def minimum_spanning_tree(complete_graph)
    n_samples = complete_graph.shape[0]
    n_edges = n_samples - 1
    # curr_labels holds the nodes not yet in the tree; curr_weights holds, for each
    # of them, the smallest distance to any node already in the tree.
    curr_weights = Numo::DFloat.zeros(n_samples) + Float::INFINITY
    curr_labels = Numo::Int32.new(n_samples).seq
    next_node = 0
    mst = Array.new(n_edges) do
      curr_node = next_node
      target = curr_labels.ne(curr_node)
      curr_labels = curr_labels[target]
      curr_weights = Numo::DFloat.minimum(curr_weights[target], complete_graph[curr_node, curr_labels])
      next_node = curr_labels[curr_weights.min_index]
      weight = curr_weights.min
      Node.new(x: curr_node, y: next_node, weight: weight)
    end
    mst.sort! { |a, b| a.weight <=> b.weight }
  end

  # Convert the sorted tree edges into merge records.
  # The n-th record describes the cluster with id n_samples + n.
  #
  # @param mst [Array<Node>] The tree edges sorted by ascending weight.
  # @return [Array<Node>] One record per merge; x < y are the merged cluster ids.
  def single_linkage_hierarchy(mst)
    n_edges = mst.size
    # A single sample has no edges, hence nothing to merge.
    return [] if n_edges.zero?

    n_nodes = n_edges + 1
    uf = UnionFind.new(n_nodes)
    Array.new(n_edges) do |n|
      x_root = uf.find(mst[n].x)
      y_root = uf.find(mst[n].y)
      x_root, y_root = [y_root, x_root] unless x_root < y_root
      weight = mst[n].weight
      n_samples = uf.union(x_root, y_root)
      Node.new(x: x_root, y: y_root, weight: weight, n_elements: n_samples)
    end
  end

  # Collect the ids of the samples (leaves) below the given node of the hierarchy.
  #
  # @param hierarchy_ [Array<Node>] The merge records.
  # @param start_node [Integer] The id of a sample or of a merged cluster.
  # @return [Array<Integer>] The sample ids.
  def descedent_ids(hierarchy_, start_node)
    n_samples = hierarchy_.size + 1
    return [start_node] if start_node < n_samples

    res = []
    indices = [start_node]
    # Depth-first walk: ids below n_samples are samples, the others are merged clusters.
    until indices.empty?
      idx = indices.pop
      if idx < n_samples
        res.push(idx)
      else
        indices.push(hierarchy_[idx - n_samples].x)
        indices.push(hierarchy_[idx - n_samples].y)
      end
    end
    res
  end

  # Cut the hierarchy into flat clusters by repeatedly splitting the most recently merged cluster.
  # When more clusters are requested than there are samples, every sample gets its own label.
  #
  # @param hierarchy_ [Array<Node>] The merge records.
  # @param n_clusters [Integer] The requested number of clusters.
  # @return [Numo::Int32] (shape: [n_samples]) The cluster label per sample.
  def flatten(hierarchy_, n_clusters)
    n_samples = hierarchy_.size + 1
    # With fewer than two clusters, or a lone sample, everything belongs to cluster 0.
    return Numo::Int32.zeros(n_samples) if n_clusters < 2 || hierarchy_.empty?

    # Ids are kept negated so that an ascending sort puts the largest id,
    # i.e. the most recently merged cluster, first.
    nodes = [-([hierarchy_[-1].x, hierarchy_[-1].y].max + 1)]
    (n_clusters - 1).times do
      # Only single samples remain; they cannot be split any further.
      break if -nodes[0] < n_samples

      children = hierarchy_[-nodes[0] - n_samples]
      nodes.push(-children.x)
      nodes.push(-children.y)
      nodes.sort!.shift
    end
    res = Numo::Int32.zeros(n_samples)
    nodes.each_with_index { |sid, cluster_id| res[descedent_ids(hierarchy_, -sid)] = cluster_id }
    res
  end
end
|
205
|
+
end
|
206
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/pairwise_metric'
|
4
|
+
require 'rumale/clustering/dbscan'
|
5
|
+
|
6
|
+
module Rumale
|
7
|
+
module Clustering
|
8
|
+
# SNN is a class that implements Shared Nearest Neighbor cluster analysis.
|
9
|
+
# The SNN method is a variation of DBSCAN that uses similarity based on k-nearest neighbors as a metric.
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# require 'rumale/clustering/snn'
|
13
|
+
#
|
14
|
+
# analyzer = Rumale::Clustering::SNN.new(n_neighbors: 10, eps: 5, min_samples: 5)
|
15
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
16
|
+
#
|
17
|
+
# *Reference*
|
18
|
+
# - Ertoz, L., Steinbach, M., and Kumar, V., "Finding Clusters of Different Sizes, Shapes, and Densities in Noisy, High Dimensional Data," Proc. SDM'03, pp. 47--58, 2003.
|
19
|
+
# - Houle, M E., Kriegel, H-P., Kroger, P., Schubert, E., and Zimek, A., "Can Shared-Neighbor Distances Defeat the Curse of Dimensionality?," Proc. SSDBM'10, pp. 482--500, 2010.
|
20
|
+
class SNN < DBSCAN
  # Create a new cluster analyzer with Shared Nearest Neighbor method.
  #
  # @param n_neighbors [Integer] The number of neighbors to be used for finding k-nearest neighbors.
  # @param eps [Integer] The threshold value for finding connected components based on similarity.
  # @param min_samples [Integer] The number of neighbor samples to be used for the criterion whether a point is a core point.
  # @param metric [String] The metric to calculate the distances.
  #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
  #   If metric is 'precomputed', the fit and fit_predict methods expect to be given a distance matrix.
  #   Any other value is treated as 'euclidean'.
  def initialize(n_neighbors: 10, eps: 5, min_samples: 5, metric: 'euclidean') # rubocop:disable Lint/MissingSuper
    metric = 'euclidean' unless metric == 'precomputed'
    @params = { n_neighbors: n_neighbors, eps: eps, min_samples: min_samples, metric: metric }
  end

  # Analysis clusters with given training data.
  #
  # @overload fit(x) -> SNN
  #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
  #     If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
  #   @return [SNN] The learned cluster analyzer itself.
  def fit(x, _y = nil)
    super
  end

  # Analysis clusters and assign samples to clusters.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
  #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
  def fit_predict(x) # rubocop:disable Lint/UselessMethodDefinition
    super
  end

  private

  # Compute the shared-nearest-neighbor similarity: entry (i, j) is the number of
  # samples that appear in the n_neighbors-nearest lists of both sample i and sample j.
  def calc_pairwise_metrics(x)
    dist = if @params[:metric] == 'precomputed'
             x
           else
             ::Rumale::PairwiseMetric.euclidean_distance(x)
           end
    n_rows = dist.shape[0]
    k = @params[:n_neighbors]
    # Row i of knn_graph flags the k closest samples to sample i.
    knn_graph = Numo::DFloat.zeros(n_rows, n_rows)
    (0...n_rows).each do |row|
      nearest = dist[row, true].sort_index[0...k]
      knn_graph[row, nearest] = 1
    end
    knn_graph.dot(knn_graph.transpose)
  end

  # Return the indices whose similarity is strictly greater than eps.
  def region_query(similarity_arr)
    above_threshold = similarity_arr.gt(@params[:eps])
    above_threshold.where.to_a
  end
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/estimator'
|
4
|
+
require 'rumale/base/cluster_analyzer'
|
5
|
+
require 'rumale/pairwise_metric'
|
6
|
+
require 'rumale/utils'
|
7
|
+
require 'rumale/validation'
|
8
|
+
require 'rumale/clustering/k_means'
|
9
|
+
|
10
|
+
module Rumale
|
11
|
+
module Clustering
|
12
|
+
# SpectralClustering is a class that implements the normalized spectral clustering.
|
13
|
+
#
|
14
|
+
# @example
|
15
|
+
# require 'numo/linalg/autoloader'
|
16
|
+
# require 'rumale/clustering/spectral_clustering'
|
17
|
+
#
|
18
|
+
# analyzer = Rumale::Clustering::SpectralClustering.new(n_clusters: 10, gamma: 8.0)
|
19
|
+
# cluster_labels = analyzer.fit_predict(samples)
|
20
|
+
#
|
21
|
+
# *Reference*
|
22
|
+
# - Ng, A Y., Jordan, M I., and Weiss, Y., "On Spectral Clustering: Analysis and an algorithm," Proc. NIPS'01, pp. 849--856, 2001.
|
23
|
+
# - von Luxburg, U., "A tutorial on spectral clustering," Statistics and Computing, Vol. 17 (4), pp. 395--416, 2007.
|
24
|
+
class SpectralClustering < ::Rumale::Base::Estimator
  include ::Rumale::Base::ClusterAnalyzer

  # Return the data in embedded space.
  # @return [Numo::DFloat] (shape: [n_samples, n_clusters])
  attr_reader :embedding

  # Return the cluster labels.
  # @return [Numo::Int32] (shape: [n_samples])
  attr_reader :labels

  # Create a new cluster analyzer with normalized spectral clustering.
  #
  # @param n_clusters [Integer] The number of clusters.
  # @param affinity [String] The representation of affinity matrix ('rbf' or 'precomputed').
  #   If affinity = 'rbf', the class performs the normalized spectral clustering with the fully connected graph weighted by rbf kernel.
  # @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
  #   If affinity = 'precomputed', this parameter is ignored.
  # @param init [String] The initialization method for centroids of K-Means clustering ('random' or 'k-means++').
  #   Any value other than 'random' is treated as 'k-means++'.
  # @param max_iter [Integer] The maximum number of iterations for K-Means clustering.
  # @param tol [Float] The tolerance of termination criterion for K-Means clustering.
  # @param random_seed [Integer] The seed value using to initialize the random generator.
  def initialize(n_clusters: 2, affinity: 'rbf', gamma: nil, init: 'k-means++', max_iter: 10, tol: 1.0e-8, random_seed: nil)
    super()
    @params = {
      n_clusters: n_clusters,
      affinity: affinity,
      gamma: gamma,
      init: (init == 'random' ? 'random' : 'k-means++'),
      max_iter: max_iter,
      tol: tol,
      random_seed: (random_seed || srand)
    }
  end

  # Analysis clusters with given training data.
  # To execute this method, Numo::Linalg must be loaded.
  #
  # @overload fit(x) -> SpectralClustering
  #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
  #     If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
  #   @return [SpectralClustering] The learned cluster analyzer itself.
  # @raise [ArgumentError] If the affinity is 'precomputed' and x is not square.
  # @raise [RuntimeError] If Numo::Linalg is not loaded.
  def fit(x, _y = nil)
    x = ::Rumale::Validation.check_convert_sample_array(x)
    raise ArgumentError, 'the input affinity matrix should be square' if check_invalid_array_shape(x)

    # Checked here as well so that the error message names this method.
    raise 'SpectralClustering#fit requires Numo::Linalg but that is not loaded' unless enable_linalg?(warning: false)

    fit_predict(x)
    self
  end

  # Analysis clusters and assign samples to clusters.
  # To execute this method, Numo::Linalg must be loaded.
  #
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
  #   If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
  # @raise [ArgumentError] If the affinity is 'precomputed' and x is not square.
  # @raise [RuntimeError] If Numo::Linalg is not loaded.
  def fit_predict(x)
    x = ::Rumale::Validation.check_convert_sample_array(x)
    raise ArgumentError, 'the input affinity matrix should be square' if check_invalid_array_shape(x)

    unless enable_linalg?(warning: false)
      raise 'SpectralClustering#fit_predict requires Numo::Linalg but that is not loaded'
    end

    # The option is stored under :affinity; there is no :metric key in @params.
    affinity_mat = if @params[:affinity] == 'precomputed'
                     # embedded_space zeroes the diagonal in place and x may be the caller's own
                     # object, so work on a copy to leave the given matrix untouched.
                     x.dup
                   else
                     ::Rumale::PairwiseMetric.rbf_kernel(x, nil, @params[:gamma])
                   end
    @embedding = embedded_space(affinity_mat, @params[:n_clusters])
    normalized_embedding = ::Rumale::Utils.normalize(@embedding, 'l2')
    @labels = kmeans_clustering(normalized_embedding)
  end

  private

  # Tell whether a precomputed affinity matrix has an invalid (non-square) shape.
  def check_invalid_array_shape(x)
    @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]
  end

  # Compute the spectral embedding from the affinity matrix.
  # The diagonal of affinity_mat is overwritten with zeros.
  #
  # @return [Numo::DFloat] (shape: [n_samples, n_clusters]) The selected eigenvectors as columns.
  def embedded_space(affinity_mat, n_clusters)
    affinity_mat[affinity_mat.diag_indices] = 0.0
    # Scale the affinity symmetrically by the inverse square root of the row sums.
    degrees = 1.0 / Numo::NMath.sqrt(affinity_mat.sum(axis: 1))
    laplacian_mat = degrees.diag.dot(affinity_mat).dot(degrees.diag)

    n_samples = affinity_mat.shape[0]
    # Request the last n_clusters eigenpairs, then reverse the column order.
    # NOTE(review): this presumes eigh orders eigenvalues ascending, so the largest come first after the reversal.
    _, eig_vecs = Numo::Linalg.eigh(laplacian_mat, vals_range: (n_samples - n_clusters)...n_samples)
    eig_vecs.reverse(1).dup
  end

  # Cluster the rows of the given embedding with K-Means.
  def kmeans_clustering(x)
    ::Rumale::Clustering::KMeans.new(
      n_clusters: @params[:n_clusters], init: @params[:init],
      max_iter: @params[:max_iter], tol: @params[:tol], random_seed: @params[:random_seed]
    ).fit_predict(x)
  end
end
|
119
|
+
end
|
120
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'numo/narray'
|
4
|
+
|
5
|
+
require_relative 'clustering/dbscan'
|
6
|
+
require_relative 'clustering/gaussian_mixture'
|
7
|
+
require_relative 'clustering/hdbscan'
|
8
|
+
require_relative 'clustering/k_means'
|
9
|
+
require_relative 'clustering/k_medoids'
|
10
|
+
require_relative 'clustering/mini_batch_k_means'
|
11
|
+
require_relative 'clustering/power_iteration'
|
12
|
+
require_relative 'clustering/single_linkage'
|
13
|
+
require_relative 'clustering/snn'
|
14
|
+
require_relative 'clustering/spectral_clustering'
|
15
|
+
require_relative 'clustering/version'
|
metadata
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rumale-clustering
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.24.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- yoshoku
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2022-12-31 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: numo-narray
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.9.1
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.9.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rumale-core
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.24.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.24.0
|
41
|
+
description: |
|
42
|
+
Rumale::Clustering provides cluster analysis algorithms,
|
43
|
+
such as K-Means, Gaussian Mixture Model, DBSCAN, and Spectral Clustering,
|
44
|
+
with Rumale interface.
|
45
|
+
email:
|
46
|
+
- yoshoku@outlook.com
|
47
|
+
executables: []
|
48
|
+
extensions: []
|
49
|
+
extra_rdoc_files: []
|
50
|
+
files:
|
51
|
+
- LICENSE.txt
|
52
|
+
- README.md
|
53
|
+
- lib/rumale/clustering.rb
|
54
|
+
- lib/rumale/clustering/dbscan.rb
|
55
|
+
- lib/rumale/clustering/gaussian_mixture.rb
|
56
|
+
- lib/rumale/clustering/hdbscan.rb
|
57
|
+
- lib/rumale/clustering/k_means.rb
|
58
|
+
- lib/rumale/clustering/k_medoids.rb
|
59
|
+
- lib/rumale/clustering/mini_batch_k_means.rb
|
60
|
+
- lib/rumale/clustering/power_iteration.rb
|
61
|
+
- lib/rumale/clustering/single_linkage.rb
|
62
|
+
- lib/rumale/clustering/snn.rb
|
63
|
+
- lib/rumale/clustering/spectral_clustering.rb
|
64
|
+
- lib/rumale/clustering/version.rb
|
65
|
+
homepage: https://github.com/yoshoku/rumale
|
66
|
+
licenses:
|
67
|
+
- BSD-3-Clause
|
68
|
+
metadata:
|
69
|
+
homepage_uri: https://github.com/yoshoku/rumale
|
70
|
+
source_code_uri: https://github.com/yoshoku/rumale/tree/main/rumale-clustering
|
71
|
+
changelog_uri: https://github.com/yoshoku/rumale/blob/main/CHANGELOG.md
|
72
|
+
documentation_uri: https://yoshoku.github.io/rumale/doc/
|
73
|
+
rubygems_mfa_required: 'true'
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options: []
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
requirements: []
|
89
|
+
rubygems_version: 3.3.26
|
90
|
+
signing_key:
|
91
|
+
specification_version: 4
|
92
|
+
summary: Rumale::Clustering provides cluster analysis algorithms with Rumale interface.
|
93
|
+
test_files: []
|