rumale-clustering 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +27 -0
- data/README.md +34 -0
- data/lib/rumale/clustering/dbscan.rb +126 -0
- data/lib/rumale/clustering/gaussian_mixture.rb +215 -0
- data/lib/rumale/clustering/hdbscan.rb +289 -0
- data/lib/rumale/clustering/k_means.rb +120 -0
- data/lib/rumale/clustering/k_medoids.rb +143 -0
- data/lib/rumale/clustering/mini_batch_k_means.rb +138 -0
- data/lib/rumale/clustering/power_iteration.rb +128 -0
- data/lib/rumale/clustering/single_linkage.rb +206 -0
- data/lib/rumale/clustering/snn.rb +75 -0
- data/lib/rumale/clustering/spectral_clustering.rb +120 -0
- data/lib/rumale/clustering/version.rb +10 -0
- data/lib/rumale/clustering.rb +15 -0
- metadata +93 -0
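
Every analyzer in this gem follows the same `fit` / `predict` / `fit_predict` interface from Rumale's base mixins. As a quick orientation before the file-by-file diff, here is a minimal usage sketch (assuming the gem and numo-narray are installed; the data is made up, and it assumes the top-level `rumale/clustering` entry point loads the analyzer classes):

```ruby
require 'numo/narray'
require 'rumale/clustering'

# 200 random 2-D points stand in for real data.
samples = Numo::DFloat.new(200, 2).rand

analyzer = Rumale::Clustering::KMeans.new(n_clusters: 3, random_seed: 1)
labels = analyzer.fit_predict(samples) # Numo::Int32, one label per sample
```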
data/lib/rumale/clustering/hdbscan.rb (@@ -0,0 +1,289 @@)

```ruby
# frozen_string_literal: true

require 'rumale/base/estimator'
require 'rumale/base/cluster_analyzer'
require 'rumale/pairwise_metric'
require 'rumale/validation'
require 'rumale/clustering/single_linkage'

module Rumale
  module Clustering
    # HDBSCAN is a class that implements HDBSCAN cluster analysis.
    #
    # @example
    #   require 'rumale/clustering/hdbscan'
    #
    #   analyzer = Rumale::Clustering::HDBSCAN.new(min_samples: 5)
    #   cluster_labels = analyzer.fit_predict(samples)
    #
    # *Reference*
    # - Campello, R J. G. B., Moulavi, D., Zimek, A., and Sander, J., "Hierarchical Density Estimates for Data Clustering, Visualization, and Outlier Detection," TKDD, Vol. 10 (1), pp. 5:1--5:51, 2015.
    # - Campello, R J. G. B., Moulavi, D., and Sander, J., "Density-Based Clustering Based on Hierarchical Density Estimates," Proc. PAKDD'13, pp. 160--172, 2013.
    # - Lelis, L., and Sander, J., "Semi-Supervised Density-Based Clustering," Proc. ICDM'09, pp. 842--847, 2009.
    class HDBSCAN < ::Rumale::Base::Estimator # rubocop:disable Metrics/ClassLength
      include ::Rumale::Base::ClusterAnalyzer

      # Return the cluster labels. The negative cluster label indicates that the point is noise.
      # @return [Numo::Int32] (shape: [n_samples])
      attr_reader :labels

      # Create a new cluster analyzer with the HDBSCAN algorithm.
      #
      # @param min_samples [Integer] The number of neighbor samples used in the criterion for whether a point is a core point.
      # @param min_cluster_size [Integer/Nil] The minimum size of a cluster. If nil is given, it is set equal to min_samples.
      # @param metric [String] The metric to calculate the distances.
      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
      #   If metric is 'precomputed', the fit and fit_transform methods expect to be given a distance matrix.
      def initialize(min_samples: 10, min_cluster_size: nil, metric: 'euclidean')
        super()
        @params = {
          min_samples: min_samples,
          min_cluster_size: min_cluster_size || min_samples,
          metric: (metric == 'precomputed' ? 'precomputed' : 'euclidean')
        }
      end

      # Analyze clusters with the given training data.
      #
      # @overload fit(x) -> HDBSCAN
      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      #     If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
      # @return [HDBSCAN] The learned cluster analyzer itself.
      def fit(x, _y = nil)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        raise ArgumentError, 'the input distance matrix should be square' if check_invalid_array_shape(x)

        fit_predict(x)
        self
      end

      # Analyze clusters and assign samples to clusters.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def fit_predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        raise ArgumentError, 'the input distance matrix should be square' if check_invalid_array_shape(x)

        distance_mat = @params[:metric] == 'precomputed' ? x : ::Rumale::PairwiseMetric.euclidean_distance(x)
        @labels = partial_fit(distance_mat)
      end

      private

      def check_invalid_array_shape(x)
        @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
      end

      # @!visibility private
      class UnionFind
        def initialize(n)
          @parent = Numo::Int32.new(n).seq
          @rank = Numo::Int32.zeros(n)
        end

        # @!visibility private
        def union(x, y)
          x_root = find(x)
          y_root = find(y)

          return if x_root == y_root

          if @rank[x_root] < @rank[y_root]
            @parent[x_root] = y_root
          else
            @parent[y_root] = x_root
            @rank[x_root] += 1 if @rank[x_root] == @rank[y_root]
          end

          nil
        end

        # @!visibility private
        def find(x)
          @parent[x] = find(@parent[x]) if @parent[x] != x
          @parent[x]
        end
      end

      # @!visibility private
      class Node
        # @!visibility private
        attr_reader :x, :y, :weight, :n_elements

        # @!visibility private
        def initialize(x:, y:, weight:, n_elements: 0)
          @x = x
          @y = y
          @weight = weight
          @n_elements = n_elements
        end

        # @!visibility private
        def ==(other)
          x == other.x && y == other.y && weight == other.weight && n_elements == other.n_elements
        end
      end

      private_constant :UnionFind, :Node

      def partial_fit(distance_mat)
        mr_distance_mat = mutual_reachability_distances(distance_mat, @params[:min_samples])
        hierarchy = ::Rumale::Clustering::SingleLinkage.new(n_clusters: 1, metric: 'precomputed').fit(mr_distance_mat).hierarchy
        tree = condense_tree(hierarchy, @params[:min_cluster_size])
        stabilities = cluster_stability(tree)
        flatten(tree, stabilities)
      end

      def mutual_reachability_distances(distance_mat, min_samples)
        core_distances = distance_mat.sort(axis: 1)[true, min_samples + 1]
        Numo::DFloat.maximum(core_distances.expand_dims(1), Numo::DFloat.maximum(core_distances, distance_mat))
      end
```
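
The `mutual_reachability_distances` step above implements the mutual reachability distance from the Campello et al. papers cited in the class comment. Writing $\mathrm{core}_k(p)$ for the distance from $p$ to its $k$-th nearest neighbor (the `core_distances` column pulled from the sorted distance matrix), the pairwise value is

$$
d_{\mathrm{mreach}\text{-}k}(a, b) = \max\{\mathrm{core}_k(a),\ \mathrm{core}_k(b),\ d(a, b)\},
$$

which is exactly the nested `Numo::DFloat.maximum` call: the core distances broadcast as a row, broadcast as a column via `expand_dims(1)`, and the raw distance matrix. Running single linkage over this transformed matrix yields the density-based hierarchy that the rest of the file condenses and flattens: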
```ruby
      def breadth_first_search_hierarchy(hierarchy, root)
        n_edges = hierarchy.size
        n_points = n_edges + 1
        to_process = [root]
        res = []
        while to_process.any?
          res.concat(to_process)
          to_process = to_process.select { |n| n >= n_points }.map { |n| n - n_points }
          to_process = to_process.map { |n| [hierarchy[n].x, hierarchy[n].y] }.flatten if to_process.any?
        end
        res
      end

      # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
      def condense_tree(hierarchy, min_cluster_size)
        n_edges = hierarchy.size
        root = 2 * n_edges
        n_points = n_edges + 1
        next_label = n_points + 1

        node_ids = breadth_first_search_hierarchy(hierarchy, root)

        relabel = Numo::Int32.zeros(root + 1)
        relabel[root] = n_points
        res = []
        visited = {}

        node_ids.each do |n_id|
          next if visited[n_id] || n_id < n_points

          edge = hierarchy[n_id - n_points]

          density = edge.weight > 0.0 ? 1.fdiv(edge.weight) : Float::INFINITY
          n_x_elements = edge.x >= n_points ? hierarchy[edge.x - n_points].n_elements : 1
          n_y_elements = edge.y >= n_points ? hierarchy[edge.y - n_points].n_elements : 1

          if n_x_elements >= min_cluster_size && n_y_elements >= min_cluster_size
            relabel[edge.x] = next_label
            res.push(Node.new(x: relabel[n_id], y: relabel[edge.x], weight: density, n_elements: n_x_elements))
            next_label += 1
            relabel[edge.y] = next_label
            res.push(Node.new(x: relabel[n_id], y: relabel[edge.y], weight: density, n_elements: n_y_elements))
            next_label += 1
          elsif n_x_elements < min_cluster_size && n_y_elements < min_cluster_size
            breadth_first_search_hierarchy(hierarchy, edge.x).each do |sn_id|
              res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
              visited[sn_id] = true
            end
            breadth_first_search_hierarchy(hierarchy, edge.y).each do |sn_id|
              res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
              visited[sn_id] = true
            end
          elsif n_x_elements < min_cluster_size
            relabel[edge.y] = relabel[n_id]
            breadth_first_search_hierarchy(hierarchy, edge.x).each do |sn_id|
              res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
              visited[sn_id] = true
            end
          elsif n_y_elements < min_cluster_size
            relabel[edge.x] = relabel[n_id]
            breadth_first_search_hierarchy(hierarchy, edge.y).each do |sn_id|
              res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
              visited[sn_id] = true
            end
          end
        end
        res
      end

      def cluster_stability(tree)
        tree.sort! { |a, b| a.weight <=> b.weight }

        root = tree.map(&:x).min
        child_max = tree.map(&:y).max
        child_max = root if child_max < root
        densities = Numo::DFloat.zeros(child_max + 1) + Float::INFINITY

        current = tree[0].y
        density_min = tree[0].weight
        tree.each do |edge|
          if edge.x == current
            density_min = [density_min, edge.weight].min
          else
            densities[current] = density_min
            current = edge.y
            density_min = edge.weight
          end
        end

        densities[current] = density_min if current != tree[0].y
        densities[root] = 0.0

        tree.each_with_object({}) do |edge, stab|
          stab[edge.x] ||= 0.0
          stab[edge.x] += (edge.weight - densities[edge.x]) * edge.n_elements
        end
      end
```
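
The bookkeeping in `cluster_stability` follows the excess-of-mass criterion from the references: with density $\lambda = 1/\text{distance}$ (the edge `weight`), the stability of a condensed-tree cluster $C_i$ is

$$
S(C_i) = \sum_{x_j \in C_i} \left( \lambda_{\max}(x_j, C_i) - \lambda_{\min}(C_i) \right),
$$

where $\lambda_{\min}(C_i)$ is the density at which $C_i$ appears (tracked in `densities`) and $\lambda_{\max}(x_j, C_i)$ is the density at which $x_j$ leaves it. The final methods select the most stable non-overlapping clusters and label everything else as noise: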
```ruby
      def breadth_first_search_tree(tree, root)
        to_process = [root]
        res = []
        while to_process.any?
          res.concat(to_process)
          to_process = tree.select { |v| to_process.include?(v.x) }.map(&:y)
        end
        res
      end

      def flatten(tree, stabilities)
        node_ids = stabilities.keys.sort.reverse.slice(0, stabilities.size - 1)

        cluster_tree = tree.select { |edge| edge.n_elements > 1 }
        is_cluster = node_ids.each_with_object({}) { |n_id, h| h[n_id] = true }

        node_ids.each do |n_id|
          children = cluster_tree.select { |node| node.x == n_id }.map(&:y)
          subtree_stability = children.inject(0.0) { |sum, c_id| sum + stabilities[c_id] }
          if subtree_stability > stabilities[n_id]
            is_cluster[n_id] = false
            stabilities[n_id] = subtree_stability
          else
            breadth_first_search_tree(cluster_tree, n_id).each do |sn_id|
              is_cluster[sn_id] = false if sn_id != n_id
            end
          end
        end

        selected_node_ids = is_cluster.select { |_k, v| v == true }.keys.uniq.sort
        cluster_label_map = selected_node_ids.each_with_object({}).with_index { |(n_idx, h), c_idx| h[n_idx] = c_idx }

        parent_arr = tree.map(&:x)
        uf = UnionFind.new(parent_arr.max + 1)
        tree.each { |edge| uf.union(edge.x, edge.y) if cluster_label_map[edge.y].nil? }

        root = parent_arr.min
        res = Numo::Int32.zeros(root)
        root.times do |n|
          cluster = uf.find(n)
          res[n] = cluster < root ? -1 : cluster_label_map[cluster] || -1
        end
        res
      end
      # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
    end
  end
end
```
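
A short usage sketch for the class above (toy blobs invented for illustration; per the `labels` doc comment, noise points come back with a negative label):

```ruby
require 'numo/narray'
require 'rumale/clustering/hdbscan'

# Two dense blobs plus sparse background noise.
blob_a  = Numo::DFloat.new(50, 2).rand * 0.5
blob_b  = Numo::DFloat.new(50, 2).rand * 0.5 + 3.0
noise   = Numo::DFloat.new(10, 2).rand * 6.0 - 1.0
samples = Numo::NArray.vstack([blob_a, blob_b, noise])

analyzer = Rumale::Clustering::HDBSCAN.new(min_samples: 5)
labels = analyzer.fit_predict(samples)
labels.eq(-1).count # => number of points judged to be noise
```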
data/lib/rumale/clustering/k_means.rb (@@ -0,0 +1,120 @@)

```ruby
# frozen_string_literal: true

require 'rumale/base/estimator'
require 'rumale/base/cluster_analyzer'
require 'rumale/pairwise_metric'
require 'rumale/validation'

module Rumale
  module Clustering
    # KMeans is a class that implements K-Means cluster analysis.
    # The current implementation uses the Euclidean distance for analyzing the clusters.
    #
    # @example
    #   require 'rumale/clustering/k_means'
    #
    #   analyzer = Rumale::Clustering::KMeans.new(n_clusters: 10, max_iter: 50)
    #   cluster_labels = analyzer.fit_predict(samples)
    #
    # *Reference*
    # - Arthur, D., and Vassilvitskii, S., "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
    class KMeans < ::Rumale::Base::Estimator
      include ::Rumale::Base::ClusterAnalyzer

      # Return the centroids.
      # @return [Numo::DFloat] (shape: [n_clusters, n_features])
      attr_reader :cluster_centers

      # Return the random generator.
      # @return [Random]
      attr_reader :rng

      # Create a new cluster analyzer with the K-Means method.
      #
      # @param n_clusters [Integer] The number of clusters.
      # @param init [String] The initialization method for centroids ('random' or 'k-means++').
      # @param max_iter [Integer] The maximum number of iterations.
      # @param tol [Float] The tolerance of termination criterion.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil)
        super()
        @params = {
          n_clusters: n_clusters,
          init: (init == 'random' ? 'random' : 'k-means++'),
          max_iter: max_iter,
          tol: tol,
          random_seed: (random_seed || srand)
        }
        @rng = Random.new(@params[:random_seed])
      end

      # Analyze clusters with the given training data.
      #
      # @overload fit(x) -> KMeans
      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      # @return [KMeans] The learned cluster analyzer itself.
      def fit(x, _y = nil)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        init_cluster_centers(x)
        @params[:max_iter].times do |_t|
          cluster_labels = assign_cluster(x)
          old_centers = @cluster_centers.dup
          @params[:n_clusters].times do |n|
            assigned_bits = cluster_labels.eq(n)
            @cluster_centers[n, true] = x[assigned_bits.where, true].mean(axis: 0) if assigned_bits.count.positive?
          end
          error = Numo::NMath.sqrt(((old_centers - @cluster_centers)**2).sum(axis: 1)).mean
          break if error <= @params[:tol]
        end
        self
      end
```
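
A note on the stopping rule in `fit`: after each Lloyd sweep it measures the mean Euclidean shift of the centroids,

$$
\varepsilon = \frac{1}{K} \sum_{k=1}^{K} \lVert c_k^{\mathrm{old}} - c_k^{\mathrm{new}} \rVert_2,
$$

and stops early once $\varepsilon \le$ `tol`. The remaining methods cover prediction and seeding: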
```ruby
      # Predict cluster labels for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        assign_cluster(x)
      end

      # Analyze clusters and assign samples to clusters.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def fit_predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        fit(x).predict(x)
      end

      private

      def assign_cluster(x)
        distance_matrix = ::Rumale::PairwiseMetric.euclidean_distance(x, @cluster_centers)
        distance_matrix.min_index(axis: 1) - Numo::Int32[*0.step(distance_matrix.size - 1, @cluster_centers.shape[0])]
      end

      def init_cluster_centers(x)
        # random initialize
        n_samples = x.shape[0]
        sub_rng = @rng.dup
        rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
        @cluster_centers = x[rand_id, true].dup
        return unless @params[:init] == 'k-means++'

        # k-means++ initialize
        (1...@params[:n_clusters]).each do |n|
          distance_matrix = ::Rumale::PairwiseMetric.euclidean_distance(x, @cluster_centers[0...n, true])
          min_distances = distance_matrix.flatten[distance_matrix.min_index(axis: 1)]
          probs = min_distances**2 / (min_distances**2).sum
          cum_probs = probs.cumsum
          selected_id = cum_probs.gt(sub_rng.rand).where.to_a.first
          @cluster_centers[n, true] = x[selected_id, true].dup
        end
      end
    end
  end
end
```
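
The k-means++ loop in `init_cluster_centers` samples each new center with probability proportional to the squared distance $D(x)^2$ to the nearest center chosen so far,

$$
P(x_i) = \frac{D(x_i)^2}{\sum_j D(x_j)^2},
$$

using inverse-CDF sampling: `cum_probs.gt(sub_rng.rand).where.to_a.first` returns the first index whose cumulative probability exceeds a uniform draw.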
data/lib/rumale/clustering/k_medoids.rb (@@ -0,0 +1,143 @@)

```ruby
# frozen_string_literal: true

require 'rumale/base/estimator'
require 'rumale/base/cluster_analyzer'
require 'rumale/pairwise_metric'

module Rumale
  module Clustering
    # KMedoids is a class that implements K-Medoids cluster analysis.
    #
    # @example
    #   require 'rumale/clustering/k_medoids'
    #
    #   analyzer = Rumale::Clustering::KMedoids.new(n_clusters: 10, max_iter: 50)
    #   cluster_labels = analyzer.fit_predict(samples)
    #
    # *Reference*
    # - Arthur, D., and Vassilvitskii, S., "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
    class KMedoids < ::Rumale::Base::Estimator
      include ::Rumale::Base::ClusterAnalyzer

      # Return the indices of medoids.
      # @return [Numo::Int32] (shape: [n_clusters])
      attr_reader :medoid_ids

      # Return the random generator.
      # @return [Random]
      attr_reader :rng

      # Create a new cluster analyzer with the K-Medoids method.
      #
      # @param n_clusters [Integer] The number of clusters.
      # @param metric [String] The metric to calculate the distances.
      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
      #   If metric is 'precomputed', the fit and fit_transform methods expect to be given a distance matrix.
      # @param init [String] The initialization method for centroids ('random' or 'k-means++').
      # @param max_iter [Integer] The maximum number of iterations.
      # @param tol [Float] The tolerance of termination criterion.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      def initialize(n_clusters: 8, metric: 'euclidean', init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil)
        super()
        @params = {
          n_clusters: n_clusters,
          metric: (metric == 'precomputed' ? 'precomputed' : 'euclidean'),
          init: (init == 'random' ? 'random' : 'k-means++'),
          max_iter: max_iter,
          tol: tol,
          random_seed: (random_seed || srand)
        }
        @rng = Random.new(@params[:random_seed])
      end

      # Analyze clusters with the given training data.
      #
      # @overload fit(x) -> KMedoids
      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      #     If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
      # @return [KMedoids] The learned cluster analyzer itself.
      def fit(x, _y = nil)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        raise ArgumentError, 'the input distance matrix should be square' if check_invalid_array_shape(x)

        # initialize some variables.
        distance_mat = @params[:metric] == 'precomputed' ? x : ::Rumale::PairwiseMetric.euclidean_distance(x)
        init_cluster_centers(distance_mat)
        error = distance_mat[true, @medoid_ids].mean
        @params[:max_iter].times do |_t|
          cluster_labels = assign_cluster(distance_mat[true, @medoid_ids])
          @params[:n_clusters].times do |n|
            assigned_ids = cluster_labels.eq(n).where
            @medoid_ids[n] = assigned_ids[distance_mat[assigned_ids, assigned_ids].sum(axis: 1).min_index]
          end
          new_error = distance_mat[true, @medoid_ids].mean
          break if (error - new_error).abs <= @params[:tol]

          error = new_error
        end
        @cluster_centers = x[@medoid_ids, true].dup if @params[:metric] == 'euclidean'
        self
      end
```
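
The medoid update inside `fit` picks, within each cluster $C_k$, the member that minimizes the total distance to the other members,

$$
m_k = \operatorname*{arg\,min}_{i \in C_k} \sum_{j \in C_k} d(x_i, x_j),
$$

which is the `distance_mat[assigned_ids, assigned_ids].sum(axis: 1).min_index` expression. Prediction and seeding mirror the K-Means implementation, with distances to medoids in place of distances to centroids: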
```ruby
      # Predict cluster labels for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
      #   If the metric is 'precomputed', x must be distances between samples and medoids (shape: [n_samples, n_clusters]).
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        distance_mat = @params[:metric] == 'precomputed' ? x : ::Rumale::PairwiseMetric.euclidean_distance(x, @cluster_centers)
        if @params[:metric] == 'precomputed' && distance_mat.shape[1] != @medoid_ids.size
          raise ArgumentError, 'the shape of input matrix should be n_samples-by-n_clusters'
        end

        assign_cluster(distance_mat)
      end

      # Analyze clusters and assign samples to clusters.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def fit_predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)
        raise ArgumentError, 'the input distance matrix should be square' if check_invalid_array_shape(x)

        fit(x)
        if @params[:metric] == 'precomputed'
          predict(x[true, @medoid_ids])
        else
          predict(x)
        end
      end

      private

      def check_invalid_array_shape(x)
        @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
      end

      def assign_cluster(distances_to_medoids)
        distances_to_medoids.min_index(axis: 1) - Numo::Int32[*0.step(distances_to_medoids.size - 1, @params[:n_clusters])]
      end

      def init_cluster_centers(distance_mat)
        # random initialize
        n_samples = distance_mat.shape[0]
        sub_rng = @rng.dup
        @medoid_ids = Numo::Int32.asarray(Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng))
        return unless @params[:init] == 'k-means++'

        # k-means++ initialize
        (1...@params[:n_clusters]).each do |n|
          distances = distance_mat[true, @medoid_ids[0...n]]
          min_distances = distances.flatten[distances.min_index(axis: 1)]
          probs = min_distances**2 / (min_distances**2).sum
          cum_probs = probs.cumsum
          @medoid_ids[n] = cum_probs.gt(sub_rng.rand).where.to_a.first
        end
      end
    end
  end
end
```
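
Because K-Medoids only ever consumes pairwise distances, it also works on a precomputed matrix under any metric. A minimal sketch (made-up data; it assumes `Rumale::PairwiseMetric.manhattan_distance` is available, as in the main Rumale gems):

```ruby
require 'numo/narray'
require 'rumale/pairwise_metric'
require 'rumale/clustering'

samples = Numo::DFloat.new(100, 2).rand

# Cluster on Manhattan distances instead of the default Euclidean metric.
distance_mat = Rumale::PairwiseMetric.manhattan_distance(samples)

analyzer = Rumale::Clustering::KMedoids.new(n_clusters: 4, metric: 'precomputed', random_seed: 1)
labels = analyzer.fit_predict(distance_mat)
```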
data/lib/rumale/clustering/mini_batch_k_means.rb (@@ -0,0 +1,138 @@)

```ruby
# frozen_string_literal: true

require 'rumale/base/estimator'
require 'rumale/base/cluster_analyzer'
require 'rumale/pairwise_metric'
require 'rumale/validation'

module Rumale
  module Clustering
    # MiniBatchKMeans is a class that implements K-Means cluster analysis
    # with mini-batch stochastic gradient descent (SGD).
    #
    # @example
    #   require 'rumale/clustering/mini_batch_k_means'
    #
    #   analyzer = Rumale::Clustering::MiniBatchKMeans.new(n_clusters: 10, max_iter: 50, batch_size: 50, random_seed: 1)
    #   cluster_labels = analyzer.fit_predict(samples)
    #
    # *Reference*
    # - Sculley, D., "Web-scale k-means clustering," Proc. WWW'10, pp. 1177--1178, 2010.
    class MiniBatchKMeans < ::Rumale::Base::Estimator
      include ::Rumale::Base::ClusterAnalyzer

      # Return the centroids.
      # @return [Numo::DFloat] (shape: [n_clusters, n_features])
      attr_reader :cluster_centers

      # Return the random generator.
      # @return [Random]
      attr_reader :rng

      # Create a new cluster analyzer with the K-Means method using mini-batch SGD.
      #
      # @param n_clusters [Integer] The number of clusters.
      # @param init [String] The initialization method for centroids ('random' or 'k-means++').
      # @param max_iter [Integer] The maximum number of iterations.
      # @param batch_size [Integer] The size of the mini batches.
      # @param tol [Float] The tolerance of termination criterion.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      def initialize(n_clusters: 8, init: 'k-means++', max_iter: 100, batch_size: 100, tol: 1.0e-4, random_seed: nil)
        super()
        @params = {
          n_clusters: n_clusters,
          init: (init == 'random' ? 'random' : 'k-means++'),
          max_iter: max_iter,
          batch_size: batch_size,
          tol: tol,
          random_seed: (random_seed || srand)
        }
        @rng = Random.new(@params[:random_seed])
      end

      # Analyze clusters with the given training data.
      #
      # @overload fit(x) -> MiniBatchKMeans
      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      # @return [MiniBatchKMeans] The learned cluster analyzer itself.
      def fit(x, _y = nil)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        # initialization.
        n_samples = x.shape[0]
        update_counter = Numo::Int32.zeros(@params[:n_clusters])
        sub_rng = @rng.dup
        init_cluster_centers(x, sub_rng)
        # optimization with mini-batch SGD.
        @params[:max_iter].times do |_t|
          sample_ids = Array(0...n_samples).shuffle(random: sub_rng)
          old_centers = @cluster_centers.dup
          until (subset_ids = sample_ids.shift(@params[:batch_size])).empty?
            # sub sampling
            sub_x = x[subset_ids, true]
            # assign nearest centroids
            cluster_labels = assign_cluster(sub_x)
            # update centroids
            @params[:n_clusters].times do |c|
              assigned_bits = cluster_labels.eq(c)
              next unless assigned_bits.count.positive?

              update_counter[c] += 1
              learning_rate = 1.fdiv(update_counter[c])
              update = sub_x[assigned_bits.where, true].mean(axis: 0)
              @cluster_centers[c, true] = (1 - learning_rate) * @cluster_centers[c, true] + learning_rate * update
            end
          end
          error = Numo::NMath.sqrt(((old_centers - @cluster_centers)**2).sum(axis: 1)).mean
          break if error <= @params[:tol]
        end
        self
      end
```
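
The per-center update in the mini-batch loop is a per-batch variant of Sculley's convex update: with `update_counter[c]` counting how many batches have touched center $c$, the learning rate is $\eta = 1/\texttt{update\_counter[c]}$ and

$$
c \leftarrow (1 - \eta)\, c + \eta\, \bar{x}_{\mathrm{batch}},
$$

where $\bar{x}_{\mathrm{batch}}$ is the mean of the batch members assigned to $c$, so centers move less as they accumulate evidence. The remaining methods match the plain KMeans class: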
```ruby
      # Predict cluster labels for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        assign_cluster(x)
      end

      # Analyze clusters and assign samples to clusters.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def fit_predict(x)
        x = ::Rumale::Validation.check_convert_sample_array(x)

        fit(x).predict(x)
      end

      private

      def assign_cluster(x)
        distance_matrix = ::Rumale::PairwiseMetric.euclidean_distance(x, @cluster_centers)
        distance_matrix.min_index(axis: 1) - Numo::Int32[*0.step(distance_matrix.size - 1, @cluster_centers.shape[0])]
      end

      def init_cluster_centers(x, sub_rng)
        # random initialize
        n_samples = x.shape[0]
        rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
        @cluster_centers = x[rand_id, true].dup
        return unless @params[:init] == 'k-means++'

        # k-means++ initialize
        (1...@params[:n_clusters]).each do |n|
          distance_matrix = ::Rumale::PairwiseMetric.euclidean_distance(x, @cluster_centers[0...n, true])
          min_distances = distance_matrix.flatten[distance_matrix.min_index(axis: 1)]
          probs = min_distances**2 / (min_distances**2).sum
          cum_probs = probs.cumsum
          selected_id = cum_probs.gt(sub_rng.rand).where.to_a.first
          @cluster_centers[n, true] = x[selected_id, true].dup
        end
      end
    end
  end
end
```
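
To close, a minimal end-to-end sketch for the mini-batch variant (toy data invented for illustration):

```ruby
require 'numo/narray'
require 'rumale/clustering/mini_batch_k_means'

samples = Numo::DFloat.new(1000, 16).rand

analyzer = Rumale::Clustering::MiniBatchKMeans.new(
  n_clusters: 10, max_iter: 50, batch_size: 50, random_seed: 1
)
labels = analyzer.fit_predict(samples)

# New points can be assigned to the learned centroids without refitting.
new_labels = analyzer.predict(Numo::DFloat.new(5, 16).rand)
```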