kmeans-clusterer 0.9.0 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/kmeans-clusterer.rb +43 -66
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 873f13d79f2d199400d1e359aca6e56c74d68ac0
|
4
|
+
data.tar.gz: a4d1f06a71b2e8289c60badeb93fb007fee06af6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1e160f4bbe512e7aac37c2edcbcc63c1f103e51ae4c441a0c2f2e6cc57783344eddbcd31f30ea1a9a32dcc979250fb5bab6e1faba7a1d686612362d4aa798129
|
7
|
+
data.tar.gz: 405a6ab16edcb0fcdd958c9963446a44bd2954be9cdb1e95dc728bf7d10e18536334c693ca38f00b956d51358b3ab105aa2a9a3f68c862ab99914e4e4ee894e4
|
data/lib/kmeans-clusterer.rb
CHANGED
@@ -21,13 +21,18 @@ class KMeansClusterer
|
|
21
21
|
data = (data - mean) / std
|
22
22
|
[NMatrix.ref(data), mean, std]
|
23
23
|
end
|
24
|
+
|
25
|
+
def self.row_norms data
|
26
|
+
squared_data = NArray.ref(data)**2
|
27
|
+
NMatrix.ref(squared_data).sum(0)
|
28
|
+
end
|
24
29
|
end
|
25
30
|
|
26
31
|
module Distance
|
27
32
|
def self.euclidean x, y, yy = nil
|
28
33
|
if x.is_a?(NMatrix) && y.is_a?(NMatrix)
|
29
|
-
xx =
|
30
|
-
yy ||=
|
34
|
+
xx = Scaler.row_norms(x)
|
35
|
+
yy ||= Scaler.row_norms(y)
|
31
36
|
xy = x * y.transpose
|
32
37
|
distance = xy * -2
|
33
38
|
distance += xx
|
@@ -93,7 +98,7 @@ class KMeansClusterer
|
|
93
98
|
end
|
94
99
|
|
95
100
|
|
96
|
-
DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double }
|
101
|
+
DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double, max_iter: 300 }
|
97
102
|
|
98
103
|
def self.run k, data, opts = {}
|
99
104
|
opts = DEFAULT_OPTS.merge(opts)
|
@@ -112,7 +117,7 @@ class KMeansClusterer
|
|
112
117
|
end
|
113
118
|
|
114
119
|
opts[:points_matrix] = data
|
115
|
-
opts[:row_norms] =
|
120
|
+
opts[:row_norms] = Scaler.row_norms(data)
|
116
121
|
|
117
122
|
bestrun = nil
|
118
123
|
|
@@ -147,6 +152,7 @@ class KMeansClusterer
|
|
147
152
|
@std = opts[:std]
|
148
153
|
@scale_data = opts[:scale_data]
|
149
154
|
@typecode = opts[:typecode]
|
155
|
+
@max_iter = opts[:max_iter]
|
150
156
|
|
151
157
|
init_centroids
|
152
158
|
end
|
@@ -154,55 +160,60 @@ class KMeansClusterer
|
|
154
160
|
def run
|
155
161
|
start_time = Time.now
|
156
162
|
@iterations, @runtime = 0, 0
|
157
|
-
|
158
|
-
|
163
|
+
@cluster_assigns = NArray.int(@points_count)
|
164
|
+
min_distances = NArray.new(@typecode, @points_count)
|
159
165
|
|
160
166
|
loop do
|
161
167
|
@iterations +=1
|
162
168
|
|
169
|
+
min_distances.fill! Float::INFINITY
|
163
170
|
distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
|
164
171
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
@
|
172
|
+
@k.times do |cluster_id|
|
173
|
+
dist = NArray.ref distances[true, cluster_id].flatten
|
174
|
+
mask = dist < min_distances
|
175
|
+
@cluster_assigns[mask] = cluster_id
|
176
|
+
min_distances[mask] = dist[mask]
|
169
177
|
end
|
170
178
|
|
171
|
-
|
172
|
-
updated_centroids = []
|
179
|
+
max_move = 0
|
173
180
|
|
174
|
-
@k.times do |
|
175
|
-
centroid = NArray.ref(@centroids[true,
|
176
|
-
point_ids = @
|
181
|
+
@k.times do |cluster_id|
|
182
|
+
centroid = NArray.ref(@centroids[true, cluster_id].flatten)
|
183
|
+
point_ids = @cluster_assigns.eq(cluster_id).where
|
177
184
|
|
178
|
-
|
179
|
-
newcenter = centroid
|
180
|
-
moves << 0
|
181
|
-
else
|
185
|
+
unless point_ids.empty?
|
182
186
|
points = @points_matrix[true, point_ids]
|
183
187
|
newcenter = points.mean(1)
|
184
|
-
|
188
|
+
move = Distance.euclidean(centroid, newcenter)
|
189
|
+
max_move = move if move > max_move
|
190
|
+
@centroids[true, cluster_id] = newcenter
|
185
191
|
end
|
186
|
-
|
187
|
-
updated_centroids << newcenter
|
188
192
|
end
|
189
193
|
|
190
|
-
|
191
|
-
|
192
|
-
break if moves.max < 0.001 # i.e., no movement
|
193
|
-
break if @iterations >= 300
|
194
|
-
|
195
|
-
@cluster_point_ids = Array.new(@k) { [] }
|
194
|
+
break if max_move < 0.001 # i.e., no movement
|
195
|
+
break if @iterations >= @max_iter
|
196
196
|
end
|
197
197
|
|
198
|
-
@error =
|
198
|
+
@error = (min_distances**2).sum
|
199
199
|
@runtime = Time.now - start_time
|
200
200
|
self
|
201
201
|
end
|
202
202
|
|
203
203
|
def finish
|
204
|
-
|
205
|
-
|
204
|
+
@clusters = @k.times.map do |i|
|
205
|
+
centroid = NArray.ref @centroids[true, i].flatten
|
206
|
+
Cluster.new i, Point.new(-i, centroid)
|
207
|
+
end
|
208
|
+
|
209
|
+
@points = @points_count.times.map do |i|
|
210
|
+
data = NArray.ref @points_matrix[true, i].flatten
|
211
|
+
point = Point.new(i, data, @labels[i])
|
212
|
+
cluster = @clusters[@cluster_assigns[i]]
|
213
|
+
cluster.points << point
|
214
|
+
point
|
215
|
+
end
|
216
|
+
|
206
217
|
self
|
207
218
|
end
|
208
219
|
|
@@ -305,40 +316,6 @@ class KMeansClusterer
|
|
305
316
|
@points_count.times.to_a.sample @k
|
306
317
|
end
|
307
318
|
|
308
|
-
def set_points
|
309
|
-
@points = @points_count.times.map do |i|
|
310
|
-
data = NArray.ref @points_matrix[true, i].flatten
|
311
|
-
Point.new(i, data, @labels[i])
|
312
|
-
end
|
313
|
-
end
|
314
|
-
|
315
|
-
def set_clusters
|
316
|
-
@clusters = @k.times.map do |i|
|
317
|
-
centroid = NArray.ref @centroids[true, i].flatten
|
318
|
-
c = Cluster.new i, Point.new(-i, centroid)
|
319
|
-
@cluster_point_ids[i].each do |p|
|
320
|
-
c << @points[p]
|
321
|
-
end
|
322
|
-
c
|
323
|
-
end
|
324
|
-
end
|
325
|
-
|
326
|
-
def calculate_error
|
327
|
-
errors = @k.times.map do |i|
|
328
|
-
centroid = get_centroid i
|
329
|
-
points = get_points_for_centroid i
|
330
|
-
|
331
|
-
if points.empty?
|
332
|
-
0
|
333
|
-
else
|
334
|
-
distances = Distance.euclidean points, centroid
|
335
|
-
(distances**2).sum
|
336
|
-
end
|
337
|
-
end
|
338
|
-
|
339
|
-
errors.reduce(:+)
|
340
|
-
end
|
341
|
-
|
342
319
|
def get_point i
|
343
320
|
NArray.ref @points_matrix[true, i].flatten
|
344
321
|
end
|
@@ -348,7 +325,7 @@ class KMeansClusterer
|
|
348
325
|
end
|
349
326
|
|
350
327
|
def get_points_for_centroid i
|
351
|
-
point_ids = @
|
328
|
+
point_ids = @cluster_assigns.eq(i).where
|
352
329
|
points = @points_matrix[true, point_ids]
|
353
330
|
points.empty? ? NArray.sfloat(0) : NArray.ref(points)
|
354
331
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kmeans-clusterer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.0
|
4
|
+
version: 0.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Geoff Buesing
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: narray
|