kmeans-clusterer 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kmeans-clusterer.rb +43 -66
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 873f13d79f2d199400d1e359aca6e56c74d68ac0
|
4
|
+
data.tar.gz: a4d1f06a71b2e8289c60badeb93fb007fee06af6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1e160f4bbe512e7aac37c2edcbcc63c1f103e51ae4c441a0c2f2e6cc57783344eddbcd31f30ea1a9a32dcc979250fb5bab6e1faba7a1d686612362d4aa798129
|
7
|
+
data.tar.gz: 405a6ab16edcb0fcdd958c9963446a44bd2954be9cdb1e95dc728bf7d10e18536334c693ca38f00b956d51358b3ab105aa2a9a3f68c862ab99914e4e4ee894e4
|
data/lib/kmeans-clusterer.rb
CHANGED
@@ -21,13 +21,18 @@ class KMeansClusterer
|
|
21
21
|
data = (data - mean) / std
|
22
22
|
[NMatrix.ref(data), mean, std]
|
23
23
|
end
|
24
|
+
|
25
|
+
def self.row_norms data
|
26
|
+
squared_data = NArray.ref(data)**2
|
27
|
+
NMatrix.ref(squared_data).sum(0)
|
28
|
+
end
|
24
29
|
end
|
25
30
|
|
26
31
|
module Distance
|
27
32
|
def self.euclidean x, y, yy = nil
|
28
33
|
if x.is_a?(NMatrix) && y.is_a?(NMatrix)
|
29
|
-
xx =
|
30
|
-
yy ||=
|
34
|
+
xx = Scaler.row_norms(x)
|
35
|
+
yy ||= Scaler.row_norms(y)
|
31
36
|
xy = x * y.transpose
|
32
37
|
distance = xy * -2
|
33
38
|
distance += xx
|
@@ -93,7 +98,7 @@ class KMeansClusterer
|
|
93
98
|
end
|
94
99
|
|
95
100
|
|
96
|
-
DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double }
|
101
|
+
DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double, max_iter: 300 }
|
97
102
|
|
98
103
|
def self.run k, data, opts = {}
|
99
104
|
opts = DEFAULT_OPTS.merge(opts)
|
@@ -112,7 +117,7 @@ class KMeansClusterer
|
|
112
117
|
end
|
113
118
|
|
114
119
|
opts[:points_matrix] = data
|
115
|
-
opts[:row_norms] =
|
120
|
+
opts[:row_norms] = Scaler.row_norms(data)
|
116
121
|
|
117
122
|
bestrun = nil
|
118
123
|
|
@@ -147,6 +152,7 @@ class KMeansClusterer
|
|
147
152
|
@std = opts[:std]
|
148
153
|
@scale_data = opts[:scale_data]
|
149
154
|
@typecode = opts[:typecode]
|
155
|
+
@max_iter = opts[:max_iter]
|
150
156
|
|
151
157
|
init_centroids
|
152
158
|
end
|
@@ -154,55 +160,60 @@ class KMeansClusterer
|
|
154
160
|
def run
|
155
161
|
start_time = Time.now
|
156
162
|
@iterations, @runtime = 0, 0
|
157
|
-
|
158
|
-
|
163
|
+
@cluster_assigns = NArray.int(@points_count)
|
164
|
+
min_distances = NArray.new(@typecode, @points_count)
|
159
165
|
|
160
166
|
loop do
|
161
167
|
@iterations +=1
|
162
168
|
|
169
|
+
min_distances.fill! Float::INFINITY
|
163
170
|
distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
|
164
171
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
@
|
172
|
+
@k.times do |cluster_id|
|
173
|
+
dist = NArray.ref distances[true, cluster_id].flatten
|
174
|
+
mask = dist < min_distances
|
175
|
+
@cluster_assigns[mask] = cluster_id
|
176
|
+
min_distances[mask] = dist[mask]
|
169
177
|
end
|
170
178
|
|
171
|
-
|
172
|
-
updated_centroids = []
|
179
|
+
max_move = 0
|
173
180
|
|
174
|
-
@k.times do |
|
175
|
-
centroid = NArray.ref(@centroids[true,
|
176
|
-
point_ids = @
|
181
|
+
@k.times do |cluster_id|
|
182
|
+
centroid = NArray.ref(@centroids[true, cluster_id].flatten)
|
183
|
+
point_ids = @cluster_assigns.eq(cluster_id).where
|
177
184
|
|
178
|
-
|
179
|
-
newcenter = centroid
|
180
|
-
moves << 0
|
181
|
-
else
|
185
|
+
unless point_ids.empty?
|
182
186
|
points = @points_matrix[true, point_ids]
|
183
187
|
newcenter = points.mean(1)
|
184
|
-
|
188
|
+
move = Distance.euclidean(centroid, newcenter)
|
189
|
+
max_move = move if move > max_move
|
190
|
+
@centroids[true, cluster_id] = newcenter
|
185
191
|
end
|
186
|
-
|
187
|
-
updated_centroids << newcenter
|
188
192
|
end
|
189
193
|
|
190
|
-
|
191
|
-
|
192
|
-
break if moves.max < 0.001 # i.e., no movement
|
193
|
-
break if @iterations >= 300
|
194
|
-
|
195
|
-
@cluster_point_ids = Array.new(@k) { [] }
|
194
|
+
break if max_move < 0.001 # i.e., no movement
|
195
|
+
break if @iterations >= @max_iter
|
196
196
|
end
|
197
197
|
|
198
|
-
@error =
|
198
|
+
@error = (min_distances**2).sum
|
199
199
|
@runtime = Time.now - start_time
|
200
200
|
self
|
201
201
|
end
|
202
202
|
|
203
203
|
def finish
|
204
|
-
|
205
|
-
|
204
|
+
@clusters = @k.times.map do |i|
|
205
|
+
centroid = NArray.ref @centroids[true, i].flatten
|
206
|
+
Cluster.new i, Point.new(-i, centroid)
|
207
|
+
end
|
208
|
+
|
209
|
+
@points = @points_count.times.map do |i|
|
210
|
+
data = NArray.ref @points_matrix[true, i].flatten
|
211
|
+
point = Point.new(i, data, @labels[i])
|
212
|
+
cluster = @clusters[@cluster_assigns[i]]
|
213
|
+
cluster.points << point
|
214
|
+
point
|
215
|
+
end
|
216
|
+
|
206
217
|
self
|
207
218
|
end
|
208
219
|
|
@@ -305,40 +316,6 @@ class KMeansClusterer
|
|
305
316
|
@points_count.times.to_a.sample @k
|
306
317
|
end
|
307
318
|
|
308
|
-
def set_points
|
309
|
-
@points = @points_count.times.map do |i|
|
310
|
-
data = NArray.ref @points_matrix[true, i].flatten
|
311
|
-
Point.new(i, data, @labels[i])
|
312
|
-
end
|
313
|
-
end
|
314
|
-
|
315
|
-
def set_clusters
|
316
|
-
@clusters = @k.times.map do |i|
|
317
|
-
centroid = NArray.ref @centroids[true, i].flatten
|
318
|
-
c = Cluster.new i, Point.new(-i, centroid)
|
319
|
-
@cluster_point_ids[i].each do |p|
|
320
|
-
c << @points[p]
|
321
|
-
end
|
322
|
-
c
|
323
|
-
end
|
324
|
-
end
|
325
|
-
|
326
|
-
def calculate_error
|
327
|
-
errors = @k.times.map do |i|
|
328
|
-
centroid = get_centroid i
|
329
|
-
points = get_points_for_centroid i
|
330
|
-
|
331
|
-
if points.empty?
|
332
|
-
0
|
333
|
-
else
|
334
|
-
distances = Distance.euclidean points, centroid
|
335
|
-
(distances**2).sum
|
336
|
-
end
|
337
|
-
end
|
338
|
-
|
339
|
-
errors.reduce(:+)
|
340
|
-
end
|
341
|
-
|
342
319
|
def get_point i
|
343
320
|
NArray.ref @points_matrix[true, i].flatten
|
344
321
|
end
|
@@ -348,7 +325,7 @@ class KMeansClusterer
|
|
348
325
|
end
|
349
326
|
|
350
327
|
def get_points_for_centroid i
|
351
|
-
point_ids = @
|
328
|
+
point_ids = @cluster_assigns.eq(i).where
|
352
329
|
points = @points_matrix[true, point_ids]
|
353
330
|
points.empty? ? NArray.sfloat(0) : NArray.ref(points)
|
354
331
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kmeans-clusterer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.0
|
4
|
+
version: 0.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Geoff Buesing
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: narray
|