kmeans-clusterer 0.10.0 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/kmeans-clusterer.rb +34 -33
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c2552e6a8ee7eddd6d03d8fa8cf7038c0458a11
|
4
|
+
data.tar.gz: 62172e9d841aa4df7b332a4e2cda55bf426bc9b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a33d17f4749b00e7ee8a0829e6a402db6eb768bd62f5dcb1e3ed9dcf3020b5ec074b3e5e38a817fb6d4a51ec13b8cf75f189a55b6a3e488738b4dc2828cbb878
|
7
|
+
data.tar.gz: c4a2e27da37141435583d1b09cbf4001f4b2c44a5b2900155e2444f45d9663edf0ccab6d4f80c50d05ad85ce31710a7d4558356bded561bf17199e7333c3e14d
|
data/lib/kmeans-clusterer.rb
CHANGED
@@ -46,12 +46,13 @@ class KMeansClusterer
|
|
46
46
|
|
47
47
|
|
48
48
|
class Point
|
49
|
-
attr_reader :id, :data
|
49
|
+
attr_reader :id, :data, :centroid_distances
|
50
50
|
attr_accessor :cluster, :label
|
51
51
|
|
52
|
-
def initialize id, data, label = nil
|
52
|
+
def initialize id, data, centroid_distances, label = nil
|
53
53
|
@id = id
|
54
54
|
@data = data
|
55
|
+
@centroid_distances = centroid_distances
|
55
56
|
@label = label
|
56
57
|
end
|
57
58
|
|
@@ -70,6 +71,10 @@ class KMeansClusterer
|
|
70
71
|
def dimension
|
71
72
|
@data.length
|
72
73
|
end
|
74
|
+
|
75
|
+
def centroid_distance
|
76
|
+
@centroid_distances[@cluster.id]
|
77
|
+
end
|
73
78
|
end
|
74
79
|
|
75
80
|
|
@@ -116,7 +121,7 @@ class KMeansClusterer
|
|
116
121
|
opts[:std] = std
|
117
122
|
end
|
118
123
|
|
119
|
-
opts[:points_matrix] = data
|
124
|
+
opts[:data] = data
|
120
125
|
opts[:row_norms] = Scaler.row_norms(data)
|
121
126
|
|
122
127
|
bestrun = nil
|
@@ -137,7 +142,7 @@ class KMeansClusterer
|
|
137
142
|
end
|
138
143
|
|
139
144
|
|
140
|
-
attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime
|
145
|
+
attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime, :distances, :data
|
141
146
|
|
142
147
|
|
143
148
|
def initialize opts = {}
|
@@ -146,8 +151,8 @@ class KMeansClusterer
|
|
146
151
|
@labels = opts[:labels] || []
|
147
152
|
@row_norms = opts[:row_norms]
|
148
153
|
|
149
|
-
@points_matrix = opts[:points_matrix]
|
150
|
-
@points_count = @points_matrix.shape[1] if @points_matrix
|
154
|
+
@data = opts[:data]
|
155
|
+
@points_count = @data.shape[1] if @data
|
151
156
|
@mean = opts[:mean]
|
152
157
|
@std = opts[:std]
|
153
158
|
@scale_data = opts[:scale_data]
|
@@ -167,10 +172,10 @@ class KMeansClusterer
|
|
167
172
|
@iterations +=1
|
168
173
|
|
169
174
|
min_distances.fill! Float::INFINITY
|
170
|
-
distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
|
175
|
+
@distances = Distance.euclidean(@centroids, @data, @row_norms)
|
171
176
|
|
172
177
|
@k.times do |cluster_id|
|
173
|
-
dist = NArray.ref distances[true, cluster_id].flatten
|
178
|
+
dist = NArray.ref @distances[true, cluster_id].flatten
|
174
179
|
mask = dist < min_distances
|
175
180
|
@cluster_assigns[mask] = cluster_id
|
176
181
|
min_distances[mask] = dist[mask]
|
@@ -183,7 +188,7 @@ class KMeansClusterer
|
|
183
188
|
point_ids = @cluster_assigns.eq(cluster_id).where
|
184
189
|
|
185
190
|
unless point_ids.empty?
|
186
|
-
points = @points_matrix[true, point_ids]
|
191
|
+
points = @data[true, point_ids]
|
187
192
|
newcenter = points.mean(1)
|
188
193
|
move = Distance.euclidean(centroid, newcenter)
|
189
194
|
max_move = move if move > max_move
|
@@ -203,17 +208,21 @@ class KMeansClusterer
|
|
203
208
|
def finish
|
204
209
|
@clusters = @k.times.map do |i|
|
205
210
|
centroid = NArray.ref @centroids[true, i].flatten
|
206
|
-
Cluster.new i, Point.new(-
|
211
|
+
Cluster.new i, Point.new(-1, centroid, nil, nil)
|
207
212
|
end
|
208
213
|
|
209
214
|
@points = @points_count.times.map do |i|
|
210
|
-
data = NArray.ref @points_matrix[true, i].flatten
|
211
|
-
point = Point.new(i, data, @labels[i])
|
215
|
+
data = NArray.ref @data[true, i].flatten
|
216
|
+
point = Point.new(i, data, @distances[i, true], @labels[i])
|
212
217
|
cluster = @clusters[@cluster_assigns[i]]
|
213
|
-
cluster
|
218
|
+
cluster << point
|
214
219
|
point
|
215
220
|
end
|
216
221
|
|
222
|
+
@clusters.each do |c|
|
223
|
+
c.points.sort_by! &:centroid_distance
|
224
|
+
end
|
225
|
+
|
217
226
|
self
|
218
227
|
end
|
219
228
|
|
@@ -236,17 +245,13 @@ class KMeansClusterer
|
|
236
245
|
def silhouette
|
237
246
|
return 1.0 if @k < 2
|
238
247
|
|
239
|
-
|
248
|
+
scores = @points.map do |point|
|
249
|
+
sort_index = point.centroid_distances.sort_index
|
250
|
+
c1_points = get_points_for_cluster sort_index[0]
|
251
|
+
c2_points = get_points_for_cluster sort_index[1]
|
240
252
|
|
241
|
-
|
242
|
-
|
243
|
-
cluster_indexes = distances[i, true].sort_index
|
244
|
-
|
245
|
-
c1_points = get_points_for_centroid cluster_indexes[0]
|
246
|
-
c2_points = get_points_for_centroid cluster_indexes[1]
|
247
|
-
|
248
|
-
a = dissimilarity(c1_points, point)
|
249
|
-
b = dissimilarity(c2_points, point)
|
253
|
+
a = dissimilarity(c1_points, point.data)
|
254
|
+
b = dissimilarity(c2_points, point.data)
|
250
255
|
(b - a) / [a,b].max
|
251
256
|
end
|
252
257
|
|
@@ -282,9 +287,9 @@ class KMeansClusterer
|
|
282
287
|
centroid_ids << pick
|
283
288
|
|
284
289
|
while centroid_ids.length < @k
|
285
|
-
centroids = @points_matrix[true, centroid_ids]
|
290
|
+
centroids = @data[true, centroid_ids]
|
286
291
|
|
287
|
-
distances = Distance.euclidean(centroids, @points_matrix, @row_norms)
|
292
|
+
distances = Distance.euclidean(centroids, @data, @row_norms)
|
288
293
|
|
289
294
|
d2 = []
|
290
295
|
@points_count.times do |i|
|
@@ -300,7 +305,7 @@ class KMeansClusterer
|
|
300
305
|
centroid_ids << pick
|
301
306
|
end
|
302
307
|
|
303
|
-
@centroids = @points_matrix[true, centroid_ids]
|
308
|
+
@centroids = @data[true, centroid_ids]
|
304
309
|
end
|
305
310
|
|
306
311
|
def custom_centroid_init
|
@@ -309,24 +314,20 @@ class KMeansClusterer
|
|
309
314
|
end
|
310
315
|
|
311
316
|
def random_centroid_init
|
312
|
-
@centroids = @points_matrix[true, pick_k_random_indexes]
|
317
|
+
@centroids = @data[true, pick_k_random_indexes]
|
313
318
|
end
|
314
319
|
|
315
320
|
def pick_k_random_indexes
|
316
321
|
@points_count.times.to_a.sample @k
|
317
322
|
end
|
318
323
|
|
319
|
-
def get_point i
|
320
|
-
NArray.ref @points_matrix[true, i].flatten
|
321
|
-
end
|
322
|
-
|
323
324
|
def get_centroid i
|
324
325
|
NArray.ref(@centroids[true, i].flatten)
|
325
326
|
end
|
326
327
|
|
327
|
-
def get_points_for_centroid i
|
328
|
+
def get_points_for_cluster i
|
328
329
|
point_ids = @cluster_assigns.eq(i).where
|
329
|
-
points = @points_matrix[true, point_ids]
|
330
|
+
points = @data[true, point_ids]
|
330
331
|
points.empty? ? NArray.sfloat(0) : NArray.ref(points)
|
331
332
|
end
|
332
333
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kmeans-clusterer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.0
|
4
|
+
version: 0.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Geoff Buesing
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: narray
|