kmeans-clusterer 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kmeans-clusterer.rb +34 -33
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c2552e6a8ee7eddd6d03d8fa8cf7038c0458a11
|
4
|
+
data.tar.gz: 62172e9d841aa4df7b332a4e2cda55bf426bc9b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a33d17f4749b00e7ee8a0829e6a402db6eb768bd62f5dcb1e3ed9dcf3020b5ec074b3e5e38a817fb6d4a51ec13b8cf75f189a55b6a3e488738b4dc2828cbb878
|
7
|
+
data.tar.gz: c4a2e27da37141435583d1b09cbf4001f4b2c44a5b2900155e2444f45d9663edf0ccab6d4f80c50d05ad85ce31710a7d4558356bded561bf17199e7333c3e14d
|
data/lib/kmeans-clusterer.rb
CHANGED
@@ -46,12 +46,13 @@ class KMeansClusterer
|
|
46
46
|
|
47
47
|
|
48
48
|
class Point
|
49
|
-
attr_reader :id, :data
|
49
|
+
attr_reader :id, :data, :centroid_distances
|
50
50
|
attr_accessor :cluster, :label
|
51
51
|
|
52
|
-
def initialize id, data, label = nil
|
52
|
+
def initialize id, data, centroid_distances, label = nil
|
53
53
|
@id = id
|
54
54
|
@data = data
|
55
|
+
@centroid_distances = centroid_distances
|
55
56
|
@label = label
|
56
57
|
end
|
57
58
|
|
@@ -70,6 +71,10 @@ class KMeansClusterer
|
|
70
71
|
def dimension
|
71
72
|
@data.length
|
72
73
|
end
|
74
|
+
|
75
|
+
def centroid_distance
|
76
|
+
@centroid_distances[@cluster.id]
|
77
|
+
end
|
73
78
|
end
|
74
79
|
|
75
80
|
|
@@ -116,7 +121,7 @@ class KMeansClusterer
|
|
116
121
|
opts[:std] = std
|
117
122
|
end
|
118
123
|
|
119
|
-
opts[:
|
124
|
+
opts[:data] = data
|
120
125
|
opts[:row_norms] = Scaler.row_norms(data)
|
121
126
|
|
122
127
|
bestrun = nil
|
@@ -137,7 +142,7 @@ class KMeansClusterer
|
|
137
142
|
end
|
138
143
|
|
139
144
|
|
140
|
-
attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime
|
145
|
+
attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime, :distances, :data
|
141
146
|
|
142
147
|
|
143
148
|
def initialize opts = {}
|
@@ -146,8 +151,8 @@ class KMeansClusterer
|
|
146
151
|
@labels = opts[:labels] || []
|
147
152
|
@row_norms = opts[:row_norms]
|
148
153
|
|
149
|
-
@
|
150
|
-
@points_count = @
|
154
|
+
@data = opts[:data]
|
155
|
+
@points_count = @data.shape[1] if @data
|
151
156
|
@mean = opts[:mean]
|
152
157
|
@std = opts[:std]
|
153
158
|
@scale_data = opts[:scale_data]
|
@@ -167,10 +172,10 @@ class KMeansClusterer
|
|
167
172
|
@iterations +=1
|
168
173
|
|
169
174
|
min_distances.fill! Float::INFINITY
|
170
|
-
distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
|
175
|
+
@distances = Distance.euclidean(@centroids, @data, @row_norms)
|
171
176
|
|
172
177
|
@k.times do |cluster_id|
|
173
|
-
dist = NArray.ref distances[true, cluster_id].flatten
|
178
|
+
dist = NArray.ref @distances[true, cluster_id].flatten
|
174
179
|
mask = dist < min_distances
|
175
180
|
@cluster_assigns[mask] = cluster_id
|
176
181
|
min_distances[mask] = dist[mask]
|
@@ -183,7 +188,7 @@ class KMeansClusterer
|
|
183
188
|
point_ids = @cluster_assigns.eq(cluster_id).where
|
184
189
|
|
185
190
|
unless point_ids.empty?
|
186
|
-
points = @points_matrix[true, point_ids]
|
191
|
+
points = @data[true, point_ids]
|
187
192
|
newcenter = points.mean(1)
|
188
193
|
move = Distance.euclidean(centroid, newcenter)
|
189
194
|
max_move = move if move > max_move
|
@@ -203,17 +208,21 @@ class KMeansClusterer
|
|
203
208
|
def finish
|
204
209
|
@clusters = @k.times.map do |i|
|
205
210
|
centroid = NArray.ref @centroids[true, i].flatten
|
206
|
-
Cluster.new i, Point.new(-
|
211
|
+
Cluster.new i, Point.new(-1, centroid, nil, nil)
|
207
212
|
end
|
208
213
|
|
209
214
|
@points = @points_count.times.map do |i|
|
210
|
-
data = NArray.ref @points_matrix[true, i].flatten
|
211
|
-
point = Point.new(i, data, @labels[i])
|
215
|
+
data = NArray.ref @data[true, i].flatten
|
216
|
+
point = Point.new(i, data, @distances[i, true], @labels[i])
|
212
217
|
cluster = @clusters[@cluster_assigns[i]]
|
213
|
-
cluster
|
218
|
+
cluster << point
|
214
219
|
point
|
215
220
|
end
|
216
221
|
|
222
|
+
@clusters.each do |c|
|
223
|
+
c.points.sort_by! &:centroid_distance
|
224
|
+
end
|
225
|
+
|
217
226
|
self
|
218
227
|
end
|
219
228
|
|
@@ -236,17 +245,13 @@ class KMeansClusterer
|
|
236
245
|
def silhouette
|
237
246
|
return 1.0 if @k < 2
|
238
247
|
|
239
|
-
|
248
|
+
scores = @points.map do |point|
|
249
|
+
sort_index = point.centroid_distances.sort_index
|
250
|
+
c1_points = get_points_for_cluster sort_index[0]
|
251
|
+
c2_points = get_points_for_cluster sort_index[1]
|
240
252
|
|
241
|
-
|
242
|
-
|
243
|
-
cluster_indexes = distances[i, true].sort_index
|
244
|
-
|
245
|
-
c1_points = get_points_for_centroid cluster_indexes[0]
|
246
|
-
c2_points = get_points_for_centroid cluster_indexes[1]
|
247
|
-
|
248
|
-
a = dissimilarity(c1_points, point)
|
249
|
-
b = dissimilarity(c2_points, point)
|
253
|
+
a = dissimilarity(c1_points, point.data)
|
254
|
+
b = dissimilarity(c2_points, point.data)
|
250
255
|
(b - a) / [a,b].max
|
251
256
|
end
|
252
257
|
|
@@ -282,9 +287,9 @@ class KMeansClusterer
|
|
282
287
|
centroid_ids << pick
|
283
288
|
|
284
289
|
while centroid_ids.length < @k
|
285
|
-
centroids = @points_matrix[true, centroid_ids]
|
290
|
+
centroids = @data[true, centroid_ids]
|
286
291
|
|
287
|
-
distances = Distance.euclidean(centroids, @points_matrix, @row_norms)
|
292
|
+
distances = Distance.euclidean(centroids, @data, @row_norms)
|
288
293
|
|
289
294
|
d2 = []
|
290
295
|
@points_count.times do |i|
|
@@ -300,7 +305,7 @@ class KMeansClusterer
|
|
300
305
|
centroid_ids << pick
|
301
306
|
end
|
302
307
|
|
303
|
-
@centroids = @points_matrix[true, centroid_ids]
|
308
|
+
@centroids = @data[true, centroid_ids]
|
304
309
|
end
|
305
310
|
|
306
311
|
def custom_centroid_init
|
@@ -309,24 +314,20 @@ class KMeansClusterer
|
|
309
314
|
end
|
310
315
|
|
311
316
|
def random_centroid_init
|
312
|
-
@centroids = @points_matrix[true, pick_k_random_indexes]
|
317
|
+
@centroids = @data[true, pick_k_random_indexes]
|
313
318
|
end
|
314
319
|
|
315
320
|
def pick_k_random_indexes
|
316
321
|
@points_count.times.to_a.sample @k
|
317
322
|
end
|
318
323
|
|
319
|
-
def get_point i
|
320
|
-
NArray.ref @points_matrix[true, i].flatten
|
321
|
-
end
|
322
|
-
|
323
324
|
def get_centroid i
|
324
325
|
NArray.ref(@centroids[true, i].flatten)
|
325
326
|
end
|
326
327
|
|
327
|
-
def get_points_for_centroid i
|
328
|
+
def get_points_for_cluster i
|
328
329
|
point_ids = @cluster_assigns.eq(i).where
|
329
|
-
points = @points_matrix[true, point_ids]
|
330
|
+
points = @data[true, point_ids]
|
330
331
|
points.empty? ? NArray.sfloat(0) : NArray.ref(points)
|
331
332
|
end
|
332
333
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kmeans-clusterer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.0
|
4
|
+
version: 0.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Geoff Buesing
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: narray
|