kmeans-clusterer 0.7.0 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/kmeans-clusterer.rb +38 -38
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 38086107da4ad5ec21f6daa33166b4d8542f3a59
|
4
|
+
data.tar.gz: f7fed19b6df6b6c9dbfb21548bc07bd1d997b56e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2e4603c448b30875ba49eb694974ca5e43c0f4270d9d7005608f9bad086eff1bab769f23bb92a8756a024f037a1cc03dd568cb02122d7841c3a28b2556571893
|
7
|
+
data.tar.gz: 40ed7fcf8c7db56d7abf50c072cb7b928c501951e3dbf8a3d2276508c4a5ce2bae2b2bb12159c54ce999cd322b721f12f3b1cf3b46f7903aabc79dbf03f8ada4
|
data/lib/kmeans-clusterer.rb
CHANGED
@@ -23,6 +23,22 @@ class KMeansClusterer
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
+
module Distance
|
27
|
+
def self.euclidean x, y, yy = nil
|
28
|
+
if x.is_a?(NMatrix) && y.is_a?(NMatrix)
|
29
|
+
xx = x.map {|v| v**2}.sum(0)
|
30
|
+
yy ||= y.map {|v| v**2}.sum(0)
|
31
|
+
xy = x * y.transpose
|
32
|
+
distance = xy * -2
|
33
|
+
distance += xx
|
34
|
+
distance += yy.transpose
|
35
|
+
NMath.sqrt distance
|
36
|
+
else
|
37
|
+
NMath.sqrt ((x - y)**2).sum(0)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
26
42
|
|
27
43
|
class Point
|
28
44
|
attr_reader :id, :data
|
@@ -66,6 +82,14 @@ class KMeansClusterer
|
|
66
82
|
point.cluster = self
|
67
83
|
@points << point
|
68
84
|
end
|
85
|
+
|
86
|
+
def sorted_points point = @centroid
|
87
|
+
point = point.data if point.is_a?(Point)
|
88
|
+
point = NArray.cast(point, @centroid.typecode) unless point.is_a?(NArray)
|
89
|
+
points_data = NArray.cast(@points.map(&:data))
|
90
|
+
distances = Distance.euclidean(points_data, point)
|
91
|
+
@points.sort_by.with_index {|p, i| distances[i] }
|
92
|
+
end
|
69
93
|
end
|
70
94
|
|
71
95
|
|
@@ -108,7 +132,7 @@ class KMeansClusterer
|
|
108
132
|
end
|
109
133
|
|
110
134
|
|
111
|
-
attr_reader :k, :points, :clusters, :error, :mean, :std, :iterations, :runtime
|
135
|
+
attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime
|
112
136
|
|
113
137
|
|
114
138
|
def initialize opts = {}
|
@@ -136,7 +160,7 @@ class KMeansClusterer
|
|
136
160
|
loop do
|
137
161
|
@iterations +=1
|
138
162
|
|
139
|
-
distances =
|
163
|
+
distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
|
140
164
|
|
141
165
|
# assign point ids to @cluster_point_ids
|
142
166
|
@points_count.times do |i|
|
@@ -157,7 +181,7 @@ class KMeansClusterer
|
|
157
181
|
else
|
158
182
|
points = @points_matrix[true, point_ids]
|
159
183
|
newcenter = points.mean(1)
|
160
|
-
moves <<
|
184
|
+
moves << Distance.euclidean(centroid, newcenter)
|
161
185
|
end
|
162
186
|
|
163
187
|
updated_centroids << newcenter
|
@@ -185,27 +209,23 @@ class KMeansClusterer
|
|
185
209
|
def predict data
|
186
210
|
data = NMatrix.cast(data, @typecode)
|
187
211
|
data, _m, _s = Scaler.scale(data, @mean, @std, @typecode) if @scale_data
|
188
|
-
distances =
|
212
|
+
distances = Distance.euclidean(@centroids, data)
|
189
213
|
data.shape[1].times.map do |i|
|
190
214
|
distances[i, true].sort_index[0] # index of closest cluster
|
191
215
|
end
|
192
216
|
end
|
193
217
|
|
194
218
|
def sorted_clusters point = origin
|
195
|
-
point =
|
196
|
-
|
197
|
-
distances =
|
219
|
+
point = point.data if point.is_a?(Point)
|
220
|
+
point = NArray.cast(point, @typecode) unless point.is_a?(NArray)
|
221
|
+
distances = Distance.euclidean(NArray.ref(@centroids), point)
|
198
222
|
@clusters.sort_by.with_index {|c, i| distances[i] }
|
199
223
|
end
|
200
224
|
|
201
|
-
def origin
|
202
|
-
wrap_point Array.new(@points[0].dimension, 0)
|
203
|
-
end
|
204
|
-
|
205
225
|
def silhouette
|
206
226
|
return 1.0 if @k < 2
|
207
227
|
|
208
|
-
distances =
|
228
|
+
distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
|
209
229
|
|
210
230
|
scores = @points_count.times.map do |i|
|
211
231
|
point = get_point i
|
@@ -222,20 +242,14 @@ class KMeansClusterer
|
|
222
242
|
scores.reduce(:+) / scores.length # mean score for all points
|
223
243
|
end
|
224
244
|
|
225
|
-
alias_method :silhouette_score, :silhouette
|
226
|
-
|
227
245
|
def inspect
|
228
246
|
%{#<#{self.class.name} k:#{@k} iterations:#{@iterations} error:#{@error} runtime:#{@runtime}>}
|
229
247
|
end
|
230
248
|
|
231
249
|
private
|
232
|
-
def wrap_point point
|
233
|
-
return point if point.is_a?(Point)
|
234
|
-
Point.new(0, NArray.cast(point, @typecode))
|
235
|
-
end
|
236
250
|
|
237
251
|
def dissimilarity points, point
|
238
|
-
distances =
|
252
|
+
distances = Distance.euclidean points, point
|
239
253
|
distances.sum / distances.length.to_f
|
240
254
|
end
|
241
255
|
|
@@ -259,7 +273,7 @@ class KMeansClusterer
|
|
259
273
|
while centroid_ids.length < @k
|
260
274
|
centroids = @points_matrix[true, centroid_ids]
|
261
275
|
|
262
|
-
distances =
|
276
|
+
distances = Distance.euclidean(centroids, @points_matrix, @row_norms)
|
263
277
|
|
264
278
|
d2 = []
|
265
279
|
@points_count.times do |i|
|
@@ -288,11 +302,7 @@ class KMeansClusterer
|
|
288
302
|
end
|
289
303
|
|
290
304
|
def pick_k_random_indexes
|
291
|
-
@points_count.times.to_a.
|
292
|
-
end
|
293
|
-
|
294
|
-
def get_cluster_centroids
|
295
|
-
NArray.to_na @clusters.map {|c| c.centroid.data }
|
305
|
+
@points_count.times.to_a.sample @k
|
296
306
|
end
|
297
307
|
|
298
308
|
def set_points
|
@@ -321,7 +331,7 @@ class KMeansClusterer
|
|
321
331
|
if points.empty?
|
322
332
|
0
|
323
333
|
else
|
324
|
-
distances =
|
334
|
+
distances = Distance.euclidean points, centroid
|
325
335
|
(distances**2).sum
|
326
336
|
end
|
327
337
|
end
|
@@ -343,17 +353,7 @@ class KMeansClusterer
|
|
343
353
|
points.empty? ? NArray.sfloat(0) : NArray.ref(points)
|
344
354
|
end
|
345
355
|
|
346
|
-
def
|
347
|
-
|
348
|
-
xx = x.map {|v| v**2}.sum(0)
|
349
|
-
yy ||= y.map {|v| v**2}.sum(0)
|
350
|
-
xy = x * y.transpose
|
351
|
-
distance = xy * -2
|
352
|
-
distance += xx
|
353
|
-
distance += yy.transpose
|
354
|
-
NMath.sqrt distance
|
355
|
-
else
|
356
|
-
NMath.sqrt ((x - y)**2).sum(0)
|
357
|
-
end
|
356
|
+
def origin
|
357
|
+
Array.new(@points[0].dimension, 0)
|
358
358
|
end
|
359
359
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kmeans-clusterer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.0
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Geoff Buesing
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-02-
|
11
|
+
date: 2015-02-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: narray
|