kmeans-clusterer 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kmeans-clusterer.rb +38 -38
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 38086107da4ad5ec21f6daa33166b4d8542f3a59
|
4
|
+
data.tar.gz: f7fed19b6df6b6c9dbfb21548bc07bd1d997b56e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2e4603c448b30875ba49eb694974ca5e43c0f4270d9d7005608f9bad086eff1bab769f23bb92a8756a024f037a1cc03dd568cb02122d7841c3a28b2556571893
|
7
|
+
data.tar.gz: 40ed7fcf8c7db56d7abf50c072cb7b928c501951e3dbf8a3d2276508c4a5ce2bae2b2bb12159c54ce999cd322b721f12f3b1cf3b46f7903aabc79dbf03f8ada4
|
data/lib/kmeans-clusterer.rb
CHANGED
@@ -23,6 +23,22 @@ class KMeansClusterer
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
+
module Distance
|
27
|
+
def self.euclidean x, y, yy = nil
|
28
|
+
if x.is_a?(NMatrix) && y.is_a?(NMatrix)
|
29
|
+
xx = x.map {|v| v**2}.sum(0)
|
30
|
+
yy ||= y.map {|v| v**2}.sum(0)
|
31
|
+
xy = x * y.transpose
|
32
|
+
distance = xy * -2
|
33
|
+
distance += xx
|
34
|
+
distance += yy.transpose
|
35
|
+
NMath.sqrt distance
|
36
|
+
else
|
37
|
+
NMath.sqrt ((x - y)**2).sum(0)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
26
42
|
|
27
43
|
class Point
|
28
44
|
attr_reader :id, :data
|
@@ -66,6 +82,14 @@ class KMeansClusterer
|
|
66
82
|
point.cluster = self
|
67
83
|
@points << point
|
68
84
|
end
|
85
|
+
|
86
|
+
def sorted_points point = @centroid
|
87
|
+
point = point.data if point.is_a?(Point)
|
88
|
+
point = NArray.cast(point, @centroid.typecode) unless point.is_a?(NArray)
|
89
|
+
points_data = NArray.cast(@points.map(&:data))
|
90
|
+
distances = Distance.euclidean(points_data, point)
|
91
|
+
@points.sort_by.with_index {|p, i| distances[i] }
|
92
|
+
end
|
69
93
|
end
|
70
94
|
|
71
95
|
|
@@ -108,7 +132,7 @@ class KMeansClusterer
|
|
108
132
|
end
|
109
133
|
|
110
134
|
|
111
|
-
attr_reader :k, :points, :clusters, :error, :mean, :std, :iterations, :runtime
|
135
|
+
attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime
|
112
136
|
|
113
137
|
|
114
138
|
def initialize opts = {}
|
@@ -136,7 +160,7 @@ class KMeansClusterer
|
|
136
160
|
loop do
|
137
161
|
@iterations +=1
|
138
162
|
|
139
|
-
distances =
|
163
|
+
distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
|
140
164
|
|
141
165
|
# assign point ids to @cluster_point_ids
|
142
166
|
@points_count.times do |i|
|
@@ -157,7 +181,7 @@ class KMeansClusterer
|
|
157
181
|
else
|
158
182
|
points = @points_matrix[true, point_ids]
|
159
183
|
newcenter = points.mean(1)
|
160
|
-
moves <<
|
184
|
+
moves << Distance.euclidean(centroid, newcenter)
|
161
185
|
end
|
162
186
|
|
163
187
|
updated_centroids << newcenter
|
@@ -185,27 +209,23 @@ class KMeansClusterer
|
|
185
209
|
def predict data
|
186
210
|
data = NMatrix.cast(data, @typecode)
|
187
211
|
data, _m, _s = Scaler.scale(data, @mean, @std, @typecode) if @scale_data
|
188
|
-
distances =
|
212
|
+
distances = Distance.euclidean(@centroids, data)
|
189
213
|
data.shape[1].times.map do |i|
|
190
214
|
distances[i, true].sort_index[0] # index of closest cluster
|
191
215
|
end
|
192
216
|
end
|
193
217
|
|
194
218
|
def sorted_clusters point = origin
|
195
|
-
point =
|
196
|
-
|
197
|
-
distances =
|
219
|
+
point = point.data if point.is_a?(Point)
|
220
|
+
point = NArray.cast(point, @typecode) unless point.is_a?(NArray)
|
221
|
+
distances = Distance.euclidean(NArray.ref(@centroids), point)
|
198
222
|
@clusters.sort_by.with_index {|c, i| distances[i] }
|
199
223
|
end
|
200
224
|
|
201
|
-
def origin
|
202
|
-
wrap_point Array.new(@points[0].dimension, 0)
|
203
|
-
end
|
204
|
-
|
205
225
|
def silhouette
|
206
226
|
return 1.0 if @k < 2
|
207
227
|
|
208
|
-
distances =
|
228
|
+
distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
|
209
229
|
|
210
230
|
scores = @points_count.times.map do |i|
|
211
231
|
point = get_point i
|
@@ -222,20 +242,14 @@ class KMeansClusterer
|
|
222
242
|
scores.reduce(:+) / scores.length # mean score for all points
|
223
243
|
end
|
224
244
|
|
225
|
-
alias_method :silhouette_score, :silhouette
|
226
|
-
|
227
245
|
def inspect
|
228
246
|
%{#<#{self.class.name} k:#{@k} iterations:#{@iterations} error:#{@error} runtime:#{@runtime}>}
|
229
247
|
end
|
230
248
|
|
231
249
|
private
|
232
|
-
def wrap_point point
|
233
|
-
return point if point.is_a?(Point)
|
234
|
-
Point.new(0, NArray.cast(point, @typecode))
|
235
|
-
end
|
236
250
|
|
237
251
|
def dissimilarity points, point
|
238
|
-
distances =
|
252
|
+
distances = Distance.euclidean points, point
|
239
253
|
distances.sum / distances.length.to_f
|
240
254
|
end
|
241
255
|
|
@@ -259,7 +273,7 @@ class KMeansClusterer
|
|
259
273
|
while centroid_ids.length < @k
|
260
274
|
centroids = @points_matrix[true, centroid_ids]
|
261
275
|
|
262
|
-
distances =
|
276
|
+
distances = Distance.euclidean(centroids, @points_matrix, @row_norms)
|
263
277
|
|
264
278
|
d2 = []
|
265
279
|
@points_count.times do |i|
|
@@ -288,11 +302,7 @@ class KMeansClusterer
|
|
288
302
|
end
|
289
303
|
|
290
304
|
def pick_k_random_indexes
|
291
|
-
@points_count.times.to_a.
|
292
|
-
end
|
293
|
-
|
294
|
-
def get_cluster_centroids
|
295
|
-
NArray.to_na @clusters.map {|c| c.centroid.data }
|
305
|
+
@points_count.times.to_a.sample @k
|
296
306
|
end
|
297
307
|
|
298
308
|
def set_points
|
@@ -321,7 +331,7 @@ class KMeansClusterer
|
|
321
331
|
if points.empty?
|
322
332
|
0
|
323
333
|
else
|
324
|
-
distances =
|
334
|
+
distances = Distance.euclidean points, centroid
|
325
335
|
(distances**2).sum
|
326
336
|
end
|
327
337
|
end
|
@@ -343,17 +353,7 @@ class KMeansClusterer
|
|
343
353
|
points.empty? ? NArray.sfloat(0) : NArray.ref(points)
|
344
354
|
end
|
345
355
|
|
346
|
-
def
|
347
|
-
|
348
|
-
xx = x.map {|v| v**2}.sum(0)
|
349
|
-
yy ||= y.map {|v| v**2}.sum(0)
|
350
|
-
xy = x * y.transpose
|
351
|
-
distance = xy * -2
|
352
|
-
distance += xx
|
353
|
-
distance += yy.transpose
|
354
|
-
NMath.sqrt distance
|
355
|
-
else
|
356
|
-
NMath.sqrt ((x - y)**2).sum(0)
|
357
|
-
end
|
356
|
+
def origin
|
357
|
+
Array.new(@points[0].dimension, 0)
|
358
358
|
end
|
359
359
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kmeans-clusterer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.0
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Geoff Buesing
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-02-
|
11
|
+
date: 2015-02-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: narray
|