kmeans-clusterer 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/kmeans-clusterer.rb +206 -146
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 83e84b661282a532269410c4bf91d98f7831cc45
|
4
|
+
data.tar.gz: 2ec3bf6b111d67e4f5beced25b088b59592e3cdf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7735bf8e71c4fa4300793e40a299b7a4af267821860092b2dd21db16cfc407bc0b2e6a9b99c37846ffbd02b76976242d9719bfa92e244455be14208b1934d21f
|
7
|
+
data.tar.gz: 54f7bf8afff09caf185dbf95c8baeb4ea471e4496b29967ad88b3fc377e0063803c7810295825fe581fd744048964b592197bcfea6d2aa1178133d39e2937b0b
|
data/lib/kmeans-clusterer.rb
CHANGED
@@ -1,17 +1,34 @@
|
|
1
1
|
require 'narray'
|
2
2
|
|
3
3
|
class KMeansClusterer
|
4
|
+
module Scaler
|
5
|
+
def self.mean data
|
6
|
+
data.mean(1)
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.std data
|
10
|
+
std = data.rmsdev(1)
|
11
|
+
std[std.eq(0)] = 1.0 # so we don't divide by 0
|
12
|
+
std
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.scale data, mean = nil, std = nil
|
16
|
+
data = NArray.cast(data, NArray::DFLOAT)
|
17
|
+
mean ||= self.mean(data)
|
18
|
+
std ||= self.std(data)
|
19
|
+
data = (data - mean) / std
|
20
|
+
[data, mean, std]
|
21
|
+
end
|
22
|
+
end
|
4
23
|
|
5
|
-
# Euclidean distance function. Requires instances of NArray as args
|
6
|
-
Distance = -> (a, b) { NMath.sqrt ((a - b)**2).sum(0) }
|
7
|
-
CalculateCentroid = -> (a) { a.mean(1) }
|
8
24
|
|
9
25
|
class Point
|
10
|
-
attr_reader :data
|
26
|
+
attr_reader :id, :data
|
11
27
|
attr_accessor :cluster, :label
|
12
28
|
|
13
|
-
def initialize data, label = nil
|
14
|
-
@
|
29
|
+
def initialize id, data, label = nil
|
30
|
+
@id = id
|
31
|
+
@data = data
|
15
32
|
@label = label
|
16
33
|
end
|
17
34
|
|
@@ -34,175 +51,161 @@ class KMeansClusterer
|
|
34
51
|
|
35
52
|
|
36
53
|
class Cluster
|
37
|
-
attr_reader :centroid, :points
|
54
|
+
attr_reader :id, :centroid, :points
|
38
55
|
attr_accessor :label
|
39
56
|
|
40
|
-
def initialize
|
57
|
+
def initialize id, centroid
|
58
|
+
@id = id
|
41
59
|
@centroid = centroid
|
42
|
-
@label = label
|
43
60
|
@points = []
|
44
61
|
end
|
45
62
|
|
46
|
-
def recenter
|
47
|
-
if @points.empty?
|
48
|
-
0
|
49
|
-
else
|
50
|
-
old_centroid = @centroid
|
51
|
-
@centroid = calculate_centroid_from_points
|
52
|
-
Distance.call @centroid.data, old_centroid.data
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
63
|
def << point
|
57
64
|
point.cluster = self
|
58
65
|
@points << point
|
59
66
|
end
|
60
67
|
|
61
|
-
def
|
62
|
-
@points
|
63
|
-
end
|
64
|
-
|
65
|
-
def sorted_points
|
66
|
-
distances = Distance.call points_narray, centroid.data
|
67
|
-
@points.sort_by.with_index {|c, i| distances[i] }
|
68
|
-
end
|
69
|
-
|
70
|
-
def sum_of_squares_error
|
71
|
-
if @points.empty?
|
72
|
-
0
|
73
|
-
else
|
74
|
-
distances = Distance.call points_narray, centroid.data
|
75
|
-
(distances**2).sum
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
def sum_of_distances
|
80
|
-
return 0 if @points.empty?
|
81
|
-
Distance.call(points_narray, centroid.data).sum
|
68
|
+
def points_narray
|
69
|
+
NArray.cast @points.map(&:data)
|
82
70
|
end
|
83
|
-
|
84
|
-
def dissimilarity point
|
85
|
-
distances = Distance.call points_narray, point.data
|
86
|
-
distances.sum / distances.length.to_f
|
87
|
-
end
|
88
|
-
|
89
|
-
private
|
90
|
-
def calculate_centroid_from_points
|
91
|
-
data = CalculateCentroid.call points_narray
|
92
|
-
Point.new data
|
93
|
-
end
|
94
|
-
|
95
|
-
def points_narray
|
96
|
-
NArray.to_na @points.map(&:data)
|
97
|
-
end
|
98
71
|
end
|
99
72
|
|
100
73
|
|
74
|
+
DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp}
|
75
|
+
|
101
76
|
def self.run k, data, opts = {}
|
102
|
-
|
77
|
+
opts = DEFAULT_OPTS.merge(opts)
|
103
78
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
data
|
79
|
+
opts[:k] = k
|
80
|
+
|
81
|
+
if opts[:scale_data]
|
82
|
+
data, mean, std = Scaler.scale(data)
|
83
|
+
opts[:mean] = mean
|
84
|
+
opts[:std] = std
|
108
85
|
end
|
109
86
|
|
110
|
-
|
111
|
-
|
87
|
+
opts[:points_matrix] = NMatrix.cast(data, NArray::DFLOAT)
|
88
|
+
opts[:row_norms] = opts[:points_matrix].map {|v| v**2}.sum(0)
|
112
89
|
|
113
|
-
runs =
|
114
|
-
km = new(
|
115
|
-
error = km.error
|
90
|
+
runs = opts[:runs].times.map do |i|
|
91
|
+
km = new(opts).run
|
116
92
|
if opts[:log]
|
117
|
-
puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{error.round(2)} err"
|
93
|
+
puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{km.error.round(2)} err"
|
118
94
|
end
|
119
|
-
errors << error
|
120
95
|
km
|
121
96
|
end
|
122
97
|
|
123
|
-
runs.sort_by
|
98
|
+
runs.sort_by {|run| run.error }.first.finish
|
124
99
|
end
|
125
100
|
|
126
|
-
# see scikit-learn scale and _mean_and_std methods
|
127
|
-
def self.scale_data data
|
128
|
-
nadata = NArray.to_na(data).to_f
|
129
|
-
mean = nadata.mean(1)
|
130
|
-
std = nadata.rmsdev(1)
|
131
|
-
std[std.eq(0)] = 1.0 # so we don't divide by 0
|
132
|
-
nadata = (nadata - mean) / std
|
133
|
-
# convert back to an array, containing NArrays for each row
|
134
|
-
data.length.times.map {|i| nadata[true, i] }
|
135
|
-
end
|
136
101
|
|
102
|
+
attr_reader :k, :points, :clusters, :error, :iterations, :runtime
|
137
103
|
|
138
|
-
attr_reader :k, :points, :clusters, :iterations, :runtime
|
139
104
|
|
105
|
+
def initialize opts = {}
|
106
|
+
@k = opts[:k]
|
107
|
+
@init = opts[:init]
|
108
|
+
@labels = opts[:labels] || []
|
109
|
+
@row_norms = opts[:row_norms]
|
140
110
|
|
141
|
-
|
142
|
-
@
|
143
|
-
@
|
144
|
-
|
111
|
+
@points_matrix = opts[:points_matrix]
|
112
|
+
@points_count = @points_matrix.shape[1] if @points_matrix
|
113
|
+
@mean = opts[:mean]
|
114
|
+
@std = opts[:std]
|
115
|
+
@scale_data = opts[:scale_data]
|
145
116
|
|
146
|
-
|
147
|
-
Point.new instance, labels[i]
|
148
|
-
end
|
149
|
-
|
150
|
-
init_clusters
|
117
|
+
init_centroids
|
151
118
|
end
|
152
119
|
|
153
120
|
def run
|
154
121
|
start_time = Time.now
|
155
122
|
@iterations, @runtime = 0, 0
|
156
123
|
|
124
|
+
@cluster_point_ids = Array.new(@k) { [] }
|
125
|
+
|
157
126
|
loop do
|
158
127
|
@iterations +=1
|
159
128
|
|
160
|
-
|
129
|
+
distances = distance(@centroids, @points_matrix)
|
130
|
+
|
131
|
+
# assign point ids to @cluster_point_ids
|
132
|
+
@points_count.times do |i|
|
133
|
+
min_distance_index = distances[i, true].sort_index[0]
|
134
|
+
@cluster_point_ids[min_distance_index] << i
|
135
|
+
end
|
136
|
+
|
137
|
+
moves = []
|
138
|
+
updated_centroids = []
|
139
|
+
|
140
|
+
@k.times do |i|
|
141
|
+
centroid = NArray.cast(@centroids[true, i].flatten)
|
142
|
+
point_ids = @cluster_point_ids[i]
|
143
|
+
|
144
|
+
if point_ids.empty?
|
145
|
+
newcenter = centroid
|
146
|
+
moves << 0
|
147
|
+
else
|
148
|
+
points = @points_matrix[true, point_ids]
|
149
|
+
newcenter = points.mean(1)
|
150
|
+
moves << distance(centroid, newcenter)
|
151
|
+
end
|
161
152
|
|
162
|
-
|
163
|
-
distances = Distance.call(centroids, point.data)
|
164
|
-
cluster = @clusters.sort_by.with_index {|c, i| distances[i] }.first
|
165
|
-
cluster << point
|
153
|
+
updated_centroids << newcenter
|
166
154
|
end
|
167
155
|
|
168
|
-
|
156
|
+
@centroids = NMatrix.cast updated_centroids
|
169
157
|
|
170
158
|
break if moves.max < 0.001 # i.e., no movement
|
171
159
|
break if @iterations >= 300
|
172
160
|
|
173
|
-
|
161
|
+
@cluster_point_ids = Array.new(@k) { [] }
|
174
162
|
end
|
175
163
|
|
164
|
+
@error = calculate_error
|
176
165
|
@runtime = Time.now - start_time
|
177
166
|
self
|
178
167
|
end
|
179
168
|
|
180
|
-
def
|
181
|
-
|
169
|
+
def finish
|
170
|
+
set_points
|
171
|
+
set_clusters
|
172
|
+
self
|
182
173
|
end
|
183
174
|
|
184
|
-
def
|
185
|
-
|
175
|
+
def predict data
|
176
|
+
data, _m, _s = Scaler.scale(data, @mean, @std) if @scale_data
|
177
|
+
data = NMatrix.cast(data, NArray::DFLOAT)
|
178
|
+
distances = distance(@centroids, data, nil)
|
179
|
+
data.shape[1].times.map do |i|
|
180
|
+
distances[i, true].sort_index[0] # index of closest cluster
|
181
|
+
end
|
186
182
|
end
|
187
183
|
|
188
184
|
def sorted_clusters point = origin
|
189
|
-
point =
|
185
|
+
point = wrap_point point
|
190
186
|
centroids = get_cluster_centroids
|
191
|
-
distances =
|
187
|
+
distances = distance(centroids, point.data)
|
192
188
|
@clusters.sort_by.with_index {|c, i| distances[i] }
|
193
189
|
end
|
194
190
|
|
195
191
|
def origin
|
196
|
-
|
192
|
+
wrap_point Array.new(@points[0].dimension, 0)
|
197
193
|
end
|
198
194
|
|
199
195
|
def silhouette_score
|
200
|
-
return 1.0 if @
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
196
|
+
return 1.0 if @k < 2
|
197
|
+
|
198
|
+
distances = distance(@centroids, @points_matrix)
|
199
|
+
|
200
|
+
scores = @points_count.times.map do |i|
|
201
|
+
point = get_point i
|
202
|
+
cluster_indexes = distances[i, true].sort_index
|
203
|
+
|
204
|
+
c1_points = get_points_for_centroid cluster_indexes[0]
|
205
|
+
c2_points = get_points_for_centroid cluster_indexes[1]
|
206
|
+
|
207
|
+
a = dissimilarity(c1_points, point)
|
208
|
+
b = dissimilarity(c2_points, point)
|
206
209
|
(b - a) / [a,b].max
|
207
210
|
end
|
208
211
|
|
@@ -210,73 +213,130 @@ class KMeansClusterer
|
|
210
213
|
end
|
211
214
|
|
212
215
|
private
|
213
|
-
def
|
216
|
+
def wrap_point point
|
217
|
+
return point if point.is_a?(Point)
|
218
|
+
Point.new(0, NArray.to_na(point).to_f)
|
219
|
+
end
|
220
|
+
|
221
|
+
def dissimilarity points, point
|
222
|
+
distances = distance points, point
|
223
|
+
distances.sum / distances.length.to_f
|
224
|
+
end
|
225
|
+
|
226
|
+
def init_centroids
|
214
227
|
case @init
|
215
228
|
when :random
|
216
|
-
|
229
|
+
random_centroid_init
|
217
230
|
when Array
|
218
|
-
|
231
|
+
custom_centroid_init
|
219
232
|
else
|
220
|
-
|
233
|
+
kmpp_centroid_init
|
221
234
|
end
|
222
235
|
end
|
223
236
|
|
224
237
|
# k-means++
|
225
|
-
def
|
226
|
-
|
227
|
-
pick = rand(@
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
238
|
+
def kmpp_centroid_init
|
239
|
+
centroid_ids = []
|
240
|
+
pick = rand(@points_count)
|
241
|
+
centroid_ids << pick
|
242
|
+
|
243
|
+
while centroid_ids.length < @k
|
244
|
+
centroids = @points_matrix[true, centroid_ids]
|
245
|
+
|
246
|
+
distances = distance(centroids, @points_matrix)
|
247
|
+
|
248
|
+
d2 = []
|
249
|
+
@points_count.times do |i|
|
250
|
+
min_distance = distances[i, true].min
|
251
|
+
d2 << min_distance**2
|
237
252
|
end
|
238
253
|
|
239
254
|
d2 = NArray.to_na d2
|
240
255
|
probs = d2 / d2.sum
|
241
256
|
cumprobs = probs.cumsum
|
242
257
|
r = rand
|
243
|
-
# pick = cumprobs.to_a.index {|prob| r < prob }
|
244
258
|
pick = (cumprobs >= r).where[0]
|
245
|
-
|
246
|
-
cluster = Cluster.new(centroid, @clusters.length + 1)
|
247
|
-
@clusters << cluster
|
259
|
+
centroid_ids << pick
|
248
260
|
end
|
249
|
-
end
|
250
261
|
|
251
|
-
|
252
|
-
@clusters = @init.map.with_index do |instance, i|
|
253
|
-
point = Point.new NArray.to_na(instance).to_f
|
254
|
-
Cluster.new point, i+1
|
255
|
-
end
|
262
|
+
@centroids = @points_matrix[true, centroid_ids]
|
256
263
|
end
|
257
264
|
|
258
|
-
def
|
259
|
-
@
|
265
|
+
def custom_centroid_init
|
266
|
+
@centroids = NMatrix.cast @init
|
267
|
+
@k = @init.length
|
260
268
|
end
|
261
269
|
|
262
|
-
def
|
263
|
-
|
270
|
+
def random_centroid_init
|
271
|
+
@centroids = @points_matrix[true, pick_k_random_indexes]
|
264
272
|
end
|
265
273
|
|
266
274
|
def pick_k_random_indexes
|
267
|
-
@
|
275
|
+
@points_count.times.to_a.shuffle.slice(0, @k)
|
268
276
|
end
|
269
277
|
|
270
278
|
def get_cluster_centroids
|
271
279
|
NArray.to_na @clusters.map {|c| c.centroid.data }
|
272
280
|
end
|
273
|
-
end
|
274
281
|
|
275
|
-
|
276
|
-
|
277
|
-
|
282
|
+
def set_points
|
283
|
+
@points = @points_count.times.map do |i|
|
284
|
+
data = NArray.cast @points_matrix[true, i].flatten
|
285
|
+
Point.new(i, data, @labels[i])
|
286
|
+
end
|
287
|
+
end
|
278
288
|
|
279
|
-
|
280
|
-
|
281
|
-
|
289
|
+
def set_clusters
|
290
|
+
@clusters = @k.times.map do |i|
|
291
|
+
centroid = NArray.cast @centroids[true, i].flatten
|
292
|
+
c = Cluster.new i, Point.new(-i, centroid)
|
293
|
+
@cluster_point_ids[i].each do |p|
|
294
|
+
c << @points[p]
|
295
|
+
end
|
296
|
+
c
|
297
|
+
end
|
298
|
+
end
|
299
|
+
|
300
|
+
def calculate_error
|
301
|
+
errors = @k.times.map do |i|
|
302
|
+
centroid = get_centroid i
|
303
|
+
points = get_points_for_centroid i
|
304
|
+
|
305
|
+
if points.empty?
|
306
|
+
0
|
307
|
+
else
|
308
|
+
distances = distance points, centroid
|
309
|
+
(distances**2).sum
|
310
|
+
end
|
311
|
+
end
|
312
|
+
|
313
|
+
errors.reduce(:+)
|
314
|
+
end
|
315
|
+
|
316
|
+
def get_point i
|
317
|
+
NArray.cast @points_matrix[true, i].flatten
|
318
|
+
end
|
319
|
+
|
320
|
+
def get_centroid i
|
321
|
+
NArray.cast(@centroids[true, i].flatten)
|
322
|
+
end
|
323
|
+
|
324
|
+
def get_points_for_centroid i
|
325
|
+
point_ids = @cluster_point_ids[i]
|
326
|
+
NArray.cast @points_matrix[true, point_ids]
|
327
|
+
end
|
328
|
+
|
329
|
+
def distance x, y, yy = @row_norms
|
330
|
+
if x.is_a?(NMatrix) && y.is_a?(NMatrix)
|
331
|
+
xx = x.map {|v| v**2}.sum(0)
|
332
|
+
yy ||= y.map {|v| v**2}.sum(0)
|
333
|
+
xy = x * y.transpose
|
334
|
+
distance = xy * -2
|
335
|
+
distance += xx
|
336
|
+
distance += yy.transpose
|
337
|
+
NMath.sqrt distance
|
338
|
+
else
|
339
|
+
NMath.sqrt ((x - y)**2).sum(0)
|
340
|
+
end
|
341
|
+
end
|
282
342
|
end
|
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kmeans-clusterer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Geoff Buesing
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-02-05 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description: k-means
|
13
|
+
description: k-means clustering. Uses NArray for fast calculations.
|
14
14
|
email: gbuesing@gmail.com
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
@@ -40,5 +40,5 @@ rubyforge_project:
|
|
40
40
|
rubygems_version: 2.4.5
|
41
41
|
signing_key:
|
42
42
|
specification_version: 4
|
43
|
-
summary: k-means
|
43
|
+
summary: k-means clustering
|
44
44
|
test_files: []
|