kmeans-clusterer 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/kmeans-clusterer.rb +206 -146
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 83e84b661282a532269410c4bf91d98f7831cc45
|
4
|
+
data.tar.gz: 2ec3bf6b111d67e4f5beced25b088b59592e3cdf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7735bf8e71c4fa4300793e40a299b7a4af267821860092b2dd21db16cfc407bc0b2e6a9b99c37846ffbd02b76976242d9719bfa92e244455be14208b1934d21f
|
7
|
+
data.tar.gz: 54f7bf8afff09caf185dbf95c8baeb4ea471e4496b29967ad88b3fc377e0063803c7810295825fe581fd744048964b592197bcfea6d2aa1178133d39e2937b0b
|
data/lib/kmeans-clusterer.rb
CHANGED
@@ -1,17 +1,34 @@
|
|
1
1
|
require 'narray'
|
2
2
|
|
3
3
|
class KMeansClusterer
|
4
|
+
module Scaler
|
5
|
+
def self.mean data
|
6
|
+
data.mean(1)
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.std data
|
10
|
+
std = data.rmsdev(1)
|
11
|
+
std[std.eq(0)] = 1.0 # so we don't divide by 0
|
12
|
+
std
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.scale data, mean = nil, std = nil
|
16
|
+
data = NArray.cast(data, NArray::DFLOAT)
|
17
|
+
mean ||= self.mean(data)
|
18
|
+
std ||= self.std(data)
|
19
|
+
data = (data - mean) / std
|
20
|
+
[data, mean, std]
|
21
|
+
end
|
22
|
+
end
|
4
23
|
|
5
|
-
# Euclidean distance function. Requires instances of NArray as args
|
6
|
-
Distance = -> (a, b) { NMath.sqrt ((a - b)**2).sum(0) }
|
7
|
-
CalculateCentroid = -> (a) { a.mean(1) }
|
8
24
|
|
9
25
|
class Point
|
10
|
-
attr_reader :data
|
26
|
+
attr_reader :id, :data
|
11
27
|
attr_accessor :cluster, :label
|
12
28
|
|
13
|
-
def initialize data, label = nil
|
14
|
-
@
|
29
|
+
def initialize id, data, label = nil
|
30
|
+
@id = id
|
31
|
+
@data = data
|
15
32
|
@label = label
|
16
33
|
end
|
17
34
|
|
@@ -34,175 +51,161 @@ class KMeansClusterer
|
|
34
51
|
|
35
52
|
|
36
53
|
class Cluster
|
37
|
-
attr_reader :centroid, :points
|
54
|
+
attr_reader :id, :centroid, :points
|
38
55
|
attr_accessor :label
|
39
56
|
|
40
|
-
def initialize
|
57
|
+
def initialize id, centroid
|
58
|
+
@id = id
|
41
59
|
@centroid = centroid
|
42
|
-
@label = label
|
43
60
|
@points = []
|
44
61
|
end
|
45
62
|
|
46
|
-
def recenter
|
47
|
-
if @points.empty?
|
48
|
-
0
|
49
|
-
else
|
50
|
-
old_centroid = @centroid
|
51
|
-
@centroid = calculate_centroid_from_points
|
52
|
-
Distance.call @centroid.data, old_centroid.data
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
63
|
def << point
|
57
64
|
point.cluster = self
|
58
65
|
@points << point
|
59
66
|
end
|
60
67
|
|
61
|
-
def
|
62
|
-
@points
|
63
|
-
end
|
64
|
-
|
65
|
-
def sorted_points
|
66
|
-
distances = Distance.call points_narray, centroid.data
|
67
|
-
@points.sort_by.with_index {|c, i| distances[i] }
|
68
|
-
end
|
69
|
-
|
70
|
-
def sum_of_squares_error
|
71
|
-
if @points.empty?
|
72
|
-
0
|
73
|
-
else
|
74
|
-
distances = Distance.call points_narray, centroid.data
|
75
|
-
(distances**2).sum
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
def sum_of_distances
|
80
|
-
return 0 if @points.empty?
|
81
|
-
Distance.call(points_narray, centroid.data).sum
|
68
|
+
def points_narray
|
69
|
+
NArray.cast @points.map(&:data)
|
82
70
|
end
|
83
|
-
|
84
|
-
def dissimilarity point
|
85
|
-
distances = Distance.call points_narray, point.data
|
86
|
-
distances.sum / distances.length.to_f
|
87
|
-
end
|
88
|
-
|
89
|
-
private
|
90
|
-
def calculate_centroid_from_points
|
91
|
-
data = CalculateCentroid.call points_narray
|
92
|
-
Point.new data
|
93
|
-
end
|
94
|
-
|
95
|
-
def points_narray
|
96
|
-
NArray.to_na @points.map(&:data)
|
97
|
-
end
|
98
71
|
end
|
99
72
|
|
100
73
|
|
74
|
+
DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp}
|
75
|
+
|
101
76
|
def self.run k, data, opts = {}
|
102
|
-
|
77
|
+
opts = DEFAULT_OPTS.merge(opts)
|
103
78
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
data
|
79
|
+
opts[:k] = k
|
80
|
+
|
81
|
+
if opts[:scale_data]
|
82
|
+
data, mean, std = Scaler.scale(data)
|
83
|
+
opts[:mean] = mean
|
84
|
+
opts[:std] = std
|
108
85
|
end
|
109
86
|
|
110
|
-
|
111
|
-
|
87
|
+
opts[:points_matrix] = NMatrix.cast(data, NArray::DFLOAT)
|
88
|
+
opts[:row_norms] = opts[:points_matrix].map {|v| v**2}.sum(0)
|
112
89
|
|
113
|
-
runs =
|
114
|
-
km = new(
|
115
|
-
error = km.error
|
90
|
+
runs = opts[:runs].times.map do |i|
|
91
|
+
km = new(opts).run
|
116
92
|
if opts[:log]
|
117
|
-
puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{error.round(2)} err"
|
93
|
+
puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{km.error.round(2)} err"
|
118
94
|
end
|
119
|
-
errors << error
|
120
95
|
km
|
121
96
|
end
|
122
97
|
|
123
|
-
runs.sort_by
|
98
|
+
runs.sort_by {|run| run.error }.first.finish
|
124
99
|
end
|
125
100
|
|
126
|
-
# see scikit-learn scale and _mean_and_std methods
|
127
|
-
def self.scale_data data
|
128
|
-
nadata = NArray.to_na(data).to_f
|
129
|
-
mean = nadata.mean(1)
|
130
|
-
std = nadata.rmsdev(1)
|
131
|
-
std[std.eq(0)] = 1.0 # so we don't divide by 0
|
132
|
-
nadata = (nadata - mean) / std
|
133
|
-
# convert back to an array, containing NArrays for each row
|
134
|
-
data.length.times.map {|i| nadata[true, i] }
|
135
|
-
end
|
136
101
|
|
102
|
+
attr_reader :k, :points, :clusters, :error, :iterations, :runtime
|
137
103
|
|
138
|
-
attr_reader :k, :points, :clusters, :iterations, :runtime
|
139
104
|
|
105
|
+
def initialize opts = {}
|
106
|
+
@k = opts[:k]
|
107
|
+
@init = opts[:init]
|
108
|
+
@labels = opts[:labels] || []
|
109
|
+
@row_norms = opts[:row_norms]
|
140
110
|
|
141
|
-
|
142
|
-
@
|
143
|
-
@
|
144
|
-
|
111
|
+
@points_matrix = opts[:points_matrix]
|
112
|
+
@points_count = @points_matrix.shape[1] if @points_matrix
|
113
|
+
@mean = opts[:mean]
|
114
|
+
@std = opts[:std]
|
115
|
+
@scale_data = opts[:scale_data]
|
145
116
|
|
146
|
-
|
147
|
-
Point.new instance, labels[i]
|
148
|
-
end
|
149
|
-
|
150
|
-
init_clusters
|
117
|
+
init_centroids
|
151
118
|
end
|
152
119
|
|
153
120
|
def run
|
154
121
|
start_time = Time.now
|
155
122
|
@iterations, @runtime = 0, 0
|
156
123
|
|
124
|
+
@cluster_point_ids = Array.new(@k) { [] }
|
125
|
+
|
157
126
|
loop do
|
158
127
|
@iterations +=1
|
159
128
|
|
160
|
-
|
129
|
+
distances = distance(@centroids, @points_matrix)
|
130
|
+
|
131
|
+
# assign point ids to @cluster_point_ids
|
132
|
+
@points_count.times do |i|
|
133
|
+
min_distance_index = distances[i, true].sort_index[0]
|
134
|
+
@cluster_point_ids[min_distance_index] << i
|
135
|
+
end
|
136
|
+
|
137
|
+
moves = []
|
138
|
+
updated_centroids = []
|
139
|
+
|
140
|
+
@k.times do |i|
|
141
|
+
centroid = NArray.cast(@centroids[true, i].flatten)
|
142
|
+
point_ids = @cluster_point_ids[i]
|
143
|
+
|
144
|
+
if point_ids.empty?
|
145
|
+
newcenter = centroid
|
146
|
+
moves << 0
|
147
|
+
else
|
148
|
+
points = @points_matrix[true, point_ids]
|
149
|
+
newcenter = points.mean(1)
|
150
|
+
moves << distance(centroid, newcenter)
|
151
|
+
end
|
161
152
|
|
162
|
-
|
163
|
-
distances = Distance.call(centroids, point.data)
|
164
|
-
cluster = @clusters.sort_by.with_index {|c, i| distances[i] }.first
|
165
|
-
cluster << point
|
153
|
+
updated_centroids << newcenter
|
166
154
|
end
|
167
155
|
|
168
|
-
|
156
|
+
@centroids = NMatrix.cast updated_centroids
|
169
157
|
|
170
158
|
break if moves.max < 0.001 # i.e., no movement
|
171
159
|
break if @iterations >= 300
|
172
160
|
|
173
|
-
|
161
|
+
@cluster_point_ids = Array.new(@k) { [] }
|
174
162
|
end
|
175
163
|
|
164
|
+
@error = calculate_error
|
176
165
|
@runtime = Time.now - start_time
|
177
166
|
self
|
178
167
|
end
|
179
168
|
|
180
|
-
def
|
181
|
-
|
169
|
+
def finish
|
170
|
+
set_points
|
171
|
+
set_clusters
|
172
|
+
self
|
182
173
|
end
|
183
174
|
|
184
|
-
def
|
185
|
-
|
175
|
+
def predict data
|
176
|
+
data, _m, _s = Scaler.scale(data, @mean, @std) if @scale_data
|
177
|
+
data = NMatrix.cast(data, NArray::DFLOAT)
|
178
|
+
distances = distance(@centroids, data, nil)
|
179
|
+
data.shape[1].times.map do |i|
|
180
|
+
distances[i, true].sort_index[0] # index of closest cluster
|
181
|
+
end
|
186
182
|
end
|
187
183
|
|
188
184
|
def sorted_clusters point = origin
|
189
|
-
point =
|
185
|
+
point = wrap_point point
|
190
186
|
centroids = get_cluster_centroids
|
191
|
-
distances =
|
187
|
+
distances = distance(centroids, point.data)
|
192
188
|
@clusters.sort_by.with_index {|c, i| distances[i] }
|
193
189
|
end
|
194
190
|
|
195
191
|
def origin
|
196
|
-
|
192
|
+
wrap_point Array.new(@points[0].dimension, 0)
|
197
193
|
end
|
198
194
|
|
199
195
|
def silhouette_score
|
200
|
-
return 1.0 if @
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
196
|
+
return 1.0 if @k < 2
|
197
|
+
|
198
|
+
distances = distance(@centroids, @points_matrix)
|
199
|
+
|
200
|
+
scores = @points_count.times.map do |i|
|
201
|
+
point = get_point i
|
202
|
+
cluster_indexes = distances[i, true].sort_index
|
203
|
+
|
204
|
+
c1_points = get_points_for_centroid cluster_indexes[0]
|
205
|
+
c2_points = get_points_for_centroid cluster_indexes[1]
|
206
|
+
|
207
|
+
a = dissimilarity(c1_points, point)
|
208
|
+
b = dissimilarity(c2_points, point)
|
206
209
|
(b - a) / [a,b].max
|
207
210
|
end
|
208
211
|
|
@@ -210,73 +213,130 @@ class KMeansClusterer
|
|
210
213
|
end
|
211
214
|
|
212
215
|
private
|
213
|
-
def
|
216
|
+
def wrap_point point
|
217
|
+
return point if point.is_a?(Point)
|
218
|
+
Point.new(0, NArray.to_na(point).to_f)
|
219
|
+
end
|
220
|
+
|
221
|
+
def dissimilarity points, point
|
222
|
+
distances = distance points, point
|
223
|
+
distances.sum / distances.length.to_f
|
224
|
+
end
|
225
|
+
|
226
|
+
def init_centroids
|
214
227
|
case @init
|
215
228
|
when :random
|
216
|
-
|
229
|
+
random_centroid_init
|
217
230
|
when Array
|
218
|
-
|
231
|
+
custom_centroid_init
|
219
232
|
else
|
220
|
-
|
233
|
+
kmpp_centroid_init
|
221
234
|
end
|
222
235
|
end
|
223
236
|
|
224
237
|
# k-means++
|
225
|
-
def
|
226
|
-
|
227
|
-
pick = rand(@
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
238
|
+
def kmpp_centroid_init
|
239
|
+
centroid_ids = []
|
240
|
+
pick = rand(@points_count)
|
241
|
+
centroid_ids << pick
|
242
|
+
|
243
|
+
while centroid_ids.length < @k
|
244
|
+
centroids = @points_matrix[true, centroid_ids]
|
245
|
+
|
246
|
+
distances = distance(centroids, @points_matrix)
|
247
|
+
|
248
|
+
d2 = []
|
249
|
+
@points_count.times do |i|
|
250
|
+
min_distance = distances[i, true].min
|
251
|
+
d2 << min_distance**2
|
237
252
|
end
|
238
253
|
|
239
254
|
d2 = NArray.to_na d2
|
240
255
|
probs = d2 / d2.sum
|
241
256
|
cumprobs = probs.cumsum
|
242
257
|
r = rand
|
243
|
-
# pick = cumprobs.to_a.index {|prob| r < prob }
|
244
258
|
pick = (cumprobs >= r).where[0]
|
245
|
-
|
246
|
-
cluster = Cluster.new(centroid, @clusters.length + 1)
|
247
|
-
@clusters << cluster
|
259
|
+
centroid_ids << pick
|
248
260
|
end
|
249
|
-
end
|
250
261
|
|
251
|
-
|
252
|
-
@clusters = @init.map.with_index do |instance, i|
|
253
|
-
point = Point.new NArray.to_na(instance).to_f
|
254
|
-
Cluster.new point, i+1
|
255
|
-
end
|
262
|
+
@centroids = @points_matrix[true, centroid_ids]
|
256
263
|
end
|
257
264
|
|
258
|
-
def
|
259
|
-
@
|
265
|
+
def custom_centroid_init
|
266
|
+
@centroids = NMatrix.cast @init
|
267
|
+
@k = @init.length
|
260
268
|
end
|
261
269
|
|
262
|
-
def
|
263
|
-
|
270
|
+
def random_centroid_init
|
271
|
+
@centroids = @points_matrix[true, pick_k_random_indexes]
|
264
272
|
end
|
265
273
|
|
266
274
|
def pick_k_random_indexes
|
267
|
-
@
|
275
|
+
@points_count.times.to_a.shuffle.slice(0, @k)
|
268
276
|
end
|
269
277
|
|
270
278
|
def get_cluster_centroids
|
271
279
|
NArray.to_na @clusters.map {|c| c.centroid.data }
|
272
280
|
end
|
273
|
-
end
|
274
281
|
|
275
|
-
|
276
|
-
|
277
|
-
|
282
|
+
def set_points
|
283
|
+
@points = @points_count.times.map do |i|
|
284
|
+
data = NArray.cast @points_matrix[true, i].flatten
|
285
|
+
Point.new(i, data, @labels[i])
|
286
|
+
end
|
287
|
+
end
|
278
288
|
|
279
|
-
|
280
|
-
|
281
|
-
|
289
|
+
def set_clusters
|
290
|
+
@clusters = @k.times.map do |i|
|
291
|
+
centroid = NArray.cast @centroids[true, i].flatten
|
292
|
+
c = Cluster.new i, Point.new(-i, centroid)
|
293
|
+
@cluster_point_ids[i].each do |p|
|
294
|
+
c << @points[p]
|
295
|
+
end
|
296
|
+
c
|
297
|
+
end
|
298
|
+
end
|
299
|
+
|
300
|
+
def calculate_error
|
301
|
+
errors = @k.times.map do |i|
|
302
|
+
centroid = get_centroid i
|
303
|
+
points = get_points_for_centroid i
|
304
|
+
|
305
|
+
if points.empty?
|
306
|
+
0
|
307
|
+
else
|
308
|
+
distances = distance points, centroid
|
309
|
+
(distances**2).sum
|
310
|
+
end
|
311
|
+
end
|
312
|
+
|
313
|
+
errors.reduce(:+)
|
314
|
+
end
|
315
|
+
|
316
|
+
def get_point i
|
317
|
+
NArray.cast @points_matrix[true, i].flatten
|
318
|
+
end
|
319
|
+
|
320
|
+
def get_centroid i
|
321
|
+
NArray.cast(@centroids[true, i].flatten)
|
322
|
+
end
|
323
|
+
|
324
|
+
def get_points_for_centroid i
|
325
|
+
point_ids = @cluster_point_ids[i]
|
326
|
+
NArray.cast @points_matrix[true, point_ids]
|
327
|
+
end
|
328
|
+
|
329
|
+
def distance x, y, yy = @row_norms
|
330
|
+
if x.is_a?(NMatrix) && y.is_a?(NMatrix)
|
331
|
+
xx = x.map {|v| v**2}.sum(0)
|
332
|
+
yy ||= y.map {|v| v**2}.sum(0)
|
333
|
+
xy = x * y.transpose
|
334
|
+
distance = xy * -2
|
335
|
+
distance += xx
|
336
|
+
distance += yy.transpose
|
337
|
+
NMath.sqrt distance
|
338
|
+
else
|
339
|
+
NMath.sqrt ((x - y)**2).sum(0)
|
340
|
+
end
|
341
|
+
end
|
282
342
|
end
|
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kmeans-clusterer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Geoff Buesing
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-02-05 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description: k-means
|
13
|
+
description: k-means clustering. Uses NArray for fast calculations.
|
14
14
|
email: gbuesing@gmail.com
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
@@ -40,5 +40,5 @@ rubyforge_project:
|
|
40
40
|
rubygems_version: 2.4.5
|
41
41
|
signing_key:
|
42
42
|
specification_version: 4
|
43
|
-
summary: k-means
|
43
|
+
summary: k-means clustering
|
44
44
|
test_files: []
|