kmeans-clusterer 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/kmeans-clusterer.rb +282 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: daff89d4293b6131cf1c684c9a5ef3a3d9dfa878
|
4
|
+
data.tar.gz: 7917de7e84065b9c55c4c501bb2587f6e86aff7b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 795ac3f9fb65b6a40adbb67264413fc3819bf828de13edef857ef340d5e2467de4807e35fb87277bd6aeec687ab93f11b35c0b34a92922d0b5c609a0e731be74
|
7
|
+
data.tar.gz: ce6dc360936e0a3fcdc4be3b4c3fd207d0b0df67ed45c36ea2b476e473eedd11b7fea35c060eabca38f733325d1aba6fa30defa56601bdcac6c65d68c27d8145
|
@@ -0,0 +1,282 @@
|
|
1
|
+
require 'narray'
|
2
|
+
|
3
|
+
class KMeansClusterer
|
4
|
+
|
5
|
+
# Euclidean distance between +a+ and +b+, with the squared differences
# summed along axis 0. Both arguments must be NArray instances.
Distance = lambda do |a, b|
  squared_diff = (a - b)**2
  NMath.sqrt(squared_diff.sum(0))
end

# Centroid of a set of points: the mean of +a+ taken along axis 1.
CalculateCentroid = lambda { |a| a.mean(1) }
|
8
|
+
|
9
|
+
# A single observation: its coordinates (held as an NArray), an optional
# label, and the cluster it is currently assigned to.
class Point
  attr_accessor :cluster, :label
  attr_reader :data

  # data  - coordinates; converted with NArray.to_na
  # label - optional caller-supplied tag (nil when omitted)
  def initialize(data, label = nil)
    @label = label
    @data = NArray.to_na(data)
  end

  # Element of the coordinate data stored at +index+.
  def [](index)
    data[index]
  end

  # Coordinates as a plain Ruby Array.
  def to_a
    data.to_a
  end

  # String form of the coordinate Array.
  def to_s
    data.to_a.to_s
  end

  # Number of elements held in the coordinate data.
  def dimension
    data.length
  end
end
|
34
|
+
|
35
|
+
|
36
|
+
# A group of points gathered around a centroid.
class Cluster
  attr_reader :centroid, :points
  attr_accessor :label

  # centroid - Point used as the initial center
  # label    - optional identifier (nil when omitted)
  def initialize(centroid, label = nil)
    @centroid = centroid
    @label = label
    @points = []
  end

  # Replaces the centroid with one computed from the assigned points and
  # returns the distance between the new and the old centroid. An empty
  # cluster keeps its centroid and returns 0.
  def recenter
    return 0 if @points.empty?

    old_centroid = @centroid
    @centroid = calculate_centroid_from_points
    Distance.call @centroid.data, old_centroid.data
  end

  # Assigns +point+ to this cluster and sets the back-reference on the point.
  def <<(point)
    point.cluster = self
    @points << point
  end

  # Drops every assigned point (the centroid is left untouched).
  def reset_points
    @points = []
  end

  # Assigned points ordered by ascending distance to the centroid.
  # Returns an empty Array for an empty cluster, matching the guards used by
  # recenter, sum_of_squares_error and sum_of_distances.
  def sorted_points
    return [] if @points.empty?

    distances = Distance.call points_narray, centroid.data
    @points.sort_by.with_index { |_point, i| distances[i] }
  end

  # Sum of the squared point-to-centroid distances; 0 for an empty cluster.
  def sum_of_squares_error
    return 0 if @points.empty?

    distances = Distance.call points_narray, centroid.data
    (distances**2).sum
  end

  # Sum of the point-to-centroid distances; 0 for an empty cluster.
  def sum_of_distances
    return 0 if @points.empty?

    Distance.call(points_narray, centroid.data).sum
  end

  # Mean distance from +point+ to the points assigned to this cluster.
  # NOTE(review): there is no empty-cluster guard here, so callers are
  # presumably expected to pass only non-empty clusters - confirm.
  def dissimilarity(point)
    distances = Distance.call points_narray, point.data
    distances.sum / distances.length.to_f
  end

  private

  # Builds a new Point from CalculateCentroid applied to the assigned points.
  def calculate_centroid_from_points
    data = CalculateCentroid.call points_narray
    Point.new data
  end

  # Coordinate data of all assigned points combined into one NArray.
  def points_narray
    NArray.to_na @points.map(&:data)
  end
end
|
99
|
+
|
100
|
+
|
101
|
+
# Runs the clusterer several times and returns the run with the lowest error.
#
# k    - number of clusters
# data - Array of rows
# opts - options Hash, also forwarded unchanged to new:
#        :scale_data - when truthy, rows are passed through scale_data first
#        :runs       - number of runs to perform (10 when absent)
#        :log        - when truthy, prints one summary line per run
#
# Returns the best run, or nil when no run was performed.
# Raises ArgumentError when k exceeds the number of rows.
def self.run(k, data, opts = {})
  raise ArgumentError, "k cannot be greater than the number of points" if k > data.length

  rows = if opts[:scale_data]
    scale_data data
  else
    data.map { |row| NArray.to_na(row).to_f }
  end

  run_count = opts[:runs] || 10

  # Keep each run next to its error so the error is computed only once and
  # no parallel array has to be kept in step.
  scored_runs = run_count.times.map do |i|
    km = new(k, rows, opts).run
    error = km.error
    if opts[:log]
      puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{error.round(2)} err"
    end
    [error, km]
  end

  best = scored_runs.min_by(&:first)
  best && best.last
end
|
125
|
+
|
126
|
+
# Scales the data by subtracting the axis-1 mean and dividing by the axis-1
# rmsdev (see scikit-learn's scale and _mean_and_std methods).
#
# data - Array of rows
#
# Returns an Array holding one NArray per input row.
def self.scale_data(data)
  matrix = NArray.to_na(data).to_f
  center = matrix.mean(1)
  spread = matrix.rmsdev(1)
  spread[spread.eq(0)] = 1.0 # a zero spread would otherwise divide by 0
  scaled = (matrix - center) / spread
  # hand back a plain Array containing one NArray per row
  (0...data.length).map { |row_index| scaled[true, row_index] }
end
|
136
|
+
|
137
|
+
|
138
|
+
attr_reader :k, :points, :clusters, :iterations, :runtime
|
139
|
+
|
140
|
+
|
141
|
+
# k    - number of clusters
# data - collection of rows; each row is wrapped in a Point
# opts - options Hash:
#        :init   - initialization strategy (:kmpp when absent)
#        :labels - per-row labels, matched to rows by index
def initialize(k, data, opts = {})
  @k = k
  @init = opts[:init] || :kmpp
  row_labels = opts[:labels] || []

  @points = data.each_with_index.map do |row, index|
    Point.new(row, row_labels[index])
  end

  init_clusters
end
|
152
|
+
|
153
|
+
# Performs the clustering: on every pass each point is assigned to the
# cluster whose centroid is nearest, then all clusters are recentered.
# Stops once no centroid moved by 0.001 or more, or after 300 passes.
#
# Sets iterations and runtime (seconds). Returns self.
def run
  start_time = Time.now
  @iterations, @runtime = 0, 0

  loop do
    @iterations += 1

    # Start every pass from empty clusters. Resetting here, rather than
    # after a non-final pass, also keeps a repeated #run call from adding
    # each point to its cluster a second time.
    clusters.each(&:reset_points)

    centroids = get_cluster_centroids

    @points.each do |point|
      distances = Distance.call(centroids, point.data)
      nearest, = @clusters.each_with_index.min_by { |_cluster, i| distances[i] }
      nearest << point
    end

    moves = clusters.map(&:recenter)

    break if moves.max < 0.001 # i.e., no movement
    break if @iterations >= 300
  end

  @runtime = Time.now - start_time
  self
end
|
179
|
+
|
180
|
+
# Total error of the clustering: the clusters' sum_of_squares_error values
# added together.
def error
  per_cluster = @clusters.map { |cluster| cluster.sum_of_squares_error }
  per_cluster.inject(:+)
end
|
183
|
+
|
184
|
+
# The cluster that sorted_clusters ranks first for +point+.
# The origin is used when no point is given.
def closest_cluster(point = origin)
  ranked = sorted_clusters(point)
  ranked.first
end
|
187
|
+
|
188
|
+
# Clusters ordered by ascending distance between their centroid and +point+.
# +point+ may be a Point or raw coordinates (which get wrapped in a Point);
# the origin is used when no point is given.
def sorted_clusters(point = origin)
  target = point.is_a?(Point) ? point : Point.new(point)
  distances = Distance.call(get_cluster_centroids, target.data)
  @clusters.each_with_index.sort_by { |_cluster, i| distances[i] }.map(&:first)
end
|
194
|
+
|
195
|
+
# A Point made of zeros, with as many entries as the first data point's
# dimension.
def origin
  zeros = [0] * @points.first.dimension
  Point.new(zeros)
end
|
198
|
+
|
199
|
+
# Mean silhouette-style score over all points.
#
# For each point, a is its dissimilarity to the first cluster returned by
# sorted_clusters and b its dissimilarity to the second; the point scores
# (b - a) / max(a, b). Returns 1.0 when there are fewer than two clusters.
def silhouette_score
  return 1.0 if @clusters.length < 2

  scores = @points.map do |point|
    acluster, bcluster = sorted_clusters(point).slice(0, 2)
    a = acluster.dissimilarity(point)
    b = bcluster.dissimilarity(point)
    widest = [a, b].max
    # When a and b are both zero the ratio would be 0/0 = NaN and would
    # turn the overall mean into NaN; score such a point as neutral.
    widest.zero? ? 0.0 : (b - a) / widest
  end

  scores.reduce(:+) / scores.length # mean score for all points
end
|
211
|
+
|
212
|
+
private
|
213
|
+
# Chooses the cluster initialization from @init: :random picks random
# points, an Array supplies the centroids, anything else uses k-means++.
def init_clusters
  if @init == :random
    random_cluster_init
  elsif @init.is_a?(Array)
    custom_cluster_init
  else
    kmpp_cluster_init
  end
end
|
223
|
+
|
224
|
+
# k-means++ initialization.
#
# The first centroid is a copy of a randomly chosen point. Every further
# centroid is a copy of a point drawn with probability proportional to its
# squared distance to the closest centroid chosen so far. Clusters are
# labelled 1, 2, ... in the order they are created.
def kmpp_cluster_init
  @clusters = []
  pick = rand(@points.length)
  centroid = Point.new @points[pick].data.to_a
  @clusters << Cluster.new(centroid, 1)

  while @clusters.length < @k
    centroids = get_cluster_centroids

    squared = @points.map do |point|
      dists = Distance.call centroids, point.data
      dists.min**2 # closest cluster distance, squared
    end

    d2 = NArray.to_na squared
    total = d2.sum

    pick =
      if total > 0
        cumprobs = (d2 / total).cumsum
        r = rand
        # Rounding can leave the last cumulative probability a hair below
        # 1.0, so r may exceed every entry; fall back to the farthest point.
        cumprobs.to_a.index { |prob| prob >= r } || squared.index(squared.max)
      else
        # Every point coincides with an existing centroid, so the weights
        # would all be 0/0; any point is as good as another.
        rand(@points.length)
      end

    centroid = Point.new @points[pick].data.to_a
    @clusters << Cluster.new(centroid, @clusters.length + 1)
  end
end
|
250
|
+
|
251
|
+
# Builds one cluster per entry of @init, using the entry (converted to a
# float NArray) as the centroid. Clusters are labelled 1, 2, ... in order.
def custom_cluster_init
  @clusters = @init.each_with_index.map do |coordinates, index|
    centroid = Point.new(NArray.to_na(coordinates).to_f)
    Cluster.new(centroid, index + 1)
  end
end
|
257
|
+
|
258
|
+
# Builds the clusters from pick_k_random_points, labelling them 1, 2, ...
# in the order the points are returned.
def random_cluster_init
  @clusters = pick_k_random_points.each_with_index.map do |centroid, index|
    Cluster.new(centroid, index + 1)
  end
end
|
261
|
+
|
262
|
+
# Fresh Point copies of the data points found at the indexes returned by
# pick_k_random_indexes.
def pick_k_random_points
  pick_k_random_indexes.map do |index|
    source = @points[index]
    Point.new(source.data.to_a)
  end
end
|
265
|
+
|
266
|
+
# The first @k entries of a shuffled list of all point indexes.
def pick_k_random_indexes
  all_indexes = (0...@points.length).to_a
  all_indexes.shuffle[0, @k]
end
|
269
|
+
|
270
|
+
# Centroid coordinate data of every cluster, combined into a single NArray.
def get_cluster_centroids
  centroid_data = @clusters.map { |cluster| cluster.centroid.data }
  NArray.to_na(centroid_data)
end
|
273
|
+
end
|
274
|
+
|
275
|
+
# k-medians variant of KMeansClusterer: declares a sum-of-absolute-differences
# distance and a median-based centroid, and reports error as the clusters'
# summed distances.
#
# NOTE(review): Ruby resolves constants lexically, so the inherited methods
# that reference Distance / CalculateCentroid (defined inside
# KMeansClusterer and its nested Cluster class) appear to keep using the
# parent's lambdas rather than the two declared here - confirm, and pass the
# functions in explicitly if these overrides are meant to take effect.
class KMediansClusterer < KMeansClusterer
  Distance = lambda { |a, b| (a - b).abs.sum(0) }
  CalculateCentroid = lambda { |a| a.rot90.median(0) }

  # Total error: the clusters' sum_of_distances values added together.
  def error
    per_cluster = @clusters.map { |cluster| cluster.sum_of_distances }
    per_cluster.inject(:+)
  end
end
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: kmeans-clusterer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Geoff Buesing
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-01-29 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: k-means/k-medians clustering. Uses NArray for fast calculations.
|
14
|
+
email: gbuesing@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/kmeans-clusterer.rb
|
20
|
+
homepage: https://github.com/gbuesing/kmeans-clusterer
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.4.5
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: k-means/k-medians clustering
|
44
|
+
test_files: []
|