kmeans-clusterer 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/kmeans-clusterer.rb +282 -0
  3. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: daff89d4293b6131cf1c684c9a5ef3a3d9dfa878
4
+ data.tar.gz: 7917de7e84065b9c55c4c501bb2587f6e86aff7b
5
+ SHA512:
6
+ metadata.gz: 795ac3f9fb65b6a40adbb67264413fc3819bf828de13edef857ef340d5e2467de4807e35fb87277bd6aeec687ab93f11b35c0b34a92922d0b5c609a0e731be74
7
+ data.tar.gz: ce6dc360936e0a3fcdc4be3b4c3fd207d0b0df67ed45c36ea2b476e473eedd11b7fea35c060eabca38f733325d1aba6fa30defa56601bdcac6c65d68c27d8145
@@ -0,0 +1,282 @@
1
+ require 'narray'
2
+
3
+ class KMeansClusterer
4
+
5
# Euclidean distance function. Requires instances of NArray as args.
# The squared differences are summed along axis 0 before the square root
# is taken.
Distance = lambda { |a, b|
  squared_diff = (a - b)**2
  NMath.sqrt(squared_diff.sum(0))
}

# Centroid function: the mean of the given NArray along axis 1.
CalculateCentroid = lambda { |a| a.mean(1) }
8
+
9
# A single observation: its coordinates (held as an NArray), an optional
# label, and the cluster it is currently assigned to.
class Point
  attr_accessor :cluster, :label
  attr_reader :data

  # data  - coordinates; converted with NArray.to_na.
  # label - optional label for the point (defaults to nil).
  def initialize(data, label = nil)
    @data = NArray.to_na(data)
    @label = label
  end

  # Returns the element of the underlying NArray at +index+.
  def [](index)
    data[index]
  end

  # Returns the coordinates as a plain Ruby Array.
  def to_a
    data.to_a
  end

  # Returns the String form of the coordinate Array.
  def to_s
    to_a.to_s
  end

  # Returns the number of elements in the underlying NArray.
  def dimension
    data.length
  end
end
34
+
35
+
36
# A centroid (a Point) together with the points currently assigned to it.
class Cluster
  attr_accessor :label
  attr_reader :centroid, :points

  # centroid - Point used as the cluster's initial centre.
  # label    - optional label for the cluster (defaults to nil).
  def initialize(centroid, label = nil)
    @centroid = centroid
    @label = label
    @points = []
  end

  # Replaces the centroid with the one calculated from the assigned points
  # and returns the distance between the new and the old centroid.
  # With no assigned points the centroid is left alone and 0 is returned.
  def recenter
    return 0 if @points.empty?

    previous = @centroid
    @centroid = calculate_centroid_from_points
    Distance.call(@centroid.data, previous.data)
  end

  # Assigns +point+ to this cluster (also sets point.cluster).
  # Returns the points Array.
  def <<(point)
    point.cluster = self
    @points << point
  end

  # Drops all assigned points (by installing a fresh empty Array).
  def reset_points
    @points = []
  end

  # Returns the assigned points ordered by distance to the centroid,
  # nearest first.
  def sorted_points
    distances = distances_from(centroid.data)
    @points.sort_by.with_index { |_point, i| distances[i] }
  end

  # Sum of squared point-to-centroid distances; 0 for an empty cluster.
  def sum_of_squares_error
    return 0 if @points.empty?

    (distances_from(centroid.data)**2).sum
  end

  # Sum of point-to-centroid distances; 0 for an empty cluster.
  def sum_of_distances
    if @points.empty?
      0
    else
      distances_from(centroid.data).sum
    end
  end

  # Mean distance between +point+ and every point assigned to this cluster.
  def dissimilarity(point)
    distances = distances_from(point.data)
    total = distances.sum
    total / distances.length.to_f
  end

  private

  # Distance from every assigned point to the given coordinates.
  def distances_from(coordinates)
    Distance.call(points_narray, coordinates)
  end

  # Builds a new Point at the centroid calculated from the assigned points.
  def calculate_centroid_from_points
    Point.new(CalculateCentroid.call(points_narray))
  end

  # The assigned points' data gathered into a single NArray.
  def points_narray
    NArray.to_na(@points.map { |point| point.data })
  end
end
99
+
100
+
101
# Runs the clusterer several times and returns the run with the lowest error.
#
# k    - number of clusters.
# data - Array of rows. Each row is converted with NArray.to_na(row).to_f,
#        or the whole set is passed through scale_data when :scale_data is set.
# opts - Hash of options, also handed unchanged to new:
#        :scale_data - when truthy, standardize the data via scale_data.
#        :runs       - how many independent runs to perform (default 10).
#        :log        - when truthy, print one summary line per run to stdout.
#
# Returns the run whose error is smallest (the earliest such run on a tie),
# or nil when :runs is 0.
# Raises ArgumentError when k is greater than the number of rows.
def self.run(k, data, opts = {})
  raise(ArgumentError, "k cannot be greater than the number of points") if k > data.length

  data = if opts[:scale_data]
    scale_data data
  else
    data.map { |row| NArray.to_na(row).to_f }
  end

  best_run = nil
  best_error = nil

  (opts[:runs] || 10).times do |i|
    km = new(k, data, opts).run
    error = km.error
    if opts[:log]
      puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{error.round(2)} err"
    end

    # Keep only the best run seen so far instead of collecting and sorting
    # every run; strict '<' means the first of several equal runs wins.
    if best_error.nil? || error < best_error
      best_run = km
      best_error = error
    end
  end

  best_run
end
125
+
126
# see scikit-learn scale and _mean_and_std methods
#
# Standardizes +data+: subtracts the mean taken along axis 1 and divides by
# the rmsdev taken along axis 1. Deviations equal to 0 are replaced by 1.0.
# Returns an Array with one NArray per input row.
def self.scale_data(data)
  matrix = NArray.to_na(data).to_f
  center = matrix.mean(1)
  spread = matrix.rmsdev(1)
  spread[spread.eq(0)] = 1.0 # so we don't divide by 0
  scaled = (matrix - center) / spread
  # convert back to an array, containing NArrays for each row
  (0...data.length).map { |row| scaled[true, row] }
end
136
+
137
+
138
+ attr_reader :k, :points, :clusters, :iterations, :runtime
139
+
140
+
141
# k    - number of clusters.
# data - collection of rows; each row becomes a Point.
# opts - Hash of options:
#        :init   - :random, an Array of initial centroids, or anything else
#                  (default :kmpp) for k-means++ initialization.
#        :labels - Array of labels, matched to the rows by position.
#
# Builds the points and then creates the initial clusters.
def initialize(k, data, opts = {})
  @k = k
  @init = opts[:init] || :kmpp
  labels = opts[:labels] || []

  @points = data.each_with_index.map do |instance, i|
    Point.new(instance, labels[i])
  end

  init_clusters
end
152
+
153
# Iterates assignment and recentering until the largest centroid move in a
# pass is below 0.001, or 300 passes have been made.
#
# Sets @iterations (number of passes) and @runtime (elapsed seconds).
# After the call, each cluster holds the points assigned in the final pass.
#
# Returns self.
def run
  start_time = Time.now
  @iterations, @runtime = 0, 0

  loop do
    @iterations += 1

    # Empty the clusters at the start of every pass (not only between
    # passes) so that calling run again does not append each point to its
    # cluster a second time.
    clusters.each(&:reset_points)

    centroids = get_cluster_centroids

    @points.each do |point|
      distances = Distance.call(centroids, point.data)
      # Only the nearest cluster is needed, so take the minimum rather
      # than sorting all clusters for every point.
      nearest = @clusters.each_with_index.min_by { |_cluster, i| distances[i] }.first
      nearest << point
    end

    moves = clusters.map(&:recenter)

    break if moves.max < 0.001 # i.e., no movement
    break if @iterations >= 300
  end

  @runtime = Time.now - start_time
  self
end
179
+
180
# Total error of the clustering: the sum of every cluster's
# sum_of_squares_error (nil when there are no clusters).
def error
  per_cluster = @clusters.map { |cluster| cluster.sum_of_squares_error }
  per_cluster.inject { |total, value| total + value }
end
183
+
184
# Returns the cluster whose centroid is nearest to +point+
# (the origin when no point is given).
def closest_cluster(point = origin)
  ranked = sorted_clusters(point)
  ranked.first
end
187
+
188
# Returns all clusters ordered by the distance from their centroid to
# +point+, nearest first. +point+ may be a Point or raw coordinates (which
# are wrapped in a Point); it defaults to the origin.
def sorted_clusters(point = origin)
  target = point.is_a?(Point) ? point : Point.new(point)
  distances = Distance.call(get_cluster_centroids, target.data)
  @clusters.sort_by.with_index { |_cluster, i| distances[i] }
end
194
+
195
# Returns a Point of zeros with the same dimension as the first data point.
def origin
  zeros = [0] * @points[0].dimension
  Point.new(zeros)
end
198
+
199
# Mean silhouette score over all points.
#
# For each point, a is its mean distance to the points of the nearest
# cluster (by centroid) and b its mean distance to the points of the
# second-nearest cluster; the point's score is (b - a) / max(a, b).
# NOTE(review): the nearest cluster is presumably the point's own cluster,
# in which case a includes the point's zero distance to itself — confirm
# this is intended.
#
# Returns 1.0 when there are fewer than two clusters.
def silhouette_score
  return 1.0 if @clusters.length < 2

  scores = @points.map do |point|
    acluster, bcluster = sorted_clusters(point).slice(0, 2)
    a = acluster.dissimilarity(point)
    b = bcluster.dissimilarity(point)
    widest = [a, b].max
    # When both dissimilarities are zero (e.g. duplicate points) the ratio
    # would be 0.0 / 0.0 = NaN and poison the mean; score such points as 0.
    widest.zero? ? 0.0 : (b - a) / widest
  end

  scores.reduce(:+) / scores.length # mean score for all points
end
211
+
212
+ private
213
# Creates the initial clusters according to @init: :random picks random
# points, an Array supplies the centroids, anything else uses k-means++.
def init_clusters
  if :random == @init
    random_cluster_init
  elsif @init.is_a?(Array)
    custom_cluster_init
  else
    kmpp_cluster_init
  end
end
223
+
224
# k-means++
#
# Seeds @clusters with @k centroids copied from the data points. The first
# is chosen uniformly at random; each further one is drawn with probability
# proportional to the squared distance to its closest existing centroid.
# Clusters are labelled 1, 2, ... in the order they are created.
def kmpp_cluster_init
  @clusters = []
  pick = rand(@points.length)
  centroid = Point.new @points[pick].data.to_a
  @clusters << Cluster.new(centroid, 1)

  while @clusters.length < @k
    centroids = get_cluster_centroids

    d2 = @points.map do |point|
      dists = Distance.call centroids, point.data
      dists.min**2 # closest cluster distance, squared
    end

    pick = kmpp_weighted_pick(d2)
    centroid = Point.new @points[pick].data.to_a
    cluster = Cluster.new(centroid, @clusters.length + 1)
    @clusters << cluster
  end
end

# Draws an index with probability proportional to its weight.
# Falls back to a uniform draw when the weights do not sum to a positive
# number (every point coincides with an existing centroid), and to the last
# positively weighted index if rounding leaves the running total just short
# of the threshold. Zero-weight entries are never selected otherwise.
def kmpp_weighted_pick(weights)
  total = weights.reduce(:+)
  return rand(weights.length) unless total > 0

  threshold = rand * total
  running = 0.0
  weights.index { |w| w > 0 && (running += w) >= threshold } ||
    weights.rindex { |w| w > 0 }
end
250
+
251
# Builds one cluster per entry of @init, using each entry (converted to a
# float NArray) as the centroid. Clusters are labelled from 1 upwards.
def custom_cluster_init
  @clusters = @init.each_with_index.map do |instance, i|
    centroid = Point.new(NArray.to_na(instance).to_f)
    Cluster.new(centroid, i + 1)
  end
end
257
+
258
# Builds the clusters from randomly picked data points, labelled from 1
# upwards in the order they were picked.
def random_cluster_init
  @clusters = pick_k_random_points.each_with_index.map do |centroid, i|
    Cluster.new(centroid, i + 1)
  end
end
261
+
262
# Returns new Points copied from the data of randomly chosen points.
def pick_k_random_points
  pick_k_random_indexes.map do |i|
    coordinates = @points[i].data.to_a
    Point.new(coordinates)
  end
end
265
+
266
# Returns up to @k distinct indexes into @points, in random order.
# Array#sample draws the subset directly instead of shuffling every index
# and slicing off the first @k.
def pick_k_random_indexes
  (0...@points.length).to_a.sample(@k)
end
269
+
270
# Returns the data of every cluster's centroid gathered into one NArray.
def get_cluster_centroids
  centroid_rows = @clusters.map { |cluster| cluster.centroid.data }
  NArray.to_na(centroid_rows)
end
273
+ end
274
+
275
# k-medians variant of KMeansClusterer.
#
# NOTE(review): Ruby resolves constants lexically, so the methods inherited
# from KMeansClusterer (and its Cluster class) that reference Distance and
# CalculateCentroid appear to keep using KMeansClusterer's versions rather
# than the two defined here — confirm, and consider looking the constants
# up through the receiver's class if the overrides are meant to apply.
class KMediansClusterer < KMeansClusterer
  # Sum of absolute differences along axis 0.
  Distance = lambda { |a, b| (a - b).abs.sum(0) }
  # Median along axis 0 of the rot90-rotated NArray.
  CalculateCentroid = lambda { |a| a.rot90.median(0) }

  # Total error of the clustering: the sum of every cluster's
  # sum_of_distances.
  def error
    per_cluster = @clusters.map { |cluster| cluster.sum_of_distances }
    per_cluster.reduce(:+)
  end
end
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kmeans-clusterer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
5
+ platform: ruby
6
+ authors:
7
+ - Geoff Buesing
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-01-29 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: k-means/k-medians clustering. Uses NArray for fast calculations.
14
+ email: gbuesing@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/kmeans-clusterer.rb
20
+ homepage: https://github.com/gbuesing/kmeans-clusterer
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.4.5
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: k-means/k-medians clustering
44
+ test_files: []