kmeans-clusterer 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/kmeans-clusterer.rb +282 -0
  3. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: daff89d4293b6131cf1c684c9a5ef3a3d9dfa878
4
+ data.tar.gz: 7917de7e84065b9c55c4c501bb2587f6e86aff7b
5
+ SHA512:
6
+ metadata.gz: 795ac3f9fb65b6a40adbb67264413fc3819bf828de13edef857ef340d5e2467de4807e35fb87277bd6aeec687ab93f11b35c0b34a92922d0b5c609a0e731be74
7
+ data.tar.gz: ce6dc360936e0a3fcdc4be3b4c3fd207d0b0df67ed45c36ea2b476e473eedd11b7fea35c060eabca38f733325d1aba6fa30defa56601bdcac6c65d68c27d8145
@@ -0,0 +1,282 @@
1
+ require 'narray'
2
+
3
# k-means clustering built on NArray.
#
# Input data is an Array of rows, one row per point. The distance and centroid
# functions are looked up through the class of the running clusterer
# (self.class::Distance / self.class::CalculateCentroid), so a subclass can
# replace them simply by redefining those two constants.
class KMeansClusterer

  # Euclidean distance function. Requires instances of NArray as args.
  # Squared differences are summed over NArray dimension 0.
  Distance = ->(a, b) { NMath.sqrt(((a - b)**2).sum(0)) }

  # Centroid function: the mean over NArray dimension 1 of the stacked points.
  CalculateCentroid = ->(a) { a.mean(1) }

  # A single data point: an NArray of coordinates, an optional label and the
  # cluster it was most recently assigned to.
  class Point
    attr_reader :data
    attr_accessor :cluster, :label

    # data  - anything NArray.to_na accepts.
    # label - optional caller-supplied tag, stored untouched.
    def initialize(data, label = nil)
      @data = NArray.to_na data
      @label = label
    end

    # Coordinate at +index+.
    def [](index)
      @data[index]
    end

    def to_a
      @data.to_a
    end

    def to_s
      to_a.to_s
    end

    # Number of elements in the underlying data.
    def dimension
      @data.length
    end
  end


  # A centroid plus the points currently assigned to it.
  class Cluster
    attr_reader :centroid, :points
    attr_accessor :label

    # centroid           - Point used as the cluster center.
    # label              - optional identifier.
    # distance           - callable (a, b) -> distance(s); defaults to
    #                      KMeansClusterer::Distance.
    # calculate_centroid - callable (stacked points) -> centroid data;
    #                      defaults to KMeansClusterer::CalculateCentroid.
    #
    # The two callables are injected because a bare constant reference here
    # is resolved lexically and would never see a subclass override.
    def initialize(centroid, label = nil, distance = Distance, calculate_centroid = CalculateCentroid)
      @centroid = centroid
      @label = label
      @distance = distance
      @calculate_centroid = calculate_centroid
      @points = []
    end

    # Moves the centroid to the center of the assigned points and returns how
    # far it moved. An empty cluster keeps its centroid and reports 0.
    def recenter
      return 0 if @points.empty?

      old_centroid = @centroid
      @centroid = calculate_centroid_from_points
      @distance.call @centroid.data, old_centroid.data
    end

    # Assigns +point+ to this cluster (and records that on the point).
    def <<(point)
      point.cluster = self
      @points << point
    end

    def reset_points
      @points = []
    end

    # Assigned points ordered by increasing distance from the centroid.
    def sorted_points
      return [] if @points.empty? # nothing to stack into an NArray

      distances = distances_to centroid
      @points.sort_by.with_index { |_point, i| distances[i] }
    end

    # Sum of squared point-to-centroid distances; 0 for an empty cluster.
    def sum_of_squares_error
      return 0 if @points.empty?

      (distances_to(centroid)**2).sum
    end

    # Sum of point-to-centroid distances; 0 for an empty cluster.
    def sum_of_distances
      return 0 if @points.empty?

      distances_to(centroid).sum
    end

    # Mean distance from +point+ to the points assigned to this cluster.
    def dissimilarity(point)
      distances = distances_to point
      distances.sum / distances.length.to_f
    end

    private

    # Distances from every assigned point to +target+ (a Point).
    def distances_to(target)
      @distance.call points_narray, target.data
    end

    def calculate_centroid_from_points
      Point.new @calculate_centroid.call(points_narray)
    end

    # All assigned points stacked into one NArray.
    def points_narray
      NArray.to_na @points.map(&:data)
    end
  end


  # Runs the clusterer several times and returns the run with the lowest error.
  #
  # k    - number of clusters.
  # data - Array of rows, one per point.
  # opts - :scale_data (standardize the data first), :runs (default 10),
  #        :log (print one line per run), plus the options of #initialize.
  #
  # Raises ArgumentError when k exceeds the number of points.
  # Returns the best clusterer instance, or nil when :runs is 0.
  def self.run(k, data, opts = {})
    raise(ArgumentError, "k cannot be greater than the number of points") if k > data.length

    data = if opts[:scale_data]
      scale_data data
    else
      data.map { |row| NArray.to_na(row).to_f }
    end

    runcount = opts[:runs] || 10
    best = nil
    best_error = nil

    runcount.times do |i|
      km = new(k, data, opts).run
      error = km.error
      if opts[:log]
        puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{error.round(2)} err"
      end
      # Track the best run as we go instead of sorting every run afterwards.
      if best.nil? || error < best_error
        best = km
        best_error = error
      end
    end

    best
  end

  # Standardizes the data: subtracts the mean and divides by the deviation,
  # both taken over NArray dimension 1.
  # see scikit-learn scale and _mean_and_std methods
  def self.scale_data(data)
    nadata = NArray.to_na(data).to_f
    mean = nadata.mean(1)
    std = nadata.rmsdev(1)
    std[std.eq(0)] = 1.0 # so we don't divide by 0
    nadata = (nadata - mean) / std
    # convert back to an array, containing NArrays for each row
    data.length.times.map { |i| nadata[true, i] }
  end


  attr_reader :k, :points, :clusters, :iterations, :runtime


  # k    - number of clusters.
  # data - Array of rows, one per point.
  # opts - :init   => :kmpp (default, k-means++), :random, or an Array of
  #                   initial centroids;
  #        :labels => Array of labels, matched to the rows by index.
  def initialize(k, data, opts = {})
    @k = k
    @init = opts[:init] || :kmpp
    labels = opts[:labels] || []

    @points = data.map.with_index do |instance, i|
      Point.new instance, labels[i]
    end

    init_clusters
  end

  # Alternates assignment and recentering until no centroid moves by 0.001 or
  # more, or 300 iterations have run. Returns self.
  def run
    start_time = Time.now
    @iterations, @runtime = 0, 0
    distance = distance_function

    # A repeated #run must not append to the previous run's assignments.
    clusters.each(&:reset_points)

    loop do
      @iterations += 1

      centroids = get_cluster_centroids

      @points.each do |point|
        distances = distance.call(centroids, point.data)
        nearest = @clusters.min_by.with_index { |_cluster, i| distances[i] }
        nearest << point
      end

      moves = clusters.map(&:recenter)

      break if moves.max < 0.001 # i.e., no movement
      break if @iterations >= 300

      clusters.each(&:reset_points)
    end

    @runtime = Time.now - start_time
    self
  end

  # Total sum-of-squares error over all clusters.
  def error
    @clusters.map(&:sum_of_squares_error).reduce(:+)
  end

  # Cluster whose centroid is nearest to +point+ (default: the origin).
  def closest_cluster(point = origin)
    sorted_clusters(point).first
  end

  # Clusters ordered by increasing centroid distance from +point+, which may
  # be a Point or raw coordinates (default: the origin).
  def sorted_clusters(point = origin)
    point = Point.new(point) unless point.is_a?(Point)
    centroids = get_cluster_centroids
    distances = distance_function.call(centroids, point.data)
    @clusters.sort_by.with_index { |_cluster, i| distances[i] }
  end

  # A zero-filled point with the same dimension as the first data point.
  def origin
    Point.new Array.new(@points[0].dimension, 0)
  end

  # Mean over all points of (b - a) / max(a, b), where a and b are the
  # dissimilarities to the two clusters with the nearest centroids.
  # Returns 1.0 when there are fewer than two clusters.
  def silhouette_score
    return 1.0 if @clusters.length < 2

    scores = @points.map do |point|
      acluster, bcluster = sorted_clusters(point).slice(0, 2)
      a = acluster.dissimilarity(point)
      b = bcluster.dissimilarity(point)
      denominator = [a, b].max
      # Both dissimilarities are zero (e.g. duplicate points): score the
      # point 0 rather than dividing by zero.
      denominator.zero? ? 0.0 : (b - a) / denominator
    end

    scores.reduce(:+) / scores.length # mean score for all points
  end

  private

  # Distance callable of the concrete class, so subclass overrides apply.
  def distance_function
    self.class::Distance
  end

  # Centroid callable of the concrete class, so subclass overrides apply.
  def centroid_function
    self.class::CalculateCentroid
  end

  # Builds a cluster centered on +data+, wired to this class's functions.
  def build_cluster(data, label)
    Cluster.new Point.new(data), label, distance_function, centroid_function
  end

  def init_clusters
    case @init
    when :random
      random_cluster_init
    when Array
      custom_cluster_init
    else
      kmpp_cluster_init
    end
  end

  # k-means++: the first centroid is a uniformly random point; each further
  # centroid is drawn with probability proportional to the squared distance
  # to the closest centroid chosen so far.
  def kmpp_cluster_init
    distance = distance_function
    @clusters = []
    pick = rand(@points.length)
    @clusters << build_cluster(@points[pick].data.to_a, 1)

    while @clusters.length < @k
      centroids = get_cluster_centroids

      d2 = @points.map do |point|
        dists = distance.call centroids, point.data
        dists.min**2 # closest cluster distance, squared
      end

      d2 = NArray.to_na d2
      total = d2.sum

      pick = if total > 0
        r = rand
        cumprobs = (d2 / total).cumsum
        # Rounding can leave the final cumulative value just below r; fall
        # back to the last point in that case.
        cumprobs.to_a.index { |prob| prob >= r } || (@points.length - 1)
      else
        # Every point coincides with a chosen centroid, so the weights are
        # all zero: pick uniformly instead of dividing by zero.
        rand(@points.length)
      end

      @clusters << build_cluster(@points[pick].data.to_a, @clusters.length + 1)
    end
  end

  # Uses the caller-supplied Array of centroids.
  def custom_cluster_init
    @clusters = @init.map.with_index do |instance, i|
      build_cluster NArray.to_na(instance).to_f, i + 1
    end
  end

  # Uses k distinct, randomly chosen data points as centroids.
  def random_cluster_init
    @clusters = pick_k_random_points.map.with_index do |data, i|
      build_cluster data, i + 1
    end
  end

  # Coordinates (as plain Arrays) of k randomly chosen points.
  def pick_k_random_points
    pick_k_random_indexes.map { |i| @points[i].data.to_a }
  end

  def pick_k_random_indexes
    (0...@points.length).to_a.sample(@k)
  end

  # All current centroids stacked into one NArray.
  def get_cluster_centroids
    NArray.to_na @clusters.map { |c| c.centroid.data }
  end
end
274
+
275
# k-medians variant of KMeansClusterer: declares a Manhattan distance and a
# median-based centroid, and measures error as summed distances.
#
# NOTE(review): Ruby resolves a bare constant lexically, so code inherited
# from KMeansClusterer only picks up the two constants below if it looks them
# up through the receiver's class (e.g. self.class::Distance) -- confirm
# against the base class.
class KMediansClusterer < KMeansClusterer
  # Manhattan distance: absolute differences summed over NArray dimension 0.
  Distance = lambda { |a, b| (a - b).abs.sum(0) }

  # Centroid: median along dimension 0 of the rotated point array.
  CalculateCentroid = lambda { |a| a.rot90.median(0) }

  # Total of every cluster's sum_of_distances.
  def error
    @clusters.map { |cluster| cluster.sum_of_distances }.inject(:+)
  end
end
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kmeans-clusterer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
5
+ platform: ruby
6
+ authors:
7
+ - Geoff Buesing
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-01-29 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: k-means/k-medians clustering. Uses NArray for fast calculations.
14
+ email: gbuesing@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/kmeans-clusterer.rb
20
+ homepage: https://github.com/gbuesing/kmeans-clusterer
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.4.5
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: k-means/k-medians clustering
44
+ test_files: []