kmeans-clusterer 0.4.0 → 0.5.0

Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/kmeans-clusterer.rb +206 -146
  3. metadata +4 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: daff89d4293b6131cf1c684c9a5ef3a3d9dfa878
-  data.tar.gz: 7917de7e84065b9c55c4c501bb2587f6e86aff7b
+  metadata.gz: 83e84b661282a532269410c4bf91d98f7831cc45
+  data.tar.gz: 2ec3bf6b111d67e4f5beced25b088b59592e3cdf
 SHA512:
-  metadata.gz: 795ac3f9fb65b6a40adbb67264413fc3819bf828de13edef857ef340d5e2467de4807e35fb87277bd6aeec687ab93f11b35c0b34a92922d0b5c609a0e731be74
-  data.tar.gz: ce6dc360936e0a3fcdc4be3b4c3fd207d0b0df67ed45c36ea2b476e473eedd11b7fea35c060eabca38f733325d1aba6fa30defa56601bdcac6c65d68c27d8145
+  metadata.gz: 7735bf8e71c4fa4300793e40a299b7a4af267821860092b2dd21db16cfc407bc0b2e6a9b99c37846ffbd02b76976242d9719bfa92e244455be14208b1934d21f
+  data.tar.gz: 54f7bf8afff09caf185dbf95c8baeb4ea471e4496b29967ad88b3fc377e0063803c7810295825fe581fd744048964b592197bcfea6d2aa1178133d39e2937b0b
data/lib/kmeans-clusterer.rb CHANGED
@@ -1,17 +1,34 @@
 require 'narray'
 
 class KMeansClusterer
+  module Scaler
+    def self.mean data
+      data.mean(1)
+    end
+
+    def self.std data
+      std = data.rmsdev(1)
+      std[std.eq(0)] = 1.0 # so we don't divide by 0
+      std
+    end
+
+    def self.scale data, mean = nil, std = nil
+      data = NArray.cast(data, NArray::DFLOAT)
+      mean ||= self.mean(data)
+      std ||= self.std(data)
+      data = (data - mean) / std
+      [data, mean, std]
+    end
+  end
 
-  # Euclidean distance function. Requires instances of NArray as args
-  Distance = -> (a, b) { NMath.sqrt ((a - b)**2).sum(0) }
-  CalculateCentroid = -> (a) { a.mean(1) }
 
   class Point
-    attr_reader :data
+    attr_reader :id, :data
     attr_accessor :cluster, :label
 
-    def initialize data, label = nil
-      @data = NArray.to_na data
+    def initialize id, data, label = nil
+      @id = id
+      @data = data
       @label = label
     end
 
@@ -34,175 +51,161 @@ class KMeansClusterer
 
 
   class Cluster
-    attr_reader :centroid, :points
+    attr_reader :id, :centroid, :points
     attr_accessor :label
 
-    def initialize centroid, label = nil
+    def initialize id, centroid
+      @id = id
      @centroid = centroid
-      @label = label
      @points = []
    end
 
-    def recenter
-      if @points.empty?
-        0
-      else
-        old_centroid = @centroid
-        @centroid = calculate_centroid_from_points
-        Distance.call @centroid.data, old_centroid.data
-      end
-    end
-
    def << point
      point.cluster = self
      @points << point
    end
 
-    def reset_points
-      @points = []
-    end
-
-    def sorted_points
-      distances = Distance.call points_narray, centroid.data
-      @points.sort_by.with_index {|c, i| distances[i] }
-    end
-
-    def sum_of_squares_error
-      if @points.empty?
-        0
-      else
-        distances = Distance.call points_narray, centroid.data
-        (distances**2).sum
-      end
-    end
-
-    def sum_of_distances
-      return 0 if @points.empty?
-      Distance.call(points_narray, centroid.data).sum
+    def points_narray
+      NArray.cast @points.map(&:data)
    end
-
-    def dissimilarity point
-      distances = Distance.call points_narray, point.data
-      distances.sum / distances.length.to_f
-    end
-
-    private
-    def calculate_centroid_from_points
-      data = CalculateCentroid.call points_narray
-      Point.new data
-    end
-
-    def points_narray
-      NArray.to_na @points.map(&:data)
-    end
  end
 
 
+  DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp}
+
  def self.run k, data, opts = {}
-    raise(ArgumentError, "k cannot be greater than the number of points") if k > data.length
+    opts = DEFAULT_OPTS.merge(opts)
 
-    data = if opts[:scale_data]
-      scale_data data
-    else
-      data.map {|row| NArray.to_na(row).to_f}
+    opts[:k] = k
+
+    if opts[:scale_data]
+      data, mean, std = Scaler.scale(data)
+      opts[:mean] = mean
+      opts[:std] = std
    end
 
-    runcount = opts[:runs] || 10
-    errors = []
+    opts[:points_matrix] = NMatrix.cast(data, NArray::DFLOAT)
+    opts[:row_norms] = opts[:points_matrix].map {|v| v**2}.sum(0)
 
-    runs = runcount.times.map do |i|
-      km = new(k, data, opts).run
-      error = km.error
+    runs = opts[:runs].times.map do |i|
+      km = new(opts).run
      if opts[:log]
-        puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{error.round(2)} err"
+        puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{km.error.round(2)} err"
      end
-      errors << error
      km
    end
 
-    runs.sort_by.with_index {|run, i| errors[i] }.first
+    runs.sort_by {|run| run.error }.first.finish
  end
 
-  # see scikit-learn scale and _mean_and_std methods
-  def self.scale_data data
-    nadata = NArray.to_na(data).to_f
-    mean = nadata.mean(1)
-    std = nadata.rmsdev(1)
-    std[std.eq(0)] = 1.0 # so we don't divide by 0
-    nadata = (nadata - mean) / std
-    # convert back to an array, containing NArrays for each row
-    data.length.times.map {|i| nadata[true, i] }
-  end
 
+  attr_reader :k, :points, :clusters, :error, :iterations, :runtime
 
-  attr_reader :k, :points, :clusters, :iterations, :runtime
 
+  def initialize opts = {}
+    @k = opts[:k]
+    @init = opts[:init]
+    @labels = opts[:labels] || []
+    @row_norms = opts[:row_norms]
 
-  def initialize k, data, opts = {}
-    @k = k
-    @init = opts[:init] || :kmpp
-    labels = opts[:labels] || []
+    @points_matrix = opts[:points_matrix]
+    @points_count = @points_matrix.shape[1] if @points_matrix
+    @mean = opts[:mean]
+    @std = opts[:std]
+    @scale_data = opts[:scale_data]
 
-    @points = data.map.with_index do |instance, i|
-      Point.new instance, labels[i]
-    end
-
-    init_clusters
+    init_centroids
  end
 
  def run
    start_time = Time.now
    @iterations, @runtime = 0, 0
 
+    @cluster_point_ids = Array.new(@k) { [] }
+
    loop do
      @iterations +=1
 
-      centroids = get_cluster_centroids
+      distances = distance(@centroids, @points_matrix)
+
+      # assign point ids to @cluster_point_ids
+      @points_count.times do |i|
+        min_distance_index = distances[i, true].sort_index[0]
+        @cluster_point_ids[min_distance_index] << i
+      end
+
+      moves = []
+      updated_centroids = []
+
+      @k.times do |i|
+        centroid = NArray.cast(@centroids[true, i].flatten)
+        point_ids = @cluster_point_ids[i]
+
+        if point_ids.empty?
+          newcenter = centroid
+          moves << 0
+        else
+          points = @points_matrix[true, point_ids]
+          newcenter = points.mean(1)
+          moves << distance(centroid, newcenter)
+        end
 
-      @points.each do |point|
-        distances = Distance.call(centroids, point.data)
-        cluster = @clusters.sort_by.with_index {|c, i| distances[i] }.first
-        cluster << point
+        updated_centroids << newcenter
      end
 
-      moves = clusters.map(&:recenter)
+      @centroids = NMatrix.cast updated_centroids
 
      break if moves.max < 0.001 # i.e., no movement
      break if @iterations >= 300
 
-      clusters.each(&:reset_points)
+      @cluster_point_ids = Array.new(@k) { [] }
    end
 
+    @error = calculate_error
    @runtime = Time.now - start_time
    self
  end
 
-  def error
-    @clusters.map(&:sum_of_squares_error).reduce(:+)
+  def finish
+    set_points
+    set_clusters
+    self
  end
 
-  def closest_cluster point = origin
-    sorted_clusters(point).first
+  def predict data
+    data, _m, _s = Scaler.scale(data, @mean, @std) if @scale_data
+    data = NMatrix.cast(data, NArray::DFLOAT)
+    distances = distance(@centroids, data, nil)
+    data.shape[1].times.map do |i|
+      distances[i, true].sort_index[0] # index of closest cluster
+    end
  end
 
  def sorted_clusters point = origin
-    point = Point.new(point) unless point.is_a?(Point)
+    point = wrap_point point
    centroids = get_cluster_centroids
-    distances = Distance.call(centroids, point.data)
+    distances = distance(centroids, point.data)
    @clusters.sort_by.with_index {|c, i| distances[i] }
  end
 
  def origin
-    Point.new Array.new(@points[0].dimension, 0)
+    wrap_point Array.new(@points[0].dimension, 0)
  end
 
  def silhouette_score
-    return 1.0 if @clusters.length < 2
-
-    scores = @points.map do |point|
-      acluster, bcluster = sorted_clusters(point).slice(0,2)
-      a = acluster.dissimilarity(point)
-      b = bcluster.dissimilarity(point)
+    return 1.0 if @k < 2
+
+    distances = distance(@centroids, @points_matrix)
+
+    scores = @points_count.times.map do |i|
+      point = get_point i
+      cluster_indexes = distances[i, true].sort_index
+
+      c1_points = get_points_for_centroid cluster_indexes[0]
+      c2_points = get_points_for_centroid cluster_indexes[1]
+
+      a = dissimilarity(c1_points, point)
+      b = dissimilarity(c2_points, point)
      (b - a) / [a,b].max
    end
 
@@ -210,73 +213,130 @@ class KMeansClusterer
  end
 
  private
-  def init_clusters
+  def wrap_point point
+    return point if point.is_a?(Point)
+    Point.new(0, NArray.to_na(point).to_f)
+  end
+
+  def dissimilarity points, point
+    distances = distance points, point
+    distances.sum / distances.length.to_f
+  end
+
+  def init_centroids
    case @init
    when :random
-      random_cluster_init
+      random_centroid_init
    when Array
-      custom_cluster_init
+      custom_centroid_init
    else
-      kmpp_cluster_init
+      kmpp_centroid_init
    end
  end
 
  # k-means++
-  def kmpp_cluster_init
-    @clusters = []
-    pick = rand(@points.length)
-    centroid = Point.new @points[pick].data.to_a
-    @clusters << Cluster.new(centroid, 1)
-
-    while @clusters.length < @k
-      centroids = get_cluster_centroids
-
-      d2 = @points.map do |point|
-        dists = Distance.call centroids, point.data
-        dists.min**2 # closest cluster distance, squared
+  def kmpp_centroid_init
+    centroid_ids = []
+    pick = rand(@points_count)
+    centroid_ids << pick
+
+    while centroid_ids.length < @k
+      centroids = @points_matrix[true, centroid_ids]
+
+      distances = distance(centroids, @points_matrix)
+
+      d2 = []
+      @points_count.times do |i|
+        min_distance = distances[i, true].min
+        d2 << min_distance**2
      end
 
      d2 = NArray.to_na d2
      probs = d2 / d2.sum
      cumprobs = probs.cumsum
      r = rand
-      # pick = cumprobs.to_a.index {|prob| r < prob }
      pick = (cumprobs >= r).where[0]
-      centroid = Point.new @points[pick].data.to_a
-      cluster = Cluster.new(centroid, @clusters.length + 1)
-      @clusters << cluster
+      centroid_ids << pick
    end
-  end
 
-  def custom_cluster_init
-    @clusters = @init.map.with_index do |instance, i|
-      point = Point.new NArray.to_na(instance).to_f
-      Cluster.new point, i+1
-    end
+    @centroids = @points_matrix[true, centroid_ids]
  end
 
-  def random_cluster_init
-    @clusters = pick_k_random_points.map.with_index {|centroid, i| Cluster.new centroid, i+1 }
+  def custom_centroid_init
+    @centroids = NMatrix.cast @init
+    @k = @init.length
  end
 
-  def pick_k_random_points
-    pick_k_random_indexes.map {|i| Point.new @points[i].data.to_a }
+  def random_centroid_init
+    @centroids = @points_matrix[true, pick_k_random_indexes]
  end
 
  def pick_k_random_indexes
-    @points.length.times.to_a.shuffle.slice(0, @k)
+    @points_count.times.to_a.shuffle.slice(0, @k)
  end
 
  def get_cluster_centroids
    NArray.to_na @clusters.map {|c| c.centroid.data }
  end
-end
 
-class KMediansClusterer < KMeansClusterer
-  Distance = -> (a, b) { (a - b).abs.sum(0) }
-  CalculateCentroid = -> (a) { a.rot90.median(0) }
+  def set_points
+    @points = @points_count.times.map do |i|
+      data = NArray.cast @points_matrix[true, i].flatten
+      Point.new(i, data, @labels[i])
+    end
+  end
 
-  def error
-    @clusters.map(&:sum_of_distances).reduce(:+)
-  end
+  def set_clusters
+    @clusters = @k.times.map do |i|
+      centroid = NArray.cast @centroids[true, i].flatten
+      c = Cluster.new i, Point.new(-i, centroid)
+      @cluster_point_ids[i].each do |p|
+        c << @points[p]
+      end
+      c
+    end
+  end
+
+  def calculate_error
+    errors = @k.times.map do |i|
+      centroid = get_centroid i
+      points = get_points_for_centroid i
+
+      if points.empty?
+        0
+      else
+        distances = distance points, centroid
+        (distances**2).sum
+      end
+    end
+
+    errors.reduce(:+)
+  end
+
+  def get_point i
+    NArray.cast @points_matrix[true, i].flatten
+  end
+
+  def get_centroid i
+    NArray.cast(@centroids[true, i].flatten)
+  end
+
+  def get_points_for_centroid i
+    point_ids = @cluster_point_ids[i]
+    NArray.cast @points_matrix[true, point_ids]
+  end
+
+  def distance x, y, yy = @row_norms
+    if x.is_a?(NMatrix) && y.is_a?(NMatrix)
+      xx = x.map {|v| v**2}.sum(0)
+      yy ||= y.map {|v| v**2}.sum(0)
+      xy = x * y.transpose
+      distance = xy * -2
+      distance += xx
+      distance += yy.transpose
+      NMath.sqrt distance
+    else
+      NMath.sqrt ((x - y)**2).sum(0)
+    end
+  end
 end
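
For orientation, here is a minimal usage sketch of the 0.5.0 API shown in the diff above; the sample coordinates, labels, and choice of k are illustrative and not taken from the gem itself.

require 'kmeans-clusterer'

# Toy 2-D data; labels are optional and are only attached to points for reporting.
data = [[40.71, -74.01], [34.05, -118.24], [39.29, -76.61],
        [45.52, -122.68], [38.9, -77.04], [36.17, -115.14]]
labels = ['New York', 'Los Angeles', 'Baltimore',
          'Portland', 'Washington DC', 'Las Vegas']

# run performs opts[:runs] restarts, keeps the run with the lowest
# sum-of-squares error, and calls #finish on it, so points and clusters
# are materialized on the returned instance.
kmeans = KMeansClusterer.run 2, data, labels: labels, runs: 5

kmeans.clusters.each do |cluster|
  puts "Cluster #{cluster.id}: #{cluster.points.map(&:label).join(', ')}"
end

puts "SSE: #{kmeans.error.round(2)}"
puts "Silhouette score: #{kmeans.silhouette_score.round(2)}"

# predict maps new rows to the index of the nearest centroid.
p kmeans.predict([[41.85, -87.65]])
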
metadata CHANGED
@@ -1,16 +1,16 @@
 --- !ruby/object:Gem::Specification
 name: kmeans-clusterer
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Geoff Buesing
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-29 00:00:00.000000000 Z
+date: 2015-02-05 00:00:00.000000000 Z
 dependencies: []
-description: k-means/k-medians clustering. Uses NArray for fast calculations.
+description: k-means clustering. Uses NArray for fast calculations.
 email: gbuesing@gmail.com
 executables: []
 extensions: []
@@ -40,5 +40,5 @@ rubyforge_project:
 rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
-summary: k-means/k-medians clustering
+summary: k-means clustering
 test_files: []