kmeans-clusterer 0.4.0 → 0.5.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/kmeans-clusterer.rb +206 -146
  3. metadata +4 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: daff89d4293b6131cf1c684c9a5ef3a3d9dfa878
-  data.tar.gz: 7917de7e84065b9c55c4c501bb2587f6e86aff7b
+  metadata.gz: 83e84b661282a532269410c4bf91d98f7831cc45
+  data.tar.gz: 2ec3bf6b111d67e4f5beced25b088b59592e3cdf
 SHA512:
-  metadata.gz: 795ac3f9fb65b6a40adbb67264413fc3819bf828de13edef857ef340d5e2467de4807e35fb87277bd6aeec687ab93f11b35c0b34a92922d0b5c609a0e731be74
-  data.tar.gz: ce6dc360936e0a3fcdc4be3b4c3fd207d0b0df67ed45c36ea2b476e473eedd11b7fea35c060eabca38f733325d1aba6fa30defa56601bdcac6c65d68c27d8145
+  metadata.gz: 7735bf8e71c4fa4300793e40a299b7a4af267821860092b2dd21db16cfc407bc0b2e6a9b99c37846ffbd02b76976242d9719bfa92e244455be14208b1934d21f
+  data.tar.gz: 54f7bf8afff09caf185dbf95c8baeb4ea471e4496b29967ad88b3fc377e0063803c7810295825fe581fd744048964b592197bcfea6d2aa1178133d39e2937b0b
data/lib/kmeans-clusterer.rb CHANGED
@@ -1,17 +1,34 @@
 require 'narray'
 
 class KMeansClusterer
+  module Scaler
+    def self.mean data
+      data.mean(1)
+    end
+
+    def self.std data
+      std = data.rmsdev(1)
+      std[std.eq(0)] = 1.0 # so we don't divide by 0
+      std
+    end
+
+    def self.scale data, mean = nil, std = nil
+      data = NArray.cast(data, NArray::DFLOAT)
+      mean ||= self.mean(data)
+      std ||= self.std(data)
+      data = (data - mean) / std
+      [data, mean, std]
+    end
+  end
 
-  # Euclidean distance function. Requires instances of NArray as args
-  Distance = -> (a, b) { NMath.sqrt ((a - b)**2).sum(0) }
-  CalculateCentroid = -> (a) { a.mean(1) }
 
   class Point
-    attr_reader :data
+    attr_reader :id, :data
     attr_accessor :cluster, :label
 
-    def initialize data, label = nil
-      @data = NArray.to_na data
+    def initialize id, data, label = nil
+      @id = id
+      @data = data
       @label = label
     end
 
@@ -34,175 +51,161 @@ class KMeansClusterer
 
 
   class Cluster
-    attr_reader :centroid, :points
+    attr_reader :id, :centroid, :points
     attr_accessor :label
 
-    def initialize centroid, label = nil
+    def initialize id, centroid
+      @id = id
       @centroid = centroid
-      @label = label
       @points = []
     end
 
-    def recenter
-      if @points.empty?
-        0
-      else
-        old_centroid = @centroid
-        @centroid = calculate_centroid_from_points
-        Distance.call @centroid.data, old_centroid.data
-      end
-    end
-
     def << point
       point.cluster = self
       @points << point
     end
 
-    def reset_points
-      @points = []
-    end
-
-    def sorted_points
-      distances = Distance.call points_narray, centroid.data
-      @points.sort_by.with_index {|c, i| distances[i] }
-    end
-
-    def sum_of_squares_error
-      if @points.empty?
-        0
-      else
-        distances = Distance.call points_narray, centroid.data
-        (distances**2).sum
-      end
-    end
-
-    def sum_of_distances
-      return 0 if @points.empty?
-      Distance.call(points_narray, centroid.data).sum
+    def points_narray
+      NArray.cast @points.map(&:data)
     end
-
-    def dissimilarity point
-      distances = Distance.call points_narray, point.data
-      distances.sum / distances.length.to_f
-    end
-
-    private
-    def calculate_centroid_from_points
-      data = CalculateCentroid.call points_narray
-      Point.new data
-    end
-
-    def points_narray
-      NArray.to_na @points.map(&:data)
-    end
   end
 
 
+  DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp}
+
   def self.run k, data, opts = {}
-    raise(ArgumentError, "k cannot be greater than the number of points") if k > data.length
+    opts = DEFAULT_OPTS.merge(opts)
 
-    data = if opts[:scale_data]
-      scale_data data
-    else
-      data.map {|row| NArray.to_na(row).to_f}
+    opts[:k] = k
+
+    if opts[:scale_data]
+      data, mean, std = Scaler.scale(data)
+      opts[:mean] = mean
+      opts[:std] = std
     end
 
-    runcount = opts[:runs] || 10
-    errors = []
+    opts[:points_matrix] = NMatrix.cast(data, NArray::DFLOAT)
+    opts[:row_norms] = opts[:points_matrix].map {|v| v**2}.sum(0)
 
-    runs = runcount.times.map do |i|
-      km = new(k, data, opts).run
-      error = km.error
+    runs = opts[:runs].times.map do |i|
+      km = new(opts).run
       if opts[:log]
-        puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{error.round(2)} err"
+        puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{km.error.round(2)} err"
       end
-      errors << error
       km
     end
 
-    runs.sort_by.with_index {|run, i| errors[i] }.first
+    runs.sort_by {|run| run.error }.first.finish
   end
 
-  # see scikit-learn scale and _mean_and_std methods
-  def self.scale_data data
-    nadata = NArray.to_na(data).to_f
-    mean = nadata.mean(1)
-    std = nadata.rmsdev(1)
-    std[std.eq(0)] = 1.0 # so we don't divide by 0
-    nadata = (nadata - mean) / std
-    # convert back to an array, containing NArrays for each row
-    data.length.times.map {|i| nadata[true, i] }
-  end
 
+  attr_reader :k, :points, :clusters, :error, :iterations, :runtime
 
-  attr_reader :k, :points, :clusters, :iterations, :runtime
 
+  def initialize opts = {}
+    @k = opts[:k]
+    @init = opts[:init]
+    @labels = opts[:labels] || []
+    @row_norms = opts[:row_norms]
 
-  def initialize k, data, opts = {}
-    @k = k
-    @init = opts[:init] || :kmpp
-    labels = opts[:labels] || []
+    @points_matrix = opts[:points_matrix]
+    @points_count = @points_matrix.shape[1] if @points_matrix
+    @mean = opts[:mean]
+    @std = opts[:std]
+    @scale_data = opts[:scale_data]
 
-    @points = data.map.with_index do |instance, i|
-      Point.new instance, labels[i]
-    end
-
-    init_clusters
+    init_centroids
   end
 
   def run
     start_time = Time.now
     @iterations, @runtime = 0, 0
 
+    @cluster_point_ids = Array.new(@k) { [] }
+
     loop do
       @iterations +=1
 
-      centroids = get_cluster_centroids
+      distances = distance(@centroids, @points_matrix)
+
+      # assign point ids to @cluster_point_ids
+      @points_count.times do |i|
+        min_distance_index = distances[i, true].sort_index[0]
+        @cluster_point_ids[min_distance_index] << i
+      end
+
+      moves = []
+      updated_centroids = []
+
+      @k.times do |i|
+        centroid = NArray.cast(@centroids[true, i].flatten)
+        point_ids = @cluster_point_ids[i]
+
+        if point_ids.empty?
+          newcenter = centroid
+          moves << 0
+        else
+          points = @points_matrix[true, point_ids]
+          newcenter = points.mean(1)
+          moves << distance(centroid, newcenter)
+        end
 
-      @points.each do |point|
-        distances = Distance.call(centroids, point.data)
-        cluster = @clusters.sort_by.with_index {|c, i| distances[i] }.first
-        cluster << point
+        updated_centroids << newcenter
      end
 
-      moves = clusters.map(&:recenter)
+      @centroids = NMatrix.cast updated_centroids
 
       break if moves.max < 0.001 # i.e., no movement
       break if @iterations >= 300
 
-      clusters.each(&:reset_points)
+      @cluster_point_ids = Array.new(@k) { [] }
     end
 
+    @error = calculate_error
     @runtime = Time.now - start_time
     self
   end
 
-  def error
-    @clusters.map(&:sum_of_squares_error).reduce(:+)
+  def finish
+    set_points
+    set_clusters
+    self
  end
 
-  def closest_cluster point = origin
-    sorted_clusters(point).first
+  def predict data
+    data, _m, _s = Scaler.scale(data, @mean, @std) if @scale_data
+    data = NMatrix.cast(data, NArray::DFLOAT)
+    distances = distance(@centroids, data, nil)
+    data.shape[1].times.map do |i|
+      distances[i, true].sort_index[0] # index of closest cluster
+    end
  end
 
   def sorted_clusters point = origin
-    point = Point.new(point) unless point.is_a?(Point)
+    point = wrap_point point
     centroids = get_cluster_centroids
-    distances = Distance.call(centroids, point.data)
+    distances = distance(centroids, point.data)
     @clusters.sort_by.with_index {|c, i| distances[i] }
   end
 
   def origin
-    Point.new Array.new(@points[0].dimension, 0)
+    wrap_point Array.new(@points[0].dimension, 0)
   end
 
   def silhouette_score
-    return 1.0 if @clusters.length < 2
-
-    scores = @points.map do |point|
-      acluster, bcluster = sorted_clusters(point).slice(0,2)
-      a = acluster.dissimilarity(point)
-      b = bcluster.dissimilarity(point)
+    return 1.0 if @k < 2
+
+    distances = distance(@centroids, @points_matrix)
+
+    scores = @points_count.times.map do |i|
+      point = get_point i
+      cluster_indexes = distances[i, true].sort_index
+
+      c1_points = get_points_for_centroid cluster_indexes[0]
+      c2_points = get_points_for_centroid cluster_indexes[1]
+
+      a = dissimilarity(c1_points, point)
+      b = dissimilarity(c2_points, point)
       (b - a) / [a,b].max
     end
 
@@ -210,73 +213,130 @@ class KMeansClusterer
   end
 
   private
-  def init_clusters
+  def wrap_point point
+    return point if point.is_a?(Point)
+    Point.new(0, NArray.to_na(point).to_f)
+  end
+
+  def dissimilarity points, point
+    distances = distance points, point
+    distances.sum / distances.length.to_f
+  end
+
+  def init_centroids
     case @init
     when :random
-      random_cluster_init
+      random_centroid_init
    when Array
-      custom_cluster_init
+      custom_centroid_init
    else
-      kmpp_cluster_init
+      kmpp_centroid_init
    end
  end
 
   # k-means++
-  def kmpp_cluster_init
-    @clusters = []
-    pick = rand(@points.length)
-    centroid = Point.new @points[pick].data.to_a
-    @clusters << Cluster.new(centroid, 1)
-
-    while @clusters.length < @k
-      centroids = get_cluster_centroids
-
-      d2 = @points.map do |point|
-        dists = Distance.call centroids, point.data
-        dists.min**2 # closest cluster distance, squared
+  def kmpp_centroid_init
+    centroid_ids = []
+    pick = rand(@points_count)
+    centroid_ids << pick
+
+    while centroid_ids.length < @k
+      centroids = @points_matrix[true, centroid_ids]
+
+      distances = distance(centroids, @points_matrix)
+
+      d2 = []
+      @points_count.times do |i|
+        min_distance = distances[i, true].min
+        d2 << min_distance**2
      end
 
       d2 = NArray.to_na d2
       probs = d2 / d2.sum
       cumprobs = probs.cumsum
       r = rand
-      # pick = cumprobs.to_a.index {|prob| r < prob }
       pick = (cumprobs >= r).where[0]
-      centroid = Point.new @points[pick].data.to_a
-      cluster = Cluster.new(centroid, @clusters.length + 1)
-      @clusters << cluster
+      centroid_ids << pick
    end
-  end
 
-  def custom_cluster_init
-    @clusters = @init.map.with_index do |instance, i|
-      point = Point.new NArray.to_na(instance).to_f
-      Cluster.new point, i+1
-    end
+    @centroids = @points_matrix[true, centroid_ids]
  end
 
-  def random_cluster_init
-    @clusters = pick_k_random_points.map.with_index {|centroid, i| Cluster.new centroid, i+1 }
+  def custom_centroid_init
+    @centroids = NMatrix.cast @init
+    @k = @init.length
  end
 
-  def pick_k_random_points
-    pick_k_random_indexes.map {|i| Point.new @points[i].data.to_a }
+  def random_centroid_init
+    @centroids = @points_matrix[true, pick_k_random_indexes]
  end
 
   def pick_k_random_indexes
-    @points.length.times.to_a.shuffle.slice(0, @k)
+    @points_count.times.to_a.shuffle.slice(0, @k)
  end
 
   def get_cluster_centroids
     NArray.to_na @clusters.map {|c| c.centroid.data }
   end
-end
 
-class KMediansClusterer < KMeansClusterer
-  Distance = -> (a, b) { (a - b).abs.sum(0) }
-  CalculateCentroid = -> (a) { a.rot90.median(0) }
+  def set_points
+    @points = @points_count.times.map do |i|
+      data = NArray.cast @points_matrix[true, i].flatten
+      Point.new(i, data, @labels[i])
+    end
+  end
 
-  def error
-    @clusters.map(&:sum_of_distances).reduce(:+)
-  end
+  def set_clusters
+    @clusters = @k.times.map do |i|
+      centroid = NArray.cast @centroids[true, i].flatten
+      c = Cluster.new i, Point.new(-i, centroid)
+      @cluster_point_ids[i].each do |p|
+        c << @points[p]
+      end
+      c
+    end
+  end
+
+  def calculate_error
+    errors = @k.times.map do |i|
+      centroid = get_centroid i
+      points = get_points_for_centroid i
+
+      if points.empty?
+        0
+      else
+        distances = distance points, centroid
+        (distances**2).sum
+      end
+    end
+
+    errors.reduce(:+)
+  end
+
+  def get_point i
+    NArray.cast @points_matrix[true, i].flatten
+  end
+
+  def get_centroid i
+    NArray.cast(@centroids[true, i].flatten)
+  end
+
+  def get_points_for_centroid i
+    point_ids = @cluster_point_ids[i]
+    NArray.cast @points_matrix[true, point_ids]
+  end
+
+  def distance x, y, yy = @row_norms
+    if x.is_a?(NMatrix) && y.is_a?(NMatrix)
+      xx = x.map {|v| v**2}.sum(0)
+      yy ||= y.map {|v| v**2}.sum(0)
+      xy = x * y.transpose
+      distance = xy * -2
+      distance += xx
+      distance += yy.transpose
+      NMath.sqrt distance
+    else
+      NMath.sqrt ((x - y)**2).sum(0)
+    end
+  end
 end
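For reference, here is a minimal usage sketch of the 0.5.0 API as it appears in the library diff above. The sample data, the choice of k, and the option values are illustrative only and are not taken from the gem's documentation.

require 'kmeans-clusterer'

# Toy 2-D dataset: each row is a point (values chosen only for illustration).
data = [[3, 3], [-3, 3], [3, -3], [-3, -3],
        [3, 4], [-3, 4], [3, -4], [-3, -4]]

# 0.5.0 merges user options into DEFAULT_OPTS, so :scale_data, :runs, :log and
# :init can all be passed here; run returns the best of the restarts with
# finish already called, so points and clusters are populated.
kmeans = KMeansClusterer.run 4, data, scale_data: true, runs: 5

kmeans.clusters.each do |cluster|
  puts "cluster #{cluster.id}: points #{cluster.points.map(&:id).inspect}"
end

puts "SSE: #{kmeans.error.round(2)}"
puts "silhouette: #{kmeans.silhouette_score.round(2)}"

# predict maps new rows to the index of their closest centroid, applying the
# stored mean/std when scale_data was used.
p kmeans.predict([[0, 5], [0, -5]])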
metadata CHANGED
@@ -1,16 +1,16 @@
 --- !ruby/object:Gem::Specification
 name: kmeans-clusterer
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Geoff Buesing
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-29 00:00:00.000000000 Z
+date: 2015-02-05 00:00:00.000000000 Z
 dependencies: []
-description: k-means/k-medians clustering. Uses NArray for fast calculations.
+description: k-means clustering. Uses NArray for fast calculations.
 email: gbuesing@gmail.com
 executables: []
 extensions: []
@@ -40,5 +40,5 @@ rubyforge_project:
 rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
-summary: k-means/k-medians clustering
+summary: k-means clustering
 test_files: []