kmeans-clusterer 0.10.0 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/kmeans-clusterer.rb +34 -33
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 873f13d79f2d199400d1e359aca6e56c74d68ac0
4
- data.tar.gz: a4d1f06a71b2e8289c60badeb93fb007fee06af6
3
+ metadata.gz: 2c2552e6a8ee7eddd6d03d8fa8cf7038c0458a11
4
+ data.tar.gz: 62172e9d841aa4df7b332a4e2cda55bf426bc9b1
5
5
  SHA512:
6
- metadata.gz: 1e160f4bbe512e7aac37c2edcbcc63c1f103e51ae4c441a0c2f2e6cc57783344eddbcd31f30ea1a9a32dcc979250fb5bab6e1faba7a1d686612362d4aa798129
7
- data.tar.gz: 405a6ab16edcb0fcdd958c9963446a44bd2954be9cdb1e95dc728bf7d10e18536334c693ca38f00b956d51358b3ab105aa2a9a3f68c862ab99914e4e4ee894e4
6
+ metadata.gz: a33d17f4749b00e7ee8a0829e6a402db6eb768bd62f5dcb1e3ed9dcf3020b5ec074b3e5e38a817fb6d4a51ec13b8cf75f189a55b6a3e488738b4dc2828cbb878
7
+ data.tar.gz: c4a2e27da37141435583d1b09cbf4001f4b2c44a5b2900155e2444f45d9663edf0ccab6d4f80c50d05ad85ce31710a7d4558356bded561bf17199e7333c3e14d
data/lib/kmeans-clusterer.rb CHANGED
@@ -46,12 +46,13 @@ class KMeansClusterer
46
46
 
47
47
 
48
48
  class Point
49
- attr_reader :id, :data
49
+ attr_reader :id, :data, :centroid_distances
50
50
  attr_accessor :cluster, :label
51
51
 
52
- def initialize id, data, label = nil
52
+ def initialize id, data, centroid_distances, label = nil
53
53
  @id = id
54
54
  @data = data
55
+ @centroid_distances = centroid_distances
55
56
  @label = label
56
57
  end
57
58
 
@@ -70,6 +71,10 @@ class KMeansClusterer
70
71
  def dimension
71
72
  @data.length
72
73
  end
74
+
75
+ def centroid_distance
76
+ @centroid_distances[@cluster.id]
77
+ end
73
78
  end
74
79
 
75
80
 
@@ -116,7 +121,7 @@ class KMeansClusterer
116
121
  opts[:std] = std
117
122
  end
118
123
 
119
- opts[:points_matrix] = data
124
+ opts[:data] = data
120
125
  opts[:row_norms] = Scaler.row_norms(data)
121
126
 
122
127
  bestrun = nil
@@ -137,7 +142,7 @@ class KMeansClusterer
137
142
  end
138
143
 
139
144
 
140
- attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime
145
+ attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime, :distances, :data
141
146
 
142
147
 
143
148
  def initialize opts = {}
@@ -146,8 +151,8 @@ class KMeansClusterer
146
151
  @labels = opts[:labels] || []
147
152
  @row_norms = opts[:row_norms]
148
153
 
149
- @points_matrix = opts[:points_matrix]
150
- @points_count = @points_matrix.shape[1] if @points_matrix
154
+ @data = opts[:data]
155
+ @points_count = @data.shape[1] if @data
151
156
  @mean = opts[:mean]
152
157
  @std = opts[:std]
153
158
  @scale_data = opts[:scale_data]
@@ -167,10 +172,10 @@ class KMeansClusterer
167
172
  @iterations +=1
168
173
 
169
174
  min_distances.fill! Float::INFINITY
170
- distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
175
+ @distances = Distance.euclidean(@centroids, @data, @row_norms)
171
176
 
172
177
  @k.times do |cluster_id|
173
- dist = NArray.ref distances[true, cluster_id].flatten
178
+ dist = NArray.ref @distances[true, cluster_id].flatten
174
179
  mask = dist < min_distances
175
180
  @cluster_assigns[mask] = cluster_id
176
181
  min_distances[mask] = dist[mask]
@@ -183,7 +188,7 @@ class KMeansClusterer
183
188
  point_ids = @cluster_assigns.eq(cluster_id).where
184
189
 
185
190
  unless point_ids.empty?
186
- points = @points_matrix[true, point_ids]
191
+ points = @data[true, point_ids]
187
192
  newcenter = points.mean(1)
188
193
  move = Distance.euclidean(centroid, newcenter)
189
194
  max_move = move if move > max_move
@@ -203,17 +208,21 @@ class KMeansClusterer
203
208
  def finish
204
209
  @clusters = @k.times.map do |i|
205
210
  centroid = NArray.ref @centroids[true, i].flatten
206
- Cluster.new i, Point.new(-i, centroid)
211
+ Cluster.new i, Point.new(-1, centroid, nil, nil)
207
212
  end
208
213
 
209
214
  @points = @points_count.times.map do |i|
210
- data = NArray.ref @points_matrix[true, i].flatten
211
- point = Point.new(i, data, @labels[i])
215
+ data = NArray.ref @data[true, i].flatten
216
+ point = Point.new(i, data, @distances[i, true], @labels[i])
212
217
  cluster = @clusters[@cluster_assigns[i]]
213
- cluster.points << point
218
+ cluster << point
214
219
  point
215
220
  end
216
221
 
222
+ @clusters.each do |c|
223
+ c.points.sort_by! &:centroid_distance
224
+ end
225
+
217
226
  self
218
227
  end
219
228
 
@@ -236,17 +245,13 @@ class KMeansClusterer
236
245
  def silhouette
237
246
  return 1.0 if @k < 2
238
247
 
239
- distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
248
+ scores = @points.map do |point|
249
+ sort_index = point.centroid_distances.sort_index
250
+ c1_points = get_points_for_cluster sort_index[0]
251
+ c2_points = get_points_for_cluster sort_index[1]
240
252
 
241
- scores = @points_count.times.map do |i|
242
- point = get_point i
243
- cluster_indexes = distances[i, true].sort_index
244
-
245
- c1_points = get_points_for_centroid cluster_indexes[0]
246
- c2_points = get_points_for_centroid cluster_indexes[1]
247
-
248
- a = dissimilarity(c1_points, point)
249
- b = dissimilarity(c2_points, point)
253
+ a = dissimilarity(c1_points, point.data)
254
+ b = dissimilarity(c2_points, point.data)
250
255
  (b - a) / [a,b].max
251
256
  end
252
257
 
@@ -282,9 +287,9 @@ class KMeansClusterer
282
287
  centroid_ids << pick
283
288
 
284
289
  while centroid_ids.length < @k
285
- centroids = @points_matrix[true, centroid_ids]
290
+ centroids = @data[true, centroid_ids]
286
291
 
287
- distances = Distance.euclidean(centroids, @points_matrix, @row_norms)
292
+ distances = Distance.euclidean(centroids, @data, @row_norms)
288
293
 
289
294
  d2 = []
290
295
  @points_count.times do |i|
@@ -300,7 +305,7 @@ class KMeansClusterer
300
305
  centroid_ids << pick
301
306
  end
302
307
 
303
- @centroids = @points_matrix[true, centroid_ids]
308
+ @centroids = @data[true, centroid_ids]
304
309
  end
305
310
 
306
311
  def custom_centroid_init
@@ -309,24 +314,20 @@ class KMeansClusterer
309
314
  end
310
315
 
311
316
  def random_centroid_init
312
- @centroids = @points_matrix[true, pick_k_random_indexes]
317
+ @centroids = @data[true, pick_k_random_indexes]
313
318
  end
314
319
 
315
320
  def pick_k_random_indexes
316
321
  @points_count.times.to_a.sample @k
317
322
  end
318
323
 
319
- def get_point i
320
- NArray.ref @points_matrix[true, i].flatten
321
- end
322
-
323
324
  def get_centroid i
324
325
  NArray.ref(@centroids[true, i].flatten)
325
326
  end
326
327
 
327
- def get_points_for_centroid i
328
+ def get_points_for_cluster i
328
329
  point_ids = @cluster_assigns.eq(i).where
329
- points = @points_matrix[true, point_ids]
330
+ points = @data[true, point_ids]
330
331
  points.empty? ? NArray.sfloat(0) : NArray.ref(points)
331
332
  end
332
333
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kmeans-clusterer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Geoff Buesing
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-10 00:00:00.000000000 Z
11
+ date: 2015-03-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: narray