kmeans-clusterer 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/kmeans-clusterer.rb +34 -33
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 873f13d79f2d199400d1e359aca6e56c74d68ac0
4
- data.tar.gz: a4d1f06a71b2e8289c60badeb93fb007fee06af6
3
+ metadata.gz: 2c2552e6a8ee7eddd6d03d8fa8cf7038c0458a11
4
+ data.tar.gz: 62172e9d841aa4df7b332a4e2cda55bf426bc9b1
5
5
  SHA512:
6
- metadata.gz: 1e160f4bbe512e7aac37c2edcbcc63c1f103e51ae4c441a0c2f2e6cc57783344eddbcd31f30ea1a9a32dcc979250fb5bab6e1faba7a1d686612362d4aa798129
7
- data.tar.gz: 405a6ab16edcb0fcdd958c9963446a44bd2954be9cdb1e95dc728bf7d10e18536334c693ca38f00b956d51358b3ab105aa2a9a3f68c862ab99914e4e4ee894e4
6
+ metadata.gz: a33d17f4749b00e7ee8a0829e6a402db6eb768bd62f5dcb1e3ed9dcf3020b5ec074b3e5e38a817fb6d4a51ec13b8cf75f189a55b6a3e488738b4dc2828cbb878
7
+ data.tar.gz: c4a2e27da37141435583d1b09cbf4001f4b2c44a5b2900155e2444f45d9663edf0ccab6d4f80c50d05ad85ce31710a7d4558356bded561bf17199e7333c3e14d
data/lib/kmeans-clusterer.rb CHANGED
@@ -46,12 +46,13 @@ class KMeansClusterer
46
46
 
47
47
 
48
48
  class Point
49
- attr_reader :id, :data
49
+ attr_reader :id, :data, :centroid_distances
50
50
  attr_accessor :cluster, :label
51
51
 
52
- def initialize id, data, label = nil
52
+ def initialize id, data, centroid_distances, label = nil
53
53
  @id = id
54
54
  @data = data
55
+ @centroid_distances = centroid_distances
55
56
  @label = label
56
57
  end
57
58
 
@@ -70,6 +71,10 @@ class KMeansClusterer
70
71
  def dimension
71
72
  @data.length
72
73
  end
74
+
75
+ def centroid_distance
76
+ @centroid_distances[@cluster.id]
77
+ end
73
78
  end
74
79
 
75
80
 
@@ -116,7 +121,7 @@ class KMeansClusterer
116
121
  opts[:std] = std
117
122
  end
118
123
 
119
- opts[:points_matrix] = data
124
+ opts[:data] = data
120
125
  opts[:row_norms] = Scaler.row_norms(data)
121
126
 
122
127
  bestrun = nil
@@ -137,7 +142,7 @@ class KMeansClusterer
137
142
  end
138
143
 
139
144
 
140
- attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime
145
+ attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime, :distances, :data
141
146
 
142
147
 
143
148
  def initialize opts = {}
@@ -146,8 +151,8 @@ class KMeansClusterer
146
151
  @labels = opts[:labels] || []
147
152
  @row_norms = opts[:row_norms]
148
153
 
149
- @points_matrix = opts[:points_matrix]
150
- @points_count = @points_matrix.shape[1] if @points_matrix
154
+ @data = opts[:data]
155
+ @points_count = @data.shape[1] if @data
151
156
  @mean = opts[:mean]
152
157
  @std = opts[:std]
153
158
  @scale_data = opts[:scale_data]
@@ -167,10 +172,10 @@ class KMeansClusterer
167
172
  @iterations +=1
168
173
 
169
174
  min_distances.fill! Float::INFINITY
170
- distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
175
+ @distances = Distance.euclidean(@centroids, @data, @row_norms)
171
176
 
172
177
  @k.times do |cluster_id|
173
- dist = NArray.ref distances[true, cluster_id].flatten
178
+ dist = NArray.ref @distances[true, cluster_id].flatten
174
179
  mask = dist < min_distances
175
180
  @cluster_assigns[mask] = cluster_id
176
181
  min_distances[mask] = dist[mask]
@@ -183,7 +188,7 @@ class KMeansClusterer
183
188
  point_ids = @cluster_assigns.eq(cluster_id).where
184
189
 
185
190
  unless point_ids.empty?
186
- points = @points_matrix[true, point_ids]
191
+ points = @data[true, point_ids]
187
192
  newcenter = points.mean(1)
188
193
  move = Distance.euclidean(centroid, newcenter)
189
194
  max_move = move if move > max_move
@@ -203,17 +208,21 @@ class KMeansClusterer
203
208
  def finish
204
209
  @clusters = @k.times.map do |i|
205
210
  centroid = NArray.ref @centroids[true, i].flatten
206
- Cluster.new i, Point.new(-i, centroid)
211
+ Cluster.new i, Point.new(-1, centroid, nil, nil)
207
212
  end
208
213
 
209
214
  @points = @points_count.times.map do |i|
210
- data = NArray.ref @points_matrix[true, i].flatten
211
- point = Point.new(i, data, @labels[i])
215
+ data = NArray.ref @data[true, i].flatten
216
+ point = Point.new(i, data, @distances[i, true], @labels[i])
212
217
  cluster = @clusters[@cluster_assigns[i]]
213
- cluster.points << point
218
+ cluster << point
214
219
  point
215
220
  end
216
221
 
222
+ @clusters.each do |c|
223
+ c.points.sort_by! &:centroid_distance
224
+ end
225
+
217
226
  self
218
227
  end
219
228
 
@@ -236,17 +245,13 @@ class KMeansClusterer
236
245
  def silhouette
237
246
  return 1.0 if @k < 2
238
247
 
239
- distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
248
+ scores = @points.map do |point|
249
+ sort_index = point.centroid_distances.sort_index
250
+ c1_points = get_points_for_cluster sort_index[0]
251
+ c2_points = get_points_for_cluster sort_index[1]
240
252
 
241
- scores = @points_count.times.map do |i|
242
- point = get_point i
243
- cluster_indexes = distances[i, true].sort_index
244
-
245
- c1_points = get_points_for_centroid cluster_indexes[0]
246
- c2_points = get_points_for_centroid cluster_indexes[1]
247
-
248
- a = dissimilarity(c1_points, point)
249
- b = dissimilarity(c2_points, point)
253
+ a = dissimilarity(c1_points, point.data)
254
+ b = dissimilarity(c2_points, point.data)
250
255
  (b - a) / [a,b].max
251
256
  end
252
257
 
@@ -282,9 +287,9 @@ class KMeansClusterer
282
287
  centroid_ids << pick
283
288
 
284
289
  while centroid_ids.length < @k
285
- centroids = @points_matrix[true, centroid_ids]
290
+ centroids = @data[true, centroid_ids]
286
291
 
287
- distances = Distance.euclidean(centroids, @points_matrix, @row_norms)
292
+ distances = Distance.euclidean(centroids, @data, @row_norms)
288
293
 
289
294
  d2 = []
290
295
  @points_count.times do |i|
@@ -300,7 +305,7 @@ class KMeansClusterer
300
305
  centroid_ids << pick
301
306
  end
302
307
 
303
- @centroids = @points_matrix[true, centroid_ids]
308
+ @centroids = @data[true, centroid_ids]
304
309
  end
305
310
 
306
311
  def custom_centroid_init
@@ -309,24 +314,20 @@ class KMeansClusterer
309
314
  end
310
315
 
311
316
  def random_centroid_init
312
- @centroids = @points_matrix[true, pick_k_random_indexes]
317
+ @centroids = @data[true, pick_k_random_indexes]
313
318
  end
314
319
 
315
320
  def pick_k_random_indexes
316
321
  @points_count.times.to_a.sample @k
317
322
  end
318
323
 
319
- def get_point i
320
- NArray.ref @points_matrix[true, i].flatten
321
- end
322
-
323
324
  def get_centroid i
324
325
  NArray.ref(@centroids[true, i].flatten)
325
326
  end
326
327
 
327
- def get_points_for_centroid i
328
+ def get_points_for_cluster i
328
329
  point_ids = @cluster_assigns.eq(i).where
329
- points = @points_matrix[true, point_ids]
330
+ points = @data[true, point_ids]
330
331
  points.empty? ? NArray.sfloat(0) : NArray.ref(points)
331
332
  end
332
333
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kmeans-clusterer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Geoff Buesing
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-10 00:00:00.000000000 Z
11
+ date: 2015-03-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: narray