kmeans-clusterer 0.9.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/kmeans-clusterer.rb +43 -66
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9b1be676c0018dc27d0d3133b2ea021cf6349814
4
- data.tar.gz: 0ef6ad8299f561accd85006569928b8b2d005a94
3
+ metadata.gz: 873f13d79f2d199400d1e359aca6e56c74d68ac0
4
+ data.tar.gz: a4d1f06a71b2e8289c60badeb93fb007fee06af6
5
5
  SHA512:
6
- metadata.gz: aed54a15689e0b2c785a2d8aca57026959942239ca44576d3ffc835a84981a190daf1b0679580f15b8944a2f58b33c9f98ef60b59598781d65278e506c60ed45
7
- data.tar.gz: 758a21a859d09e981c350d02de7965e39294f4dba667a741b08bab93fab8670a4d8573617c7d0896112c38f7a1e0be4723f9e20fb00a5f190bde55e89e8517e1
6
+ metadata.gz: 1e160f4bbe512e7aac37c2edcbcc63c1f103e51ae4c441a0c2f2e6cc57783344eddbcd31f30ea1a9a32dcc979250fb5bab6e1faba7a1d686612362d4aa798129
7
+ data.tar.gz: 405a6ab16edcb0fcdd958c9963446a44bd2954be9cdb1e95dc728bf7d10e18536334c693ca38f00b956d51358b3ab105aa2a9a3f68c862ab99914e4e4ee894e4
data/lib/kmeans-clusterer.rb CHANGED
@@ -21,13 +21,18 @@ class KMeansClusterer
21
21
  data = (data - mean) / std
22
22
  [NMatrix.ref(data), mean, std]
23
23
  end
24
+
25
+ def self.row_norms data
26
+ squared_data = NArray.ref(data)**2
27
+ NMatrix.ref(squared_data).sum(0)
28
+ end
24
29
  end
25
30
 
26
31
  module Distance
27
32
  def self.euclidean x, y, yy = nil
28
33
  if x.is_a?(NMatrix) && y.is_a?(NMatrix)
29
- xx = x.map {|v| v**2}.sum(0)
30
- yy ||= y.map {|v| v**2}.sum(0)
34
+ xx = Scaler.row_norms(x)
35
+ yy ||= Scaler.row_norms(y)
31
36
  xy = x * y.transpose
32
37
  distance = xy * -2
33
38
  distance += xx
@@ -93,7 +98,7 @@ class KMeansClusterer
93
98
  end
94
99
 
95
100
 
96
- DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double }
101
+ DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double, max_iter: 300 }
97
102
 
98
103
  def self.run k, data, opts = {}
99
104
  opts = DEFAULT_OPTS.merge(opts)
@@ -112,7 +117,7 @@ class KMeansClusterer
112
117
  end
113
118
 
114
119
  opts[:points_matrix] = data
115
- opts[:row_norms] = opts[:points_matrix].map {|v| v**2}.sum(0)
120
+ opts[:row_norms] = Scaler.row_norms(data)
116
121
 
117
122
  bestrun = nil
118
123
 
@@ -147,6 +152,7 @@ class KMeansClusterer
147
152
  @std = opts[:std]
148
153
  @scale_data = opts[:scale_data]
149
154
  @typecode = opts[:typecode]
155
+ @max_iter = opts[:max_iter]
150
156
 
151
157
  init_centroids
152
158
  end
@@ -154,55 +160,60 @@ class KMeansClusterer
154
160
  def run
155
161
  start_time = Time.now
156
162
  @iterations, @runtime = 0, 0
157
-
158
- @cluster_point_ids = Array.new(@k) { [] }
163
+ @cluster_assigns = NArray.int(@points_count)
164
+ min_distances = NArray.new(@typecode, @points_count)
159
165
 
160
166
  loop do
161
167
  @iterations +=1
162
168
 
169
+ min_distances.fill! Float::INFINITY
163
170
  distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
164
171
 
165
- # assign point ids to @cluster_point_ids
166
- @points_count.times do |i|
167
- min_distance_index = distances[i, true].sort_index[0]
168
- @cluster_point_ids[min_distance_index] << i
172
+ @k.times do |cluster_id|
173
+ dist = NArray.ref distances[true, cluster_id].flatten
174
+ mask = dist < min_distances
175
+ @cluster_assigns[mask] = cluster_id
176
+ min_distances[mask] = dist[mask]
169
177
  end
170
178
 
171
- moves = []
172
- updated_centroids = []
179
+ max_move = 0
173
180
 
174
- @k.times do |i|
175
- centroid = NArray.ref(@centroids[true, i].flatten)
176
- point_ids = @cluster_point_ids[i]
181
+ @k.times do |cluster_id|
182
+ centroid = NArray.ref(@centroids[true, cluster_id].flatten)
183
+ point_ids = @cluster_assigns.eq(cluster_id).where
177
184
 
178
- if point_ids.empty?
179
- newcenter = centroid
180
- moves << 0
181
- else
185
+ unless point_ids.empty?
182
186
  points = @points_matrix[true, point_ids]
183
187
  newcenter = points.mean(1)
184
- moves << Distance.euclidean(centroid, newcenter)
188
+ move = Distance.euclidean(centroid, newcenter)
189
+ max_move = move if move > max_move
190
+ @centroids[true, cluster_id] = newcenter
185
191
  end
186
-
187
- updated_centroids << newcenter
188
192
  end
189
193
 
190
- @centroids = NMatrix.cast updated_centroids, @typecode
191
-
192
- break if moves.max < 0.001 # i.e., no movement
193
- break if @iterations >= 300
194
-
195
- @cluster_point_ids = Array.new(@k) { [] }
194
+ break if max_move < 0.001 # i.e., no movement
195
+ break if @iterations >= @max_iter
196
196
  end
197
197
 
198
- @error = calculate_error
198
+ @error = (min_distances**2).sum
199
199
  @runtime = Time.now - start_time
200
200
  self
201
201
  end
202
202
 
203
203
  def finish
204
- set_points
205
- set_clusters
204
+ @clusters = @k.times.map do |i|
205
+ centroid = NArray.ref @centroids[true, i].flatten
206
+ Cluster.new i, Point.new(-i, centroid)
207
+ end
208
+
209
+ @points = @points_count.times.map do |i|
210
+ data = NArray.ref @points_matrix[true, i].flatten
211
+ point = Point.new(i, data, @labels[i])
212
+ cluster = @clusters[@cluster_assigns[i]]
213
+ cluster.points << point
214
+ point
215
+ end
216
+
206
217
  self
207
218
  end
208
219
 
@@ -305,40 +316,6 @@ class KMeansClusterer
305
316
  @points_count.times.to_a.sample @k
306
317
  end
307
318
 
308
- def set_points
309
- @points = @points_count.times.map do |i|
310
- data = NArray.ref @points_matrix[true, i].flatten
311
- Point.new(i, data, @labels[i])
312
- end
313
- end
314
-
315
- def set_clusters
316
- @clusters = @k.times.map do |i|
317
- centroid = NArray.ref @centroids[true, i].flatten
318
- c = Cluster.new i, Point.new(-i, centroid)
319
- @cluster_point_ids[i].each do |p|
320
- c << @points[p]
321
- end
322
- c
323
- end
324
- end
325
-
326
- def calculate_error
327
- errors = @k.times.map do |i|
328
- centroid = get_centroid i
329
- points = get_points_for_centroid i
330
-
331
- if points.empty?
332
- 0
333
- else
334
- distances = Distance.euclidean points, centroid
335
- (distances**2).sum
336
- end
337
- end
338
-
339
- errors.reduce(:+)
340
- end
341
-
342
319
  def get_point i
343
320
  NArray.ref @points_matrix[true, i].flatten
344
321
  end
@@ -348,7 +325,7 @@ class KMeansClusterer
348
325
  end
349
326
 
350
327
  def get_points_for_centroid i
351
- point_ids = @cluster_point_ids[i]
328
+ point_ids = @cluster_assigns.eq(i).where
352
329
  points = @points_matrix[true, point_ids]
353
330
  points.empty? ? NArray.sfloat(0) : NArray.ref(points)
354
331
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kmeans-clusterer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Geoff Buesing
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-03 00:00:00.000000000 Z
11
+ date: 2015-03-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: narray