kmeans-clusterer 0.9.0 → 0.10.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/kmeans-clusterer.rb +43 -66
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9b1be676c0018dc27d0d3133b2ea021cf6349814
4
- data.tar.gz: 0ef6ad8299f561accd85006569928b8b2d005a94
3
+ metadata.gz: 873f13d79f2d199400d1e359aca6e56c74d68ac0
4
+ data.tar.gz: a4d1f06a71b2e8289c60badeb93fb007fee06af6
5
5
  SHA512:
6
- metadata.gz: aed54a15689e0b2c785a2d8aca57026959942239ca44576d3ffc835a84981a190daf1b0679580f15b8944a2f58b33c9f98ef60b59598781d65278e506c60ed45
7
- data.tar.gz: 758a21a859d09e981c350d02de7965e39294f4dba667a741b08bab93fab8670a4d8573617c7d0896112c38f7a1e0be4723f9e20fb00a5f190bde55e89e8517e1
6
+ metadata.gz: 1e160f4bbe512e7aac37c2edcbcc63c1f103e51ae4c441a0c2f2e6cc57783344eddbcd31f30ea1a9a32dcc979250fb5bab6e1faba7a1d686612362d4aa798129
7
+ data.tar.gz: 405a6ab16edcb0fcdd958c9963446a44bd2954be9cdb1e95dc728bf7d10e18536334c693ca38f00b956d51358b3ab105aa2a9a3f68c862ab99914e4e4ee894e4
@@ -21,13 +21,18 @@ class KMeansClusterer
21
21
  data = (data - mean) / std
22
22
  [NMatrix.ref(data), mean, std]
23
23
  end
24
+
25
+ def self.row_norms data
26
+ squared_data = NArray.ref(data)**2
27
+ NMatrix.ref(squared_data).sum(0)
28
+ end
24
29
  end
25
30
 
26
31
  module Distance
27
32
  def self.euclidean x, y, yy = nil
28
33
  if x.is_a?(NMatrix) && y.is_a?(NMatrix)
29
- xx = x.map {|v| v**2}.sum(0)
30
- yy ||= y.map {|v| v**2}.sum(0)
34
+ xx = Scaler.row_norms(x)
35
+ yy ||= Scaler.row_norms(y)
31
36
  xy = x * y.transpose
32
37
  distance = xy * -2
33
38
  distance += xx
@@ -93,7 +98,7 @@ class KMeansClusterer
93
98
  end
94
99
 
95
100
 
96
- DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double }
101
+ DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double, max_iter: 300 }
97
102
 
98
103
  def self.run k, data, opts = {}
99
104
  opts = DEFAULT_OPTS.merge(opts)
@@ -112,7 +117,7 @@ class KMeansClusterer
112
117
  end
113
118
 
114
119
  opts[:points_matrix] = data
115
- opts[:row_norms] = opts[:points_matrix].map {|v| v**2}.sum(0)
120
+ opts[:row_norms] = Scaler.row_norms(data)
116
121
 
117
122
  bestrun = nil
118
123
 
@@ -147,6 +152,7 @@ class KMeansClusterer
147
152
  @std = opts[:std]
148
153
  @scale_data = opts[:scale_data]
149
154
  @typecode = opts[:typecode]
155
+ @max_iter = opts[:max_iter]
150
156
 
151
157
  init_centroids
152
158
  end
@@ -154,55 +160,60 @@ class KMeansClusterer
154
160
  def run
155
161
  start_time = Time.now
156
162
  @iterations, @runtime = 0, 0
157
-
158
- @cluster_point_ids = Array.new(@k) { [] }
163
+ @cluster_assigns = NArray.int(@points_count)
164
+ min_distances = NArray.new(@typecode, @points_count)
159
165
 
160
166
  loop do
161
167
  @iterations +=1
162
168
 
169
+ min_distances.fill! Float::INFINITY
163
170
  distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
164
171
 
165
- # assign point ids to @cluster_point_ids
166
- @points_count.times do |i|
167
- min_distance_index = distances[i, true].sort_index[0]
168
- @cluster_point_ids[min_distance_index] << i
172
+ @k.times do |cluster_id|
173
+ dist = NArray.ref distances[true, cluster_id].flatten
174
+ mask = dist < min_distances
175
+ @cluster_assigns[mask] = cluster_id
176
+ min_distances[mask] = dist[mask]
169
177
  end
170
178
 
171
- moves = []
172
- updated_centroids = []
179
+ max_move = 0
173
180
 
174
- @k.times do |i|
175
- centroid = NArray.ref(@centroids[true, i].flatten)
176
- point_ids = @cluster_point_ids[i]
181
+ @k.times do |cluster_id|
182
+ centroid = NArray.ref(@centroids[true, cluster_id].flatten)
183
+ point_ids = @cluster_assigns.eq(cluster_id).where
177
184
 
178
- if point_ids.empty?
179
- newcenter = centroid
180
- moves << 0
181
- else
185
+ unless point_ids.empty?
182
186
  points = @points_matrix[true, point_ids]
183
187
  newcenter = points.mean(1)
184
- moves << Distance.euclidean(centroid, newcenter)
188
+ move = Distance.euclidean(centroid, newcenter)
189
+ max_move = move if move > max_move
190
+ @centroids[true, cluster_id] = newcenter
185
191
  end
186
-
187
- updated_centroids << newcenter
188
192
  end
189
193
 
190
- @centroids = NMatrix.cast updated_centroids, @typecode
191
-
192
- break if moves.max < 0.001 # i.e., no movement
193
- break if @iterations >= 300
194
-
195
- @cluster_point_ids = Array.new(@k) { [] }
194
+ break if max_move < 0.001 # i.e., no movement
195
+ break if @iterations >= @max_iter
196
196
  end
197
197
 
198
- @error = calculate_error
198
+ @error = (min_distances**2).sum
199
199
  @runtime = Time.now - start_time
200
200
  self
201
201
  end
202
202
 
203
203
  def finish
204
- set_points
205
- set_clusters
204
+ @clusters = @k.times.map do |i|
205
+ centroid = NArray.ref @centroids[true, i].flatten
206
+ Cluster.new i, Point.new(-i, centroid)
207
+ end
208
+
209
+ @points = @points_count.times.map do |i|
210
+ data = NArray.ref @points_matrix[true, i].flatten
211
+ point = Point.new(i, data, @labels[i])
212
+ cluster = @clusters[@cluster_assigns[i]]
213
+ cluster.points << point
214
+ point
215
+ end
216
+
206
217
  self
207
218
  end
208
219
 
@@ -305,40 +316,6 @@ class KMeansClusterer
305
316
  @points_count.times.to_a.sample @k
306
317
  end
307
318
 
308
- def set_points
309
- @points = @points_count.times.map do |i|
310
- data = NArray.ref @points_matrix[true, i].flatten
311
- Point.new(i, data, @labels[i])
312
- end
313
- end
314
-
315
- def set_clusters
316
- @clusters = @k.times.map do |i|
317
- centroid = NArray.ref @centroids[true, i].flatten
318
- c = Cluster.new i, Point.new(-i, centroid)
319
- @cluster_point_ids[i].each do |p|
320
- c << @points[p]
321
- end
322
- c
323
- end
324
- end
325
-
326
- def calculate_error
327
- errors = @k.times.map do |i|
328
- centroid = get_centroid i
329
- points = get_points_for_centroid i
330
-
331
- if points.empty?
332
- 0
333
- else
334
- distances = Distance.euclidean points, centroid
335
- (distances**2).sum
336
- end
337
- end
338
-
339
- errors.reduce(:+)
340
- end
341
-
342
319
  def get_point i
343
320
  NArray.ref @points_matrix[true, i].flatten
344
321
  end
@@ -348,7 +325,7 @@ class KMeansClusterer
348
325
  end
349
326
 
350
327
  def get_points_for_centroid i
351
- point_ids = @cluster_point_ids[i]
328
+ point_ids = @cluster_assigns.eq(i).where
352
329
  points = @points_matrix[true, point_ids]
353
330
  points.empty? ? NArray.sfloat(0) : NArray.ref(points)
354
331
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kmeans-clusterer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Geoff Buesing
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-03 00:00:00.000000000 Z
11
+ date: 2015-03-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: narray