kmeans-clusterer 0.7.0 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/kmeans-clusterer.rb +38 -38
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 280cf2639965cde59fcff41909375d284637825d
4
- data.tar.gz: e9d9fc0db3828b9bfc37def487200cad77a4d5a7
3
+ metadata.gz: 38086107da4ad5ec21f6daa33166b4d8542f3a59
4
+ data.tar.gz: f7fed19b6df6b6c9dbfb21548bc07bd1d997b56e
5
5
  SHA512:
6
- metadata.gz: 7cd63b7d8f844ea17972ce535d4f0b1ea29d61524351431ada328fdbf30736006739ced8f9c1b344f834c0c993f6491ef203f22761f30ef224795a6d38ee4996
7
- data.tar.gz: 63ed5fa36fe27785c124877ac9a56d30190f55aec1a08c6ab161f79a24980f91f553c552f7956fed7d3f9bc25b4d1cbcb98df97d06e51a56c17eb5d290381019
6
+ metadata.gz: 2e4603c448b30875ba49eb694974ca5e43c0f4270d9d7005608f9bad086eff1bab769f23bb92a8756a024f037a1cc03dd568cb02122d7841c3a28b2556571893
7
+ data.tar.gz: 40ed7fcf8c7db56d7abf50c072cb7b928c501951e3dbf8a3d2276508c4a5ce2bae2b2bb12159c54ce999cd322b721f12f3b1cf3b46f7903aabc79dbf03f8ada4
@@ -23,6 +23,22 @@ class KMeansClusterer
23
23
  end
24
24
  end
25
25
 
26
+ module Distance
27
+ def self.euclidean x, y, yy = nil
28
+ if x.is_a?(NMatrix) && y.is_a?(NMatrix)
29
+ xx = x.map {|v| v**2}.sum(0)
30
+ yy ||= y.map {|v| v**2}.sum(0)
31
+ xy = x * y.transpose
32
+ distance = xy * -2
33
+ distance += xx
34
+ distance += yy.transpose
35
+ NMath.sqrt distance
36
+ else
37
+ NMath.sqrt ((x - y)**2).sum(0)
38
+ end
39
+ end
40
+ end
41
+
26
42
 
27
43
  class Point
28
44
  attr_reader :id, :data
@@ -66,6 +82,14 @@ class KMeansClusterer
66
82
  point.cluster = self
67
83
  @points << point
68
84
  end
85
+
86
+ def sorted_points point = @centroid
87
+ point = point.data if point.is_a?(Point)
88
+ point = NArray.cast(point, @centroid.typecode) unless point.is_a?(NArray)
89
+ points_data = NArray.cast(@points.map(&:data))
90
+ distances = Distance.euclidean(points_data, point)
91
+ @points.sort_by.with_index {|p, i| distances[i] }
92
+ end
69
93
  end
70
94
 
71
95
 
@@ -108,7 +132,7 @@ class KMeansClusterer
108
132
  end
109
133
 
110
134
 
111
- attr_reader :k, :points, :clusters, :error, :mean, :std, :iterations, :runtime
135
+ attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime
112
136
 
113
137
 
114
138
  def initialize opts = {}
@@ -136,7 +160,7 @@ class KMeansClusterer
136
160
  loop do
137
161
  @iterations +=1
138
162
 
139
- distances = distance(@centroids, @points_matrix)
163
+ distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
140
164
 
141
165
  # assign point ids to @cluster_point_ids
142
166
  @points_count.times do |i|
@@ -157,7 +181,7 @@ class KMeansClusterer
157
181
  else
158
182
  points = @points_matrix[true, point_ids]
159
183
  newcenter = points.mean(1)
160
- moves << distance(centroid, newcenter)
184
+ moves << Distance.euclidean(centroid, newcenter)
161
185
  end
162
186
 
163
187
  updated_centroids << newcenter
@@ -185,27 +209,23 @@ class KMeansClusterer
185
209
  def predict data
186
210
  data = NMatrix.cast(data, @typecode)
187
211
  data, _m, _s = Scaler.scale(data, @mean, @std, @typecode) if @scale_data
188
- distances = distance(@centroids, data, nil)
212
+ distances = Distance.euclidean(@centroids, data)
189
213
  data.shape[1].times.map do |i|
190
214
  distances[i, true].sort_index[0] # index of closest cluster
191
215
  end
192
216
  end
193
217
 
194
218
  def sorted_clusters point = origin
195
- point = wrap_point point
196
- centroids = get_cluster_centroids
197
- distances = distance(centroids, point.data)
219
+ point = point.data if point.is_a?(Point)
220
+ point = NArray.cast(point, @typecode) unless point.is_a?(NArray)
221
+ distances = Distance.euclidean(NArray.ref(@centroids), point)
198
222
  @clusters.sort_by.with_index {|c, i| distances[i] }
199
223
  end
200
224
 
201
- def origin
202
- wrap_point Array.new(@points[0].dimension, 0)
203
- end
204
-
205
225
  def silhouette
206
226
  return 1.0 if @k < 2
207
227
 
208
- distances = distance(@centroids, @points_matrix)
228
+ distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
209
229
 
210
230
  scores = @points_count.times.map do |i|
211
231
  point = get_point i
@@ -222,20 +242,14 @@ class KMeansClusterer
222
242
  scores.reduce(:+) / scores.length # mean score for all points
223
243
  end
224
244
 
225
- alias_method :silhouette_score, :silhouette
226
-
227
245
  def inspect
228
246
  %{#<#{self.class.name} k:#{@k} iterations:#{@iterations} error:#{@error} runtime:#{@runtime}>}
229
247
  end
230
248
 
231
249
  private
232
- def wrap_point point
233
- return point if point.is_a?(Point)
234
- Point.new(0, NArray.cast(point, @typecode))
235
- end
236
250
 
237
251
  def dissimilarity points, point
238
- distances = distance points, point
252
+ distances = Distance.euclidean points, point
239
253
  distances.sum / distances.length.to_f
240
254
  end
241
255
 
@@ -259,7 +273,7 @@ class KMeansClusterer
259
273
  while centroid_ids.length < @k
260
274
  centroids = @points_matrix[true, centroid_ids]
261
275
 
262
- distances = distance(centroids, @points_matrix)
276
+ distances = Distance.euclidean(centroids, @points_matrix, @row_norms)
263
277
 
264
278
  d2 = []
265
279
  @points_count.times do |i|
@@ -288,11 +302,7 @@ class KMeansClusterer
288
302
  end
289
303
 
290
304
  def pick_k_random_indexes
291
- @points_count.times.to_a.shuffle.slice(0, @k)
292
- end
293
-
294
- def get_cluster_centroids
295
- NArray.to_na @clusters.map {|c| c.centroid.data }
305
+ @points_count.times.to_a.sample @k
296
306
  end
297
307
 
298
308
  def set_points
@@ -321,7 +331,7 @@ class KMeansClusterer
321
331
  if points.empty?
322
332
  0
323
333
  else
324
- distances = distance points, centroid
334
+ distances = Distance.euclidean points, centroid
325
335
  (distances**2).sum
326
336
  end
327
337
  end
@@ -343,17 +353,7 @@ class KMeansClusterer
343
353
  points.empty? ? NArray.sfloat(0) : NArray.ref(points)
344
354
  end
345
355
 
346
- def distance x, y, yy = @row_norms
347
- if x.is_a?(NMatrix) && y.is_a?(NMatrix)
348
- xx = x.map {|v| v**2}.sum(0)
349
- yy ||= y.map {|v| v**2}.sum(0)
350
- xy = x * y.transpose
351
- distance = xy * -2
352
- distance += xx
353
- distance += yy.transpose
354
- NMath.sqrt distance
355
- else
356
- NMath.sqrt ((x - y)**2).sum(0)
357
- end
356
+ def origin
357
+ Array.new(@points[0].dimension, 0)
358
358
  end
359
359
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kmeans-clusterer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Geoff Buesing
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-13 00:00:00.000000000 Z
11
+ date: 2015-02-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: narray