kmeans-clusterer 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/kmeans-clusterer.rb +38 -38
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 280cf2639965cde59fcff41909375d284637825d
4
- data.tar.gz: e9d9fc0db3828b9bfc37def487200cad77a4d5a7
3
+ metadata.gz: 38086107da4ad5ec21f6daa33166b4d8542f3a59
4
+ data.tar.gz: f7fed19b6df6b6c9dbfb21548bc07bd1d997b56e
5
5
  SHA512:
6
- metadata.gz: 7cd63b7d8f844ea17972ce535d4f0b1ea29d61524351431ada328fdbf30736006739ced8f9c1b344f834c0c993f6491ef203f22761f30ef224795a6d38ee4996
7
- data.tar.gz: 63ed5fa36fe27785c124877ac9a56d30190f55aec1a08c6ab161f79a24980f91f553c552f7956fed7d3f9bc25b4d1cbcb98df97d06e51a56c17eb5d290381019
6
+ metadata.gz: 2e4603c448b30875ba49eb694974ca5e43c0f4270d9d7005608f9bad086eff1bab769f23bb92a8756a024f037a1cc03dd568cb02122d7841c3a28b2556571893
7
+ data.tar.gz: 40ed7fcf8c7db56d7abf50c072cb7b928c501951e3dbf8a3d2276508c4a5ce2bae2b2bb12159c54ce999cd322b721f12f3b1cf3b46f7903aabc79dbf03f8ada4
data/lib/kmeans-clusterer.rb CHANGED
@@ -23,6 +23,22 @@ class KMeansClusterer
23
23
  end
24
24
  end
25
25
 
26
+ module Distance
27
+ def self.euclidean x, y, yy = nil
28
+ if x.is_a?(NMatrix) && y.is_a?(NMatrix)
29
+ xx = x.map {|v| v**2}.sum(0)
30
+ yy ||= y.map {|v| v**2}.sum(0)
31
+ xy = x * y.transpose
32
+ distance = xy * -2
33
+ distance += xx
34
+ distance += yy.transpose
35
+ NMath.sqrt distance
36
+ else
37
+ NMath.sqrt ((x - y)**2).sum(0)
38
+ end
39
+ end
40
+ end
41
+
26
42
 
27
43
  class Point
28
44
  attr_reader :id, :data
@@ -66,6 +82,14 @@ class KMeansClusterer
66
82
  point.cluster = self
67
83
  @points << point
68
84
  end
85
+
86
+ def sorted_points point = @centroid
87
+ point = point.data if point.is_a?(Point)
88
+ point = NArray.cast(point, @centroid.typecode) unless point.is_a?(NArray)
89
+ points_data = NArray.cast(@points.map(&:data))
90
+ distances = Distance.euclidean(points_data, point)
91
+ @points.sort_by.with_index {|p, i| distances[i] }
92
+ end
69
93
  end
70
94
 
71
95
 
@@ -108,7 +132,7 @@ class KMeansClusterer
108
132
  end
109
133
 
110
134
 
111
- attr_reader :k, :points, :clusters, :error, :mean, :std, :iterations, :runtime
135
+ attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime
112
136
 
113
137
 
114
138
  def initialize opts = {}
@@ -136,7 +160,7 @@ class KMeansClusterer
136
160
  loop do
137
161
  @iterations +=1
138
162
 
139
- distances = distance(@centroids, @points_matrix)
163
+ distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
140
164
 
141
165
  # assign point ids to @cluster_point_ids
142
166
  @points_count.times do |i|
@@ -157,7 +181,7 @@ class KMeansClusterer
157
181
  else
158
182
  points = @points_matrix[true, point_ids]
159
183
  newcenter = points.mean(1)
160
- moves << distance(centroid, newcenter)
184
+ moves << Distance.euclidean(centroid, newcenter)
161
185
  end
162
186
 
163
187
  updated_centroids << newcenter
@@ -185,27 +209,23 @@ class KMeansClusterer
185
209
  def predict data
186
210
  data = NMatrix.cast(data, @typecode)
187
211
  data, _m, _s = Scaler.scale(data, @mean, @std, @typecode) if @scale_data
188
- distances = distance(@centroids, data, nil)
212
+ distances = Distance.euclidean(@centroids, data)
189
213
  data.shape[1].times.map do |i|
190
214
  distances[i, true].sort_index[0] # index of closest cluster
191
215
  end
192
216
  end
193
217
 
194
218
  def sorted_clusters point = origin
195
- point = wrap_point point
196
- centroids = get_cluster_centroids
197
- distances = distance(centroids, point.data)
219
+ point = point.data if point.is_a?(Point)
220
+ point = NArray.cast(point, @typecode) unless point.is_a?(NArray)
221
+ distances = Distance.euclidean(NArray.ref(@centroids), point)
198
222
  @clusters.sort_by.with_index {|c, i| distances[i] }
199
223
  end
200
224
 
201
- def origin
202
- wrap_point Array.new(@points[0].dimension, 0)
203
- end
204
-
205
225
  def silhouette
206
226
  return 1.0 if @k < 2
207
227
 
208
- distances = distance(@centroids, @points_matrix)
228
+ distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
209
229
 
210
230
  scores = @points_count.times.map do |i|
211
231
  point = get_point i
@@ -222,20 +242,14 @@ class KMeansClusterer
222
242
  scores.reduce(:+) / scores.length # mean score for all points
223
243
  end
224
244
 
225
- alias_method :silhouette_score, :silhouette
226
-
227
245
  def inspect
228
246
  %{#<#{self.class.name} k:#{@k} iterations:#{@iterations} error:#{@error} runtime:#{@runtime}>}
229
247
  end
230
248
 
231
249
  private
232
- def wrap_point point
233
- return point if point.is_a?(Point)
234
- Point.new(0, NArray.cast(point, @typecode))
235
- end
236
250
 
237
251
  def dissimilarity points, point
238
- distances = distance points, point
252
+ distances = Distance.euclidean points, point
239
253
  distances.sum / distances.length.to_f
240
254
  end
241
255
 
@@ -259,7 +273,7 @@ class KMeansClusterer
259
273
  while centroid_ids.length < @k
260
274
  centroids = @points_matrix[true, centroid_ids]
261
275
 
262
- distances = distance(centroids, @points_matrix)
276
+ distances = Distance.euclidean(centroids, @points_matrix, @row_norms)
263
277
 
264
278
  d2 = []
265
279
  @points_count.times do |i|
@@ -288,11 +302,7 @@ class KMeansClusterer
288
302
  end
289
303
 
290
304
  def pick_k_random_indexes
291
- @points_count.times.to_a.shuffle.slice(0, @k)
292
- end
293
-
294
- def get_cluster_centroids
295
- NArray.to_na @clusters.map {|c| c.centroid.data }
305
+ @points_count.times.to_a.sample @k
296
306
  end
297
307
 
298
308
  def set_points
@@ -321,7 +331,7 @@ class KMeansClusterer
321
331
  if points.empty?
322
332
  0
323
333
  else
324
- distances = distance points, centroid
334
+ distances = Distance.euclidean points, centroid
325
335
  (distances**2).sum
326
336
  end
327
337
  end
@@ -343,17 +353,7 @@ class KMeansClusterer
343
353
  points.empty? ? NArray.sfloat(0) : NArray.ref(points)
344
354
  end
345
355
 
346
- def distance x, y, yy = @row_norms
347
- if x.is_a?(NMatrix) && y.is_a?(NMatrix)
348
- xx = x.map {|v| v**2}.sum(0)
349
- yy ||= y.map {|v| v**2}.sum(0)
350
- xy = x * y.transpose
351
- distance = xy * -2
352
- distance += xx
353
- distance += yy.transpose
354
- NMath.sqrt distance
355
- else
356
- NMath.sqrt ((x - y)**2).sum(0)
357
- end
356
+ def origin
357
+ Array.new(@points[0].dimension, 0)
358
358
  end
359
359
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kmeans-clusterer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Geoff Buesing
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-13 00:00:00.000000000 Z
11
+ date: 2015-02-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: narray