RubyGems - kmeans-clusterer - Versions diffs - 0.10.0 → 0.11.0 - Mend

kmeans-clusterer 0.10.0 → 0.11.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 873f13d79f2d199400d1e359aca6e56c74d68ac0
-  data.tar.gz: a4d1f06a71b2e8289c60badeb93fb007fee06af6
+  metadata.gz: 2c2552e6a8ee7eddd6d03d8fa8cf7038c0458a11
+  data.tar.gz: 62172e9d841aa4df7b332a4e2cda55bf426bc9b1
 SHA512:
-  metadata.gz: 1e160f4bbe512e7aac37c2edcbcc63c1f103e51ae4c441a0c2f2e6cc57783344eddbcd31f30ea1a9a32dcc979250fb5bab6e1faba7a1d686612362d4aa798129
-  data.tar.gz: 405a6ab16edcb0fcdd958c9963446a44bd2954be9cdb1e95dc728bf7d10e18536334c693ca38f00b956d51358b3ab105aa2a9a3f68c862ab99914e4e4ee894e4
+  metadata.gz: a33d17f4749b00e7ee8a0829e6a402db6eb768bd62f5dcb1e3ed9dcf3020b5ec074b3e5e38a817fb6d4a51ec13b8cf75f189a55b6a3e488738b4dc2828cbb878
+  data.tar.gz: c4a2e27da37141435583d1b09cbf4001f4b2c44a5b2900155e2444f45d9663edf0ccab6d4f80c50d05ad85ce31710a7d4558356bded561bf17199e7333c3e14d

data/lib/kmeans-clusterer.rb CHANGED

@@ -46,12 +46,13 @@ class KMeansClusterer
   class Point
-    attr_reader :id, :data
+    attr_reader :id, :data, :centroid_distances
     attr_accessor :cluster, :label
-    def initialize id, data, label = nil
+    def initialize id, data, centroid_distances, label = nil
       @id = id
       @data = data
+      @centroid_distances = centroid_distances
       @label = label
     end
@@ -70,6 +71,10 @@ class KMeansClusterer
     def dimension
       @data.length
     end
+    def centroid_distance
+      @centroid_distances[@cluster.id]
+    end
   end
@@ -116,7 +121,7 @@ class KMeansClusterer
       opts[:std] = std
     end
-    opts[:points_matrix] = data
+    opts[:data] = data
     opts[:row_norms] = Scaler.row_norms(data)
     bestrun = nil
@@ -137,7 +142,7 @@ class KMeansClusterer
   end
-  attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime
+  attr_reader :k, :points, :clusters, :centroids, :error, :mean, :std, :iterations, :runtime, :distances, :data
   def initialize opts = {}
@@ -146,8 +151,8 @@ class KMeansClusterer
     @labels = opts[:labels] || []
     @row_norms = opts[:row_norms]
-    @points_matrix = opts[:points_matrix]
-    @points_count = @points_matrix.shape[1] if @points_matrix
+    @data = opts[:data]
+    @points_count = @data.shape[1] if @data
     @mean = opts[:mean]
     @std = opts[:std]
     @scale_data = opts[:scale_data]
@@ -167,10 +172,10 @@ class KMeansClusterer
       @iterations +=1
       min_distances.fill! Float::INFINITY
-      distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
+      @distances = Distance.euclidean(@centroids, @data, @row_norms)
       @k.times do |cluster_id|
-        dist = NArray.ref distances[true, cluster_id].flatten
+        dist = NArray.ref @distances[true, cluster_id].flatten
         mask = dist < min_distances
         @cluster_assigns[mask] = cluster_id
         min_distances[mask] = dist[mask]
@@ -183,7 +188,7 @@ class KMeansClusterer
         point_ids = @cluster_assigns.eq(cluster_id).where
         unless point_ids.empty?
-          points = @points_matrix[true, point_ids]
+          points = @data[true, point_ids]
           newcenter = points.mean(1)
           move = Distance.euclidean(centroid, newcenter)
           max_move = move if move > max_move
@@ -203,17 +208,21 @@ class KMeansClusterer
   def finish
     @clusters = @k.times.map do |i|
       centroid = NArray.ref @centroids[true, i].flatten
-      Cluster.new i, Point.new(-i, centroid)
+      Cluster.new i, Point.new(-1, centroid, nil, nil)
     end
     @points = @points_count.times.map do |i|
-      data = NArray.ref @points_matrix[true, i].flatten
-      point = Point.new(i, data, @labels[i])
+      data = NArray.ref @data[true, i].flatten
+      point = Point.new(i, data, @distances[i, true], @labels[i])
       cluster = @clusters[@cluster_assigns[i]]
-      cluster.points << point
+      cluster << point
       point
     end
+    @clusters.each do |c|
+      c.points.sort_by! &:centroid_distance
+    end
     self
   end
@@ -236,17 +245,13 @@ class KMeansClusterer
   def silhouette
     return 1.0 if @k < 2
-    distances = Distance.euclidean(@centroids, @points_matrix, @row_norms)
+    scores = @points.map do |point|
+      sort_index = point.centroid_distances.sort_index
+      c1_points = get_points_for_cluster sort_index[0]
+      c2_points = get_points_for_cluster sort_index[1]
-    scores = @points_count.times.map do |i|
-      point = get_point i
-      cluster_indexes = distances[i, true].sort_index
-      c1_points = get_points_for_centroid cluster_indexes[0]
-      c2_points = get_points_for_centroid cluster_indexes[1]
-      a = dissimilarity(c1_points, point)
-      b = dissimilarity(c2_points, point)
+      a = dissimilarity(c1_points, point.data)
+      b = dissimilarity(c2_points, point.data)
       (b - a) / [a,b].max
     end
@@ -282,9 +287,9 @@ class KMeansClusterer
       centroid_ids << pick
       while centroid_ids.length < @k
-        centroids = @points_matrix[true, centroid_ids]
+        centroids = @data[true, centroid_ids]
-        distances = Distance.euclidean(centroids, @points_matrix, @row_norms)
+        distances = Distance.euclidean(centroids, @data, @row_norms)
         d2 = []
         @points_count.times do |i|
@@ -300,7 +305,7 @@ class KMeansClusterer
         centroid_ids << pick
       end
-      @centroids = @points_matrix[true, centroid_ids]
+      @centroids = @data[true, centroid_ids]
     end
     def custom_centroid_init
@@ -309,24 +314,20 @@ class KMeansClusterer
     end
     def random_centroid_init
-      @centroids = @points_matrix[true, pick_k_random_indexes]
+      @centroids = @data[true, pick_k_random_indexes]
     end
     def pick_k_random_indexes
       @points_count.times.to_a.sample @k
     end
-    def get_point i
-      NArray.ref @points_matrix[true, i].flatten
-    end
     def get_centroid i
       NArray.ref(@centroids[true, i].flatten)
     end
-    def get_points_for_centroid i
+    def get_points_for_cluster i
       point_ids = @cluster_assigns.eq(i).where
-      points = @points_matrix[true, point_ids]
+      points = @data[true, point_ids]
       points.empty? ? NArray.sfloat(0) : NArray.ref(points)
     end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kmeans-clusterer
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.11.0
 platform: ruby
 authors:
 - Geoff Buesing
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-03-10 00:00:00.000000000 Z
+date: 2015-03-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: narray