kmeans-clusterer 0.4.0 → 0.5.0

Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/kmeans-clusterer.rb +206 -146
  3. metadata +4 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: daff89d4293b6131cf1c684c9a5ef3a3d9dfa878
-  data.tar.gz: 7917de7e84065b9c55c4c501bb2587f6e86aff7b
+  metadata.gz: 83e84b661282a532269410c4bf91d98f7831cc45
+  data.tar.gz: 2ec3bf6b111d67e4f5beced25b088b59592e3cdf
 SHA512:
-  metadata.gz: 795ac3f9fb65b6a40adbb67264413fc3819bf828de13edef857ef340d5e2467de4807e35fb87277bd6aeec687ab93f11b35c0b34a92922d0b5c609a0e731be74
-  data.tar.gz: ce6dc360936e0a3fcdc4be3b4c3fd207d0b0df67ed45c36ea2b476e473eedd11b7fea35c060eabca38f733325d1aba6fa30defa56601bdcac6c65d68c27d8145
+  metadata.gz: 7735bf8e71c4fa4300793e40a299b7a4af267821860092b2dd21db16cfc407bc0b2e6a9b99c37846ffbd02b76976242d9719bfa92e244455be14208b1934d21f
+  data.tar.gz: 54f7bf8afff09caf185dbf95c8baeb4ea471e4496b29967ad88b3fc377e0063803c7810295825fe581fd744048964b592197bcfea6d2aa1178133d39e2937b0b
data/lib/kmeans-clusterer.rb CHANGED
@@ -1,17 +1,34 @@
 require 'narray'
 
 class KMeansClusterer
+  module Scaler
+    def self.mean data
+      data.mean(1)
+    end
+
+    def self.std data
+      std = data.rmsdev(1)
+      std[std.eq(0)] = 1.0 # so we don't divide by 0
+      std
+    end
+
+    def self.scale data, mean = nil, std = nil
+      data = NArray.cast(data, NArray::DFLOAT)
+      mean ||= self.mean(data)
+      std ||= self.std(data)
+      data = (data - mean) / std
+      [data, mean, std]
+    end
+  end
 
-  # Euclidean distance function. Requires instances of NArray as args
-  Distance = -> (a, b) { NMath.sqrt ((a - b)**2).sum(0) }
-  CalculateCentroid = -> (a) { a.mean(1) }
 
   class Point
-    attr_reader :data
+    attr_reader :id, :data
     attr_accessor :cluster, :label
 
-    def initialize data, label = nil
-      @data = NArray.to_na data
+    def initialize id, data, label = nil
+      @id = id
+      @data = data
       @label = label
     end
 
@@ -34,175 +51,161 @@ class KMeansClusterer
 
 
   class Cluster
-    attr_reader :centroid, :points
+    attr_reader :id, :centroid, :points
     attr_accessor :label
 
-    def initialize centroid, label = nil
+    def initialize id, centroid
+      @id = id
      @centroid = centroid
-      @label = label
      @points = []
    end
 
-    def recenter
-      if @points.empty?
-        0
-      else
-        old_centroid = @centroid
-        @centroid = calculate_centroid_from_points
-        Distance.call @centroid.data, old_centroid.data
-      end
-    end
-
    def << point
      point.cluster = self
      @points << point
    end
 
-    def reset_points
-      @points = []
-    end
-
-    def sorted_points
-      distances = Distance.call points_narray, centroid.data
-      @points.sort_by.with_index {|c, i| distances[i] }
-    end
-
-    def sum_of_squares_error
-      if @points.empty?
-        0
-      else
-        distances = Distance.call points_narray, centroid.data
-        (distances**2).sum
-      end
-    end
-
-    def sum_of_distances
-      return 0 if @points.empty?
-      Distance.call(points_narray, centroid.data).sum
+    def points_narray
+      NArray.cast @points.map(&:data)
    end
-
-    def dissimilarity point
-      distances = Distance.call points_narray, point.data
-      distances.sum / distances.length.to_f
-    end
-
-    private
-    def calculate_centroid_from_points
-      data = CalculateCentroid.call points_narray
-      Point.new data
-    end
-
-    def points_narray
-      NArray.to_na @points.map(&:data)
-    end
  end
 
 
+  DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp}
+
  def self.run k, data, opts = {}
-    raise(ArgumentError, "k cannot be greater than the number of points") if k > data.length
+    opts = DEFAULT_OPTS.merge(opts)
 
-    data = if opts[:scale_data]
-      scale_data data
-    else
-      data.map {|row| NArray.to_na(row).to_f}
+    opts[:k] = k
+
+    if opts[:scale_data]
+      data, mean, std = Scaler.scale(data)
+      opts[:mean] = mean
+      opts[:std] = std
    end
 
-    runcount = opts[:runs] || 10
-    errors = []
+    opts[:points_matrix] = NMatrix.cast(data, NArray::DFLOAT)
+    opts[:row_norms] = opts[:points_matrix].map {|v| v**2}.sum(0)
 
-    runs = runcount.times.map do |i|
-      km = new(k, data, opts).run
-      error = km.error
+    runs = opts[:runs].times.map do |i|
+      km = new(opts).run
      if opts[:log]
-        puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{error.round(2)} err"
+        puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{km.error.round(2)} err"
      end
-      errors << error
      km
    end
 
-    runs.sort_by.with_index {|run, i| errors[i] }.first
+    runs.sort_by {|run| run.error }.first.finish
  end
 
-  # see scikit-learn scale and _mean_and_std methods
-  def self.scale_data data
-    nadata = NArray.to_na(data).to_f
-    mean = nadata.mean(1)
-    std = nadata.rmsdev(1)
-    std[std.eq(0)] = 1.0 # so we don't divide by 0
-    nadata = (nadata - mean) / std
-    # convert back to an array, containing NArrays for each row
-    data.length.times.map {|i| nadata[true, i] }
-  end
 
+  attr_reader :k, :points, :clusters, :error, :iterations, :runtime
 
-  attr_reader :k, :points, :clusters, :iterations, :runtime
 
+  def initialize opts = {}
+    @k = opts[:k]
+    @init = opts[:init]
+    @labels = opts[:labels] || []
+    @row_norms = opts[:row_norms]
 
-  def initialize k, data, opts = {}
-    @k = k
-    @init = opts[:init] || :kmpp
-    labels = opts[:labels] || []
+    @points_matrix = opts[:points_matrix]
+    @points_count = @points_matrix.shape[1] if @points_matrix
+    @mean = opts[:mean]
+    @std = opts[:std]
+    @scale_data = opts[:scale_data]
 
-    @points = data.map.with_index do |instance, i|
-      Point.new instance, labels[i]
-    end
-
-    init_clusters
+    init_centroids
  end
 
  def run
    start_time = Time.now
    @iterations, @runtime = 0, 0
 
+    @cluster_point_ids = Array.new(@k) { [] }
+
    loop do
      @iterations +=1
 
-      centroids = get_cluster_centroids
+      distances = distance(@centroids, @points_matrix)
+
+      # assign point ids to @cluster_point_ids
+      @points_count.times do |i|
+        min_distance_index = distances[i, true].sort_index[0]
+        @cluster_point_ids[min_distance_index] << i
+      end
+
+      moves = []
+      updated_centroids = []
+
+      @k.times do |i|
+        centroid = NArray.cast(@centroids[true, i].flatten)
+        point_ids = @cluster_point_ids[i]
+
+        if point_ids.empty?
+          newcenter = centroid
+          moves << 0
+        else
+          points = @points_matrix[true, point_ids]
+          newcenter = points.mean(1)
+          moves << distance(centroid, newcenter)
+        end
 
-      @points.each do |point|
-        distances = Distance.call(centroids, point.data)
-        cluster = @clusters.sort_by.with_index {|c, i| distances[i] }.first
-        cluster << point
+        updated_centroids << newcenter
      end
 
-      moves = clusters.map(&:recenter)
+      @centroids = NMatrix.cast updated_centroids
 
      break if moves.max < 0.001 # i.e., no movement
      break if @iterations >= 300
 
-      clusters.each(&:reset_points)
+      @cluster_point_ids = Array.new(@k) { [] }
    end
 
+    @error = calculate_error
    @runtime = Time.now - start_time
    self
  end
 
-  def error
-    @clusters.map(&:sum_of_squares_error).reduce(:+)
+  def finish
+    set_points
+    set_clusters
+    self
  end
 
-  def closest_cluster point = origin
-    sorted_clusters(point).first
+  def predict data
+    data, _m, _s = Scaler.scale(data, @mean, @std) if @scale_data
+    data = NMatrix.cast(data, NArray::DFLOAT)
+    distances = distance(@centroids, data, nil)
+    data.shape[1].times.map do |i|
+      distances[i, true].sort_index[0] # index of closest cluster
+    end
  end
 
  def sorted_clusters point = origin
-    point = Point.new(point) unless point.is_a?(Point)
+    point = wrap_point point
    centroids = get_cluster_centroids
-    distances = Distance.call(centroids, point.data)
+    distances = distance(centroids, point.data)
    @clusters.sort_by.with_index {|c, i| distances[i] }
  end
 
  def origin
-    Point.new Array.new(@points[0].dimension, 0)
+    wrap_point Array.new(@points[0].dimension, 0)
  end
 
  def silhouette_score
-    return 1.0 if @clusters.length < 2
-
-    scores = @points.map do |point|
-      acluster, bcluster = sorted_clusters(point).slice(0,2)
-      a = acluster.dissimilarity(point)
-      b = bcluster.dissimilarity(point)
+    return 1.0 if @k < 2
+
+    distances = distance(@centroids, @points_matrix)
+
+    scores = @points_count.times.map do |i|
+      point = get_point i
+      cluster_indexes = distances[i, true].sort_index
+
+      c1_points = get_points_for_centroid cluster_indexes[0]
+      c2_points = get_points_for_centroid cluster_indexes[1]
+
+      a = dissimilarity(c1_points, point)
+      b = dissimilarity(c2_points, point)
      (b - a) / [a,b].max
    end
 
@@ -210,73 +213,130 @@ class KMeansClusterer
  end
 
  private
-  def init_clusters
+  def wrap_point point
+    return point if point.is_a?(Point)
+    Point.new(0, NArray.to_na(point).to_f)
+  end
+
+  def dissimilarity points, point
+    distances = distance points, point
+    distances.sum / distances.length.to_f
+  end
+
+  def init_centroids
    case @init
    when :random
-      random_cluster_init
+      random_centroid_init
    when Array
-      custom_cluster_init
+      custom_centroid_init
    else
-      kmpp_cluster_init
+      kmpp_centroid_init
    end
  end
 
  # k-means++
-  def kmpp_cluster_init
-    @clusters = []
-    pick = rand(@points.length)
-    centroid = Point.new @points[pick].data.to_a
-    @clusters << Cluster.new(centroid, 1)
-
-    while @clusters.length < @k
-      centroids = get_cluster_centroids
-
-      d2 = @points.map do |point|
-        dists = Distance.call centroids, point.data
-        dists.min**2 # closest cluster distance, squared
+  def kmpp_centroid_init
+    centroid_ids = []
+    pick = rand(@points_count)
+    centroid_ids << pick
+
+    while centroid_ids.length < @k
+      centroids = @points_matrix[true, centroid_ids]
+
+      distances = distance(centroids, @points_matrix)
+
+      d2 = []
+      @points_count.times do |i|
+        min_distance = distances[i, true].min
+        d2 << min_distance**2
      end
 
      d2 = NArray.to_na d2
      probs = d2 / d2.sum
      cumprobs = probs.cumsum
      r = rand
-      # pick = cumprobs.to_a.index {|prob| r < prob }
      pick = (cumprobs >= r).where[0]
-      centroid = Point.new @points[pick].data.to_a
-      cluster = Cluster.new(centroid, @clusters.length + 1)
-      @clusters << cluster
+      centroid_ids << pick
    end
-  end
 
-  def custom_cluster_init
-    @clusters = @init.map.with_index do |instance, i|
-      point = Point.new NArray.to_na(instance).to_f
-      Cluster.new point, i+1
-    end
+    @centroids = @points_matrix[true, centroid_ids]
  end
 
-  def random_cluster_init
-    @clusters = pick_k_random_points.map.with_index {|centroid, i| Cluster.new centroid, i+1 }
+  def custom_centroid_init
+    @centroids = NMatrix.cast @init
+    @k = @init.length
  end
 
-  def pick_k_random_points
-    pick_k_random_indexes.map {|i| Point.new @points[i].data.to_a }
+  def random_centroid_init
+    @centroids = @points_matrix[true, pick_k_random_indexes]
  end
 
  def pick_k_random_indexes
-    @points.length.times.to_a.shuffle.slice(0, @k)
+    @points_count.times.to_a.shuffle.slice(0, @k)
  end
 
  def get_cluster_centroids
    NArray.to_na @clusters.map {|c| c.centroid.data }
  end
-end
 
-class KMediansClusterer < KMeansClusterer
-  Distance = -> (a, b) { (a - b).abs.sum(0) }
-  CalculateCentroid = -> (a) { a.rot90.median(0) }
+  def set_points
+    @points = @points_count.times.map do |i|
+      data = NArray.cast @points_matrix[true, i].flatten
+      Point.new(i, data, @labels[i])
+    end
+  end
 
-  def error
-    @clusters.map(&:sum_of_distances).reduce(:+)
-  end
+  def set_clusters
+    @clusters = @k.times.map do |i|
+      centroid = NArray.cast @centroids[true, i].flatten
+      c = Cluster.new i, Point.new(-i, centroid)
+      @cluster_point_ids[i].each do |p|
+        c << @points[p]
+      end
+      c
+    end
+  end
+
+  def calculate_error
+    errors = @k.times.map do |i|
+      centroid = get_centroid i
+      points = get_points_for_centroid i
+
+      if points.empty?
+        0
+      else
+        distances = distance points, centroid
+        (distances**2).sum
+      end
+    end
+
+    errors.reduce(:+)
+  end
+
+  def get_point i
+    NArray.cast @points_matrix[true, i].flatten
+  end
+
+  def get_centroid i
+    NArray.cast(@centroids[true, i].flatten)
+  end
+
+  def get_points_for_centroid i
+    point_ids = @cluster_point_ids[i]
+    NArray.cast @points_matrix[true, point_ids]
+  end
+
+  def distance x, y, yy = @row_norms
+    if x.is_a?(NMatrix) && y.is_a?(NMatrix)
+      xx = x.map {|v| v**2}.sum(0)
+      yy ||= y.map {|v| v**2}.sum(0)
+      xy = x * y.transpose
+      distance = xy * -2
+      distance += xx
+      distance += yy.transpose
+      NMath.sqrt distance
+    else
+      NMath.sqrt ((x - y)**2).sum(0)
+    end
+  end
 end
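
For orientation, here is a minimal usage sketch of the 0.5.0 API shown in the diff above; the sample coordinates, labels, and choice of k are illustrative and not taken from the gem itself.

require 'kmeans-clusterer'

# Toy 2-D data; labels are optional and are only attached to points for reporting.
data = [[40.71, -74.01], [34.05, -118.24], [39.29, -76.61],
        [45.52, -122.68], [38.9, -77.04], [36.17, -115.14]]
labels = ['New York', 'Los Angeles', 'Baltimore',
          'Portland', 'Washington DC', 'Las Vegas']

# run performs opts[:runs] restarts, keeps the run with the lowest
# sum-of-squares error, and calls #finish on it, so points and clusters
# are materialized on the returned instance.
kmeans = KMeansClusterer.run 2, data, labels: labels, runs: 5

kmeans.clusters.each do |cluster|
  puts "Cluster #{cluster.id}: #{cluster.points.map(&:label).join(', ')}"
end

puts "SSE: #{kmeans.error.round(2)}"
puts "Silhouette score: #{kmeans.silhouette_score.round(2)}"

# predict maps new rows to the index of the nearest centroid.
p kmeans.predict([[41.85, -87.65]])
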
metadata CHANGED
@@ -1,16 +1,16 @@
 --- !ruby/object:Gem::Specification
 name: kmeans-clusterer
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Geoff Buesing
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-29 00:00:00.000000000 Z
+date: 2015-02-05 00:00:00.000000000 Z
 dependencies: []
-description: k-means/k-medians clustering. Uses NArray for fast calculations.
+description: k-means clustering. Uses NArray for fast calculations.
 email: gbuesing@gmail.com
 executables: []
 extensions: []
@@ -40,5 +40,5 @@ rubyforge_project:
 rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
-summary: k-means/k-medians clustering
+summary: k-means clustering
 test_files: []