kmeans-clusterer 0.4.0 → 0.5.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/kmeans-clusterer.rb +206 -146
  3. metadata +4 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: daff89d4293b6131cf1c684c9a5ef3a3d9dfa878
-  data.tar.gz: 7917de7e84065b9c55c4c501bb2587f6e86aff7b
+  metadata.gz: 83e84b661282a532269410c4bf91d98f7831cc45
+  data.tar.gz: 2ec3bf6b111d67e4f5beced25b088b59592e3cdf
 SHA512:
-  metadata.gz: 795ac3f9fb65b6a40adbb67264413fc3819bf828de13edef857ef340d5e2467de4807e35fb87277bd6aeec687ab93f11b35c0b34a92922d0b5c609a0e731be74
-  data.tar.gz: ce6dc360936e0a3fcdc4be3b4c3fd207d0b0df67ed45c36ea2b476e473eedd11b7fea35c060eabca38f733325d1aba6fa30defa56601bdcac6c65d68c27d8145
+  metadata.gz: 7735bf8e71c4fa4300793e40a299b7a4af267821860092b2dd21db16cfc407bc0b2e6a9b99c37846ffbd02b76976242d9719bfa92e244455be14208b1934d21f
+  data.tar.gz: 54f7bf8afff09caf185dbf95c8baeb4ea471e4496b29967ad88b3fc377e0063803c7810295825fe581fd744048964b592197bcfea6d2aa1178133d39e2937b0b
data/lib/kmeans-clusterer.rb CHANGED
@@ -1,17 +1,34 @@
 require 'narray'
 
 class KMeansClusterer
+  module Scaler
+    def self.mean data
+      data.mean(1)
+    end
+
+    def self.std data
+      std = data.rmsdev(1)
+      std[std.eq(0)] = 1.0 # so we don't divide by 0
+      std
+    end
+
+    def self.scale data, mean = nil, std = nil
+      data = NArray.cast(data, NArray::DFLOAT)
+      mean ||= self.mean(data)
+      std ||= self.std(data)
+      data = (data - mean) / std
+      [data, mean, std]
+    end
+  end
 
-  # Euclidean distance function. Requires instances of NArray as args
-  Distance = -> (a, b) { NMath.sqrt ((a - b)**2).sum(0) }
-  CalculateCentroid = -> (a) { a.mean(1) }
 
   class Point
-    attr_reader :data
+    attr_reader :id, :data
     attr_accessor :cluster, :label
 
-    def initialize data, label = nil
-      @data = NArray.to_na data
+    def initialize id, data, label = nil
+      @id = id
+      @data = data
       @label = label
     end
 
@@ -34,175 +51,161 @@ class KMeansClusterer
 
 
   class Cluster
-    attr_reader :centroid, :points
+    attr_reader :id, :centroid, :points
     attr_accessor :label
 
-    def initialize centroid, label = nil
+    def initialize id, centroid
+      @id = id
       @centroid = centroid
-      @label = label
       @points = []
     end
 
-    def recenter
-      if @points.empty?
-        0
-      else
-        old_centroid = @centroid
-        @centroid = calculate_centroid_from_points
-        Distance.call @centroid.data, old_centroid.data
-      end
-    end
-
     def << point
       point.cluster = self
       @points << point
     end
 
-    def reset_points
-      @points = []
-    end
-
-    def sorted_points
-      distances = Distance.call points_narray, centroid.data
-      @points.sort_by.with_index {|c, i| distances[i] }
-    end
-
-    def sum_of_squares_error
-      if @points.empty?
-        0
-      else
-        distances = Distance.call points_narray, centroid.data
-        (distances**2).sum
-      end
-    end
-
-    def sum_of_distances
-      return 0 if @points.empty?
-      Distance.call(points_narray, centroid.data).sum
+    def points_narray
+      NArray.cast @points.map(&:data)
     end
-
-    def dissimilarity point
-      distances = Distance.call points_narray, point.data
-      distances.sum / distances.length.to_f
-    end
-
-    private
-    def calculate_centroid_from_points
-      data = CalculateCentroid.call points_narray
-      Point.new data
-    end
-
-    def points_narray
-      NArray.to_na @points.map(&:data)
-    end
   end
 
 
+  DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp}
+
   def self.run k, data, opts = {}
-    raise(ArgumentError, "k cannot be greater than the number of points") if k > data.length
+    opts = DEFAULT_OPTS.merge(opts)
 
-    data = if opts[:scale_data]
-      scale_data data
-    else
-      data.map {|row| NArray.to_na(row).to_f}
+    opts[:k] = k
+
+    if opts[:scale_data]
+      data, mean, std = Scaler.scale(data)
+      opts[:mean] = mean
+      opts[:std] = std
     end
 
-    runcount = opts[:runs] || 10
-    errors = []
+    opts[:points_matrix] = NMatrix.cast(data, NArray::DFLOAT)
+    opts[:row_norms] = opts[:points_matrix].map {|v| v**2}.sum(0)
 
-    runs = runcount.times.map do |i|
-      km = new(k, data, opts).run
-      error = km.error
+    runs = opts[:runs].times.map do |i|
+      km = new(opts).run
       if opts[:log]
-        puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{error.round(2)} err"
+        puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{km.error.round(2)} err"
       end
-      errors << error
       km
     end
 
-    runs.sort_by.with_index {|run, i| errors[i] }.first
+    runs.sort_by {|run| run.error }.first.finish
   end
 
-  # see scikit-learn scale and _mean_and_std methods
-  def self.scale_data data
-    nadata = NArray.to_na(data).to_f
-    mean = nadata.mean(1)
-    std = nadata.rmsdev(1)
-    std[std.eq(0)] = 1.0 # so we don't divide by 0
-    nadata = (nadata - mean) / std
-    # convert back to an array, containing NArrays for each row
-    data.length.times.map {|i| nadata[true, i] }
-  end
 
+  attr_reader :k, :points, :clusters, :error, :iterations, :runtime
 
-  attr_reader :k, :points, :clusters, :iterations, :runtime
 
+  def initialize opts = {}
+    @k = opts[:k]
+    @init = opts[:init]
+    @labels = opts[:labels] || []
+    @row_norms = opts[:row_norms]
 
-  def initialize k, data, opts = {}
-    @k = k
-    @init = opts[:init] || :kmpp
-    labels = opts[:labels] || []
+    @points_matrix = opts[:points_matrix]
+    @points_count = @points_matrix.shape[1] if @points_matrix
+    @mean = opts[:mean]
+    @std = opts[:std]
+    @scale_data = opts[:scale_data]
 
-    @points = data.map.with_index do |instance, i|
-      Point.new instance, labels[i]
-    end
-
-    init_clusters
+    init_centroids
   end
 
   def run
     start_time = Time.now
     @iterations, @runtime = 0, 0
 
+    @cluster_point_ids = Array.new(@k) { [] }
+
     loop do
       @iterations +=1
 
-      centroids = get_cluster_centroids
+      distances = distance(@centroids, @points_matrix)
+
+      # assign point ids to @cluster_point_ids
+      @points_count.times do |i|
+        min_distance_index = distances[i, true].sort_index[0]
+        @cluster_point_ids[min_distance_index] << i
+      end
+
+      moves = []
+      updated_centroids = []
+
+      @k.times do |i|
+        centroid = NArray.cast(@centroids[true, i].flatten)
+        point_ids = @cluster_point_ids[i]
+
+        if point_ids.empty?
+          newcenter = centroid
+          moves << 0
+        else
+          points = @points_matrix[true, point_ids]
+          newcenter = points.mean(1)
+          moves << distance(centroid, newcenter)
+        end
 
-      @points.each do |point|
-        distances = Distance.call(centroids, point.data)
-        cluster = @clusters.sort_by.with_index {|c, i| distances[i] }.first
-        cluster << point
+        updated_centroids << newcenter
      end
 
-      moves = clusters.map(&:recenter)
+      @centroids = NMatrix.cast updated_centroids
 
       break if moves.max < 0.001 # i.e., no movement
       break if @iterations >= 300
 
-      clusters.each(&:reset_points)
+      @cluster_point_ids = Array.new(@k) { [] }
     end
 
+    @error = calculate_error
     @runtime = Time.now - start_time
     self
   end
 
-  def error
-    @clusters.map(&:sum_of_squares_error).reduce(:+)
+  def finish
+    set_points
+    set_clusters
+    self
  end
 
-  def closest_cluster point = origin
-    sorted_clusters(point).first
+  def predict data
+    data, _m, _s = Scaler.scale(data, @mean, @std) if @scale_data
+    data = NMatrix.cast(data, NArray::DFLOAT)
+    distances = distance(@centroids, data, nil)
+    data.shape[1].times.map do |i|
+      distances[i, true].sort_index[0] # index of closest cluster
+    end
  end
 
   def sorted_clusters point = origin
-    point = Point.new(point) unless point.is_a?(Point)
+    point = wrap_point point
     centroids = get_cluster_centroids
-    distances = Distance.call(centroids, point.data)
+    distances = distance(centroids, point.data)
     @clusters.sort_by.with_index {|c, i| distances[i] }
   end
 
   def origin
-    Point.new Array.new(@points[0].dimension, 0)
+    wrap_point Array.new(@points[0].dimension, 0)
   end
 
   def silhouette_score
-    return 1.0 if @clusters.length < 2
-
-    scores = @points.map do |point|
-      acluster, bcluster = sorted_clusters(point).slice(0,2)
-      a = acluster.dissimilarity(point)
-      b = bcluster.dissimilarity(point)
+    return 1.0 if @k < 2
+
+    distances = distance(@centroids, @points_matrix)
+
+    scores = @points_count.times.map do |i|
+      point = get_point i
+      cluster_indexes = distances[i, true].sort_index
+
+      c1_points = get_points_for_centroid cluster_indexes[0]
+      c2_points = get_points_for_centroid cluster_indexes[1]
+
+      a = dissimilarity(c1_points, point)
+      b = dissimilarity(c2_points, point)
       (b - a) / [a,b].max
     end
 
@@ -210,73 +213,130 @@ class KMeansClusterer
   end
 
   private
-  def init_clusters
+  def wrap_point point
+    return point if point.is_a?(Point)
+    Point.new(0, NArray.to_na(point).to_f)
+  end
+
+  def dissimilarity points, point
+    distances = distance points, point
+    distances.sum / distances.length.to_f
+  end
+
+  def init_centroids
     case @init
     when :random
-      random_cluster_init
+      random_centroid_init
    when Array
-      custom_cluster_init
+      custom_centroid_init
    else
-      kmpp_cluster_init
+      kmpp_centroid_init
    end
  end
 
   # k-means++
-  def kmpp_cluster_init
-    @clusters = []
-    pick = rand(@points.length)
-    centroid = Point.new @points[pick].data.to_a
-    @clusters << Cluster.new(centroid, 1)
-
-    while @clusters.length < @k
-      centroids = get_cluster_centroids
-
-      d2 = @points.map do |point|
-        dists = Distance.call centroids, point.data
-        dists.min**2 # closest cluster distance, squared
+  def kmpp_centroid_init
+    centroid_ids = []
+    pick = rand(@points_count)
+    centroid_ids << pick
+
+    while centroid_ids.length < @k
+      centroids = @points_matrix[true, centroid_ids]
+
+      distances = distance(centroids, @points_matrix)
+
+      d2 = []
+      @points_count.times do |i|
+        min_distance = distances[i, true].min
+        d2 << min_distance**2
      end
 
       d2 = NArray.to_na d2
       probs = d2 / d2.sum
       cumprobs = probs.cumsum
       r = rand
-      # pick = cumprobs.to_a.index {|prob| r < prob }
       pick = (cumprobs >= r).where[0]
-      centroid = Point.new @points[pick].data.to_a
-      cluster = Cluster.new(centroid, @clusters.length + 1)
-      @clusters << cluster
+      centroid_ids << pick
    end
-  end
 
-  def custom_cluster_init
-    @clusters = @init.map.with_index do |instance, i|
-      point = Point.new NArray.to_na(instance).to_f
-      Cluster.new point, i+1
-    end
+    @centroids = @points_matrix[true, centroid_ids]
  end
 
-  def random_cluster_init
-    @clusters = pick_k_random_points.map.with_index {|centroid, i| Cluster.new centroid, i+1 }
+  def custom_centroid_init
+    @centroids = NMatrix.cast @init
+    @k = @init.length
  end
 
-  def pick_k_random_points
-    pick_k_random_indexes.map {|i| Point.new @points[i].data.to_a }
+  def random_centroid_init
+    @centroids = @points_matrix[true, pick_k_random_indexes]
  end
 
   def pick_k_random_indexes
-    @points.length.times.to_a.shuffle.slice(0, @k)
+    @points_count.times.to_a.shuffle.slice(0, @k)
  end
 
   def get_cluster_centroids
     NArray.to_na @clusters.map {|c| c.centroid.data }
   end
-end
 
-class KMediansClusterer < KMeansClusterer
-  Distance = -> (a, b) { (a - b).abs.sum(0) }
-  CalculateCentroid = -> (a) { a.rot90.median(0) }
+  def set_points
+    @points = @points_count.times.map do |i|
+      data = NArray.cast @points_matrix[true, i].flatten
+      Point.new(i, data, @labels[i])
+    end
+  end
 
-  def error
-    @clusters.map(&:sum_of_distances).reduce(:+)
-  end
+  def set_clusters
+    @clusters = @k.times.map do |i|
+      centroid = NArray.cast @centroids[true, i].flatten
+      c = Cluster.new i, Point.new(-i, centroid)
+      @cluster_point_ids[i].each do |p|
+        c << @points[p]
+      end
+      c
+    end
+  end
+
+  def calculate_error
+    errors = @k.times.map do |i|
+      centroid = get_centroid i
+      points = get_points_for_centroid i
+
+      if points.empty?
+        0
+      else
+        distances = distance points, centroid
+        (distances**2).sum
+      end
+    end
+
+    errors.reduce(:+)
+  end
+
+  def get_point i
+    NArray.cast @points_matrix[true, i].flatten
+  end
+
+  def get_centroid i
+    NArray.cast(@centroids[true, i].flatten)
+  end
+
+  def get_points_for_centroid i
+    point_ids = @cluster_point_ids[i]
+    NArray.cast @points_matrix[true, point_ids]
+  end
+
+  def distance x, y, yy = @row_norms
+    if x.is_a?(NMatrix) && y.is_a?(NMatrix)
+      xx = x.map {|v| v**2}.sum(0)
+      yy ||= y.map {|v| v**2}.sum(0)
+      xy = x * y.transpose
+      distance = xy * -2
+      distance += xx
+      distance += yy.transpose
+      NMath.sqrt distance
+    else
+      NMath.sqrt ((x - y)**2).sum(0)
+    end
+  end
 end
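For reference, here is a minimal usage sketch of the 0.5.0 API as it appears in the library diff above. The sample data, the choice of k, and the option values are illustrative only and are not taken from the gem's documentation.

require 'kmeans-clusterer'

# Toy 2-D dataset: each row is a point (values chosen only for illustration).
data = [[3, 3], [-3, 3], [3, -3], [-3, -3],
        [3, 4], [-3, 4], [3, -4], [-3, -4]]

# 0.5.0 merges user options into DEFAULT_OPTS, so :scale_data, :runs, :log and
# :init can all be passed here; run returns the best of the restarts with
# finish already called, so points and clusters are populated.
kmeans = KMeansClusterer.run 4, data, scale_data: true, runs: 5

kmeans.clusters.each do |cluster|
  puts "cluster #{cluster.id}: points #{cluster.points.map(&:id).inspect}"
end

puts "SSE: #{kmeans.error.round(2)}"
puts "silhouette: #{kmeans.silhouette_score.round(2)}"

# predict maps new rows to the index of their closest centroid, applying the
# stored mean/std when scale_data was used.
p kmeans.predict([[0, 5], [0, -5]])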
metadata CHANGED
@@ -1,16 +1,16 @@
 --- !ruby/object:Gem::Specification
 name: kmeans-clusterer
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Geoff Buesing
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-29 00:00:00.000000000 Z
+date: 2015-02-05 00:00:00.000000000 Z
 dependencies: []
-description: k-means/k-medians clustering. Uses NArray for fast calculations.
+description: k-means clustering. Uses NArray for fast calculations.
 email: gbuesing@gmail.com
 executables: []
 extensions: []
@@ -40,5 +40,5 @@ rubyforge_project:
 rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
-summary: k-means/k-medians clustering
+summary: k-means clustering
 test_files: []