RubyGems - hierclust - Versions diffs - 0.1.2 → 0.1.3 - Mend

hierclust 0.1.2 → 0.1.3

Files changed (7) hide show

data/History.txt CHANGED Viewed

@@ -1,3 +1,11 @@
+== 0.1.3 2008-02-10
+* 1 performance improvement
+  * added linear-time preclustering based on minimum separation distance
+* 1 major change:
+  * when minimum separation is given, the clusterer will no longer calculate
+    and return clusters smaller than "separation / 2.0"
 == 0.1.2 2008-02-07
 * 1 performance improvement

data/lib/hierclust/cluster.rb CHANGED Viewed

@@ -24,6 +24,12 @@ module Hierclust
       @y ||= @items.inject(0) {|sum, p| sum + p.y} / size
     end
+    # Add an +item+ to this Cluster.
+    def <<(item)
+      @x, @y = nil, nil # flush cached pseudo-attributes
+      @items << item
+    end
     # Returns the number of items in this Cluster.
     def size
       @items.size

data/lib/hierclust/clusterer.rb CHANGED Viewed

@@ -11,8 +11,8 @@ module Hierclust
     # Specify +separation+ to stop the clustering process once all the
     # items are at least +separation+ units apart.
     def initialize(data, separation = nil)
-      @data = data.dup
       @separation = separation
+      @data = precluster(data)
       @distances = Distances.new(@data)
     end
@@ -43,5 +43,26 @@ module Hierclust
         [Cluster.new(nearest), *outliers]
       end
     end
+    def precluster(points)
+      if @separation.nil?
+        # can't precluster when no minimum separation is given
+        return points.dup
+      end
+      if @separation == 0
+        # a separation of zero puts every point into one single cluster
+        return [Cluster.new(points)]
+      end
+      grid_size = @separation / 2.0
+      grid_clusters = Hash.new
+      points.each do |point|
+        grid_x = (point.x / grid_size).floor
+        grid_y = (point.y / grid_size).floor
+        grid_clusters[grid_x] ||= Hash.new
+        grid_clusters[grid_x][grid_y] ||= Cluster.new([])
+        grid_clusters[grid_x][grid_y] << point
+      end
+      grid_clusters.values.map{|h| h.values}.flatten
+    end
   end
 end

data/lib/hierclust/distances.rb CHANGED Viewed

@@ -21,5 +21,27 @@ module Hierclust
       end
       @outliers = @items - @nearest
     end
+=begin
+old idea
+1 calculate all distances
+2 update distances when a new cluster is created from two existing points
+3 keep distances sorted by separation so that we always know which is shortest
+new idea
+don't worry about the lower level clusters
+don't worry about the higher level clusters
+just form clusters of the desired separation
+start by dividing the points into a grid whose cells are 0.5 * sep wide
+and put all points that fall in the same grid cell into one cluster
+...
+and then do regular hierarchical clustering! we should be fine at that point.
+sweet....
+=end
   end
 end

data/lib/hierclust/version.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module Hierclust #:nodoc:
   module VERSION #:nodoc:
     MAJOR = 0
     MINOR = 1
-    TINY  = 2
+    TINY  = 3
     STRING = [MAJOR, MINOR, TINY].join('.')
   end

data/website/index.html CHANGED Viewed

@@ -33,7 +33,7 @@
     <h1>Simple Hierarchical Clustering</h1>
     <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/hierclust"; return false'>
       <p>Get Version</p>
-      <a href="http://rubyforge.org/projects/hierclust" class="numbers">0.1.2</a>
+      <a href="http://rubyforge.org/projects/hierclust" class="numbers">0.1.3</a>
     </div>
     <h1>&#x2192; &#8216;hierclust&#8217;</h1>

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: hierclust
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - Brandt Kurowski
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-02-08 00:00:00 -05:00
+date: 2008-02-10 00:00:00 -05:00
 default_executable:
 dependencies: []