Empact-hierclust 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
@@ -0,0 +1,82 @@
1
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

# Gem specification for Empact-hierclust 0.2.0.
#
# Setters that are not available on every RubyGems release are guarded with
# +respond_to?+ so the file keeps loading across RubyGems versions.
Gem::Specification.new do |s|
  s.name = %q{Empact-hierclust}
  s.version = "0.2.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Brandt Kurowski", "Ben Woosley"]
  s.date = %q{2010-11-01}
  s.description = %q{performs hierarchical clustering on points in Euclidian space}
  s.email = %q{ben.woosley@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".gitignore",
    "Empact-hierclust.gemspec",
    "History.txt",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "examples/visualize.rb",
    "lib/hierclust.rb",
    "lib/hierclust/cluster.rb",
    "lib/hierclust/clusterer.rb",
    "lib/hierclust/distances.rb",
    "lib/hierclust/point.rb",
    "log/debug.log",
    "script/destroy",
    "script/generate",
    "script/txt2html",
    "spec/hierclust/cluster_spec.rb",
    "spec/hierclust/clusterer_spec.rb",
    "spec/hierclust/distances_spec.rb",
    "spec/hierclust/point_spec.rb",
    "spec/hierclust_spec.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb",
    "tasks/deployment.rake",
    "tasks/environment.rake",
    "tasks/rspec.rake",
    "tasks/website.rake",
    "website/index.txt",
    "website/javascripts/rounded_corners_lite.inc.js",
    "website/stylesheets/screen.css",
    "website/template.rhtml"
  ]
  s.homepage = %q{http://github.com/Empact/hierclust}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  # Guarded like the other optional setters in this file, so a RubyGems
  # release without this attribute does not abort loading the gemspec.
  s.rubyforge_project = %q{hierclust} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.3.7}
  s.summary = %q{performs hierarchical clustering in N dimensions}
  s.test_files = [
    "spec/hierclust/cluster_spec.rb",
    "spec/hierclust/clusterer_spec.rb",
    "spec/hierclust/distances_spec.rb",
    "spec/hierclust/point_spec.rb",
    "spec/hierclust_spec.rb",
    "spec/spec_helper.rb",
    "examples/visualize.rb"
  ]

  versioned_spec = s.respond_to?(:specification_version)
  s.specification_version = 3 if versioned_spec

  # rspec is only needed to run the specs. Development dependencies are used
  # when the spec format is versioned and RubyGems is at least 1.2.0;
  # otherwise fall back to a plain dependency.
  if versioned_spec && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
    s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
  else
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end
82
+
@@ -0,0 +1,47 @@
1
+ == 0.2.0 2010-11-01
2
+
3
+ * 1 major enhancement:
4
+ * Clustering & Points now support N dimensions (up from 2)
5
+ * 1 minor enhancement:
6
+ * Switch to jeweler
7
+
8
+ == 0.1.5 2008-03-21
9
+
10
+ * 1 minor enhancement:
11
+ * added cluster radius
12
+ * 1 new example script:
13
+ * demonstrates SVG rendering of points and clusters
14
+
15
+ == 0.1.4 2008-02-13
16
+
17
+ * 1 minor enhancement:
18
+ * gave linear-time preclustering an independent "resolution" parameter
19
+ * 1 bugfix:
20
+ * corrected cluster coordinate calculation
21
+
22
+ == 0.1.3 2008-02-10
23
+
24
+ * 1 performance improvement:
25
+ * added linear-time preclustering based on minimum separation distance
26
+ * 1 major change:
27
+ * when minimum separation is given, the clusterer will no longer calculate
28
+ and return clusters smaller than "separation / 2.0"
29
+
30
+ == 0.1.2 2008-02-07
31
+
32
+ * 1 performance improvement:
33
+ * refactored Distances to be more intelligent about precalculated values
34
+ * 1 bugfix:
35
+ * correct degenerate case of Hierclust::Point.points
36
+
37
+ == 0.1.1 2008-02-04
38
+
39
+ * 1 minor enhancement:
40
+ * add method for returning flattened list of points in a cluster
41
+ * 1 bugfix:
42
+ * correct intermittent failure of Clusterer spec
43
+
44
+ == 0.1.0 2008-02-01
45
+
46
+ * 1 major enhancement:
47
+ * Initial release
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008 Brandt Kurowski <brandt@kurowski.net>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,28 @@
1
+ = Hierclust
2
+
3
+ A simple hierarchical clustering library for spatial data.
4
+
5
+ == Example
6
+
7
+ require 'hierclust'
8
+ points = (1..6).map { Hierclust::Point.new(rand(10), rand(10)) }
9
+ clusterer = Hierclust::Clusterer.new(points)
10
+ puts clusterer.clusters # => [[[(4, 9), (4, 8)], (9, 6)], [[(1, 4), (3, 1)], (6, 3)]]
11
+
12
+ == Contact for this fork
13
+
14
+ Ben Woosley <ben.woosley@gmail.com>
15
+
16
+ == Note on Patches/Pull Requests
17
+
18
+ * Fork the project.
19
+ * Make your feature addition or bug fix.
20
+ * Add tests for it. This is important so I don't break it in a
21
+ future version unintentionally.
22
+ * Commit, do not mess with rakefile, version, or history.
23
+ (If you want to have your own version, that is fine, but bump the version in a commit by itself so I can ignore it when I pull.)
24
+ * Send me a pull request. Bonus points for topic branches.
25
+
26
+ == Copyright
27
+
28
+ Copyright (c) 2010 Brandt Kurowski. See LICENSE for details.
@@ -0,0 +1,49 @@
1
require 'rubygems'
require 'rake'

# Packaging and release tasks come from jeweler. When jeweler cannot be
# loaded, a hint is printed and the remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "Empact-hierclust"
    gem.summary = %Q{performs hierarchical clustering in N dimensions}
    gem.description = %Q{performs hierarchical clustering on points in Euclidian space}
    gem.email = "ben.woosley@gmail.com"
    gem.homepage = "http://github.com/Empact/hierclust"
    gem.authors = ["Brandt Kurowski", "Ben Woosley"]
    gem.rubyforge_project = "hierclust"
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
  Jeweler::RubyforgeTasks.new do |rubyforge|
    rubyforge.doc_task = "rdoc"
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Runs every *_spec.rb file under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Same specs, run with rcov enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is not defined in this file (presumably it comes from
# Jeweler::Tasks - confirm). Only depend on it when it actually exists, so
# that `rake spec` still works after the LoadError fallback above.
task :spec => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip the trailing newline of the VERSION file so it does not end up in
  # the generated title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "hierclust #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.0
@@ -0,0 +1,42 @@
1
#!/usr/bin/env ruby

# Example script: clusters 20 random points and prints the points and
# clusters as an SVG document on standard output.

# Prefer the library from this checkout; fall back to an installed gem.
begin
  $LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
  require 'hierclust'
rescue LoadError
  require 'rubygems'
  require 'hierclust'
end

points = (1..20).map { Hierclust::Point.new(rand(800), rand(600)) }
clusterer = Hierclust::Clusterer.new(points)

print %Q{<?xml version="1.0" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg width="800" height="600" version="1.1"
xmlns="http://www.w3.org/2000/svg">
}

# Prints one SVG circle for +cluster+.
#
# A Hierclust::Cluster is drawn as an outlined circle of its radius, followed
# (recursively) by each of its items. Anything else is treated as a single
# point and drawn as a small red dot.
def plot(cluster)
  if cluster.kind_of? Hierclust::Cluster
    # stroke-width has to be a number: use the number of items rather than
    # interpolating the items array itself into the attribute.
    # NOTE(review): item count is assumed to be the intended width - confirm.
    print %Q{
      <circle cx="#{cluster.coordinates[0]}" cy="#{cluster.coordinates[1]}" r="#{cluster.radius}"
      fill="none" stroke="black" stroke-width="#{cluster.items.size}"/>
    }
    cluster.items.each {|item| plot(item)}
  else
    print %Q{
      <circle cx="#{cluster.coordinates[0]}" cy="#{cluster.coordinates[1]}" r="2"
      fill="red" stroke="none"/>
    }
  end
end

clusterer.clusters.each {|cluster| plot cluster}

# Frame around the drawing area, then close the document.
print %Q{
<rect x="1" y="1" width="798" height="598"
fill="none" stroke="grey" stroke-width="2" />
</svg>
}
@@ -0,0 +1,6 @@
1
+ $:.unshift File.dirname(__FILE__)
2
+
3
+ require 'hierclust/point'
4
+ require 'hierclust/cluster'
5
+ require 'hierclust/distances'
6
+ require 'hierclust/clusterer'
@@ -0,0 +1,61 @@
1
module Hierclust
  # A Cluster represents a collection of Points. A Cluster has its own
  # coordinates that are the mean of the coordinates of its points.
  # Because a Cluster has coordinates, it can act as a Point and therefore
  # be included in other Clusters.
  class Cluster < Point
    # An array of items in this cluster
    attr_reader :items

    # Create a Cluster for the given set of +items+.
    def initialize(items)
      @items = items
    end

    # Replace the items of this Cluster. The memoized coordinates are
    # dropped so they are recalculated for the new items.
    def items=(items)
      @coordinates = nil
      @items = items
    end

    # Returns the average coordinates of all points in this Cluster, or
    # +nil+ when the Cluster has no items or its items contain no points.
    # The result is memoized until the items change through #<< or #items=.
    def coordinates
      return nil if size == 0
      @coordinates ||= begin
        coords = points.map {|point| point.coordinates }
        if coords.empty?
          nil
        else
          # transpose the per-point coordinates into one list per dimension
          first, *rest = coords
          first.zip(*rest).map do |values|
            values.inject(0.0) {|sum, value| sum + value } / values.size
          end
        end
      end
    end

    # Add an +item+ to this Cluster.
    def <<(item)
      @coordinates = nil
      @items << item
    end

    # Returns the number of items in this Cluster.
    def size
      @items.size
    end

    # Returns the distance from the center of this Cluster to the edge.
    #
    # Returns +nil+ for an empty Cluster and 0 for a single item. For two
    # items the radius is half the distance between them plus both of their
    # radii. For three or more items it is the largest "distance to the item
    # plus the item's radius", rather than a value derived from only the
    # first two items.
    # NOTE(review): the three-or-more case assumes Point#distance_to measures
    # between #coordinates - confirm against Point.
    def radius
      return nil if @items.empty?
      return 0 if @items.size == 1
      if @items.size == 2
        first, second = @items
        return (first.distance_to(second) + first.radius + second.radius) / 2.0
      end
      @items.map {|item| distance_to(item) + item.radius }.max
    end

    # Returns a flat list of all the points contained in either this cluster
    # or any of the clusters it contains.
    def points
      @items.map {|item| item.points}.flatten
    end

    # Returns +true+ if this Cluster includes the given +item+, otherwise
    # returns +false+.
    def include?(item)
      @items.include? item
    end

    # Returns a legible representation of this Cluster and its items.
    def to_s
      "[#{@items.join(', ')}]"
    end
  end
end
@@ -0,0 +1,70 @@
1
module Hierclust
  # Groups a set of Points by hierarchical clustering. Clustering ends when
  # only one top-level item is left, or earlier once the closest remaining
  # items are further apart than the requested minimum separation.
  class Clusterer
    # The Distances most recently calculated for the items being clustered
    attr_reader :distances

    # Create a new Clusterer for the given +data+.
    #
    # +separation+:: when given, clustering stops as soon as the closest
    #                remaining items are more than +separation+ units apart.
    # +resolution+:: when given together with +separation+, points are first
    #                put into clusters by grid cell (each coordinate divided
    #                by +resolution+ and floored). These "pre-clusters" are
    #                then hierarchically clustered as normal.
    def initialize(data, separation = nil, resolution = nil)
      @separation = separation
      @resolution = resolution
      @data = precluster(data)
      @distances = Distances.new(@data)
    end

    # Calculates and returns the set of clusters.
    def clusters
      return @data if separated?
      until @data.length <= 1
        @distances = Distances.new(@data)
        return @data if separated?
        @data = find_cluster
      end
      @data
    end

    private

    # True when a minimum separation was requested and the closest pair in
    # the current distances is already further apart than that.
    def separated?
      @separation && @distances.separation > @separation
    end

    # Performs one merge step: the nearest pair becomes a new Cluster and all
    # other items are carried over unchanged.
    def find_cluster
      return [] if @data.empty?
      return [Cluster.new(@data)] if @data.length <= 2
      [Cluster.new(@distances.nearest), *@distances.outliers]
    end

    # Groups +points+ into per-grid-cell clusters before the hierarchical
    # pass. Without both a resolution and a separation a copy of +points+ is
    # returned; with a separation of 0 everything goes into one Cluster.
    def precluster(points)
      # preclustering needs a lower bound on resolution and a min separation
      return points.dup unless @resolution && @separation
      # if no separation is asked for, it's all one cluster
      return [Cluster.new(points)] if @separation == 0

      grid = {}
      points.each do |point|
        cell = point.coordinates.map {|coord| (coord / @resolution).floor }
        grid[cell] ||= Cluster.new([])
        grid[cell] << point
      end
      grid.values
    end
  end
end
@@ -0,0 +1,47 @@
1
module Hierclust
  # Represents the pair-wise distances between a set of items.
  #
  # Every item has to respond to +distance_to(other)+.
  class Distances
    # nearest::    the two closest items, or +[]+ when there are fewer than
    #              two items
    # outliers::   all items that are not part of +nearest+
    # separation:: the distance between the +nearest+ items, or 0 when there
    #              is no pair
    attr_reader :nearest, :outliers, :separation

    # Create a new Distances for the given +items+
    #
    # When several pairs share the smallest distance, the first pair in
    # iteration order is kept.
    def initialize(items)
      @items = items
      @separation = 0
      @nearest = []
      @items.each_with_index do |origin, index|
        @items[(index + 1)..-1].each do |other|
          distance = origin.distance_to(other)
          # "no pair found yet" is detected through @nearest, not through a
          # separation of 0, so that coincident items (distance 0) are not
          # replaced by a pair that is further apart
          if @nearest.empty? or distance < @separation
            @separation = distance
            @nearest = [origin, other]
          end
        end
      end
      # compare by identity: items that merely equal one of the nearest
      # items are separate items and have to stay in the outliers
      @outliers = @items.reject do |item|
        @nearest.any? {|near| near.equal?(item) }
      end
    end

    # Design notes kept from the original source:
    #
    # old idea
    #
    # 1 calculate all distances
    # 2 update distances when a new cluster is created from two existing points
    # 3 keep distances sorted by separation so that we always know which is shortest
    #
    # new idea
    #
    # don't worry about the lower level clusters
    # don't worry about the higher level clusters
    # just form clusters of the desired separation
    # start by dividing the points into a grid of 0.5 * sep
    # and put all points in the same grid cells together
    # ...
    # and then do regular hierarchical clustering! we should be fine at that point.
    # sweet....
  end
end