Empact-hierclust 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
@@ -0,0 +1,82 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{Empact-hierclust}
8
+ s.version = "0.2.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Brandt Kurowski", "Ben Woosley"]
12
+ s.date = %q{2010-11-01}
13
+ s.description = %q{performs hierarchical clustering on points in Euclidian space}
14
+ s.email = %q{ben.woosley@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".gitignore",
21
+ "Empact-hierclust.gemspec",
22
+ "History.txt",
23
+ "LICENSE",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "examples/visualize.rb",
28
+ "lib/hierclust.rb",
29
+ "lib/hierclust/cluster.rb",
30
+ "lib/hierclust/clusterer.rb",
31
+ "lib/hierclust/distances.rb",
32
+ "lib/hierclust/point.rb",
33
+ "log/debug.log",
34
+ "script/destroy",
35
+ "script/generate",
36
+ "script/txt2html",
37
+ "spec/hierclust/cluster_spec.rb",
38
+ "spec/hierclust/clusterer_spec.rb",
39
+ "spec/hierclust/distances_spec.rb",
40
+ "spec/hierclust/point_spec.rb",
41
+ "spec/hierclust_spec.rb",
42
+ "spec/spec.opts",
43
+ "spec/spec_helper.rb",
44
+ "tasks/deployment.rake",
45
+ "tasks/environment.rake",
46
+ "tasks/rspec.rake",
47
+ "tasks/website.rake",
48
+ "website/index.txt",
49
+ "website/javascripts/rounded_corners_lite.inc.js",
50
+ "website/stylesheets/screen.css",
51
+ "website/template.rhtml"
52
+ ]
53
+ s.homepage = %q{http://github.com/Empact/hierclust}
54
+ s.rdoc_options = ["--charset=UTF-8"]
55
+ s.require_paths = ["lib"]
56
+ s.rubyforge_project = %q{hierclust}
57
+ s.rubygems_version = %q{1.3.7}
58
+ s.summary = %q{performs hierarchical clustering in N dimensions}
59
+ s.test_files = [
60
+ "spec/hierclust/cluster_spec.rb",
61
+ "spec/hierclust/clusterer_spec.rb",
62
+ "spec/hierclust/distances_spec.rb",
63
+ "spec/hierclust/point_spec.rb",
64
+ "spec/hierclust_spec.rb",
65
+ "spec/spec_helper.rb",
66
+ "examples/visualize.rb"
67
+ ]
68
+
69
+ if s.respond_to? :specification_version then
70
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
71
+ s.specification_version = 3
72
+
73
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
74
+ s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
75
+ else
76
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
77
+ end
78
+ else
79
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
80
+ end
81
+ end
82
+
@@ -0,0 +1,47 @@
1
+ == 0.2.0 2010-11-01
2
+
3
+ * 1 major enhancement:
4
+ * Clustering & Points now support N dimensions (up from 2)
5
+ * 1 minor enhancement:
6
+ * Switch to jeweler
7
+
8
+ == 0.1.5 2008-03-21
9
+
10
+ * 1 minor enhancement:
11
+ * added cluster radius
12
+ * 1 new example script:
13
+ * demonstrates SVG rendering of points and clusters
14
+
15
+ == 0.1.4 2008-02-13
16
+
17
+ * 1 minor enhancement:
18
+ * gave linear-time preclustering an independent "resolution" parameter
19
+ * 1 bugfix:
20
+ * corrected cluster coordinate calculation
21
+
22
+ == 0.1.3 2008-02-10
23
+
24
+ * 1 performance improvement
25
+ * added linear-time preclustering based on minimum separation distance
26
+ * 1 major change:
27
+ * when minimum separation is given, the clusterer will no longer calculate
28
+ and return clusters smaller than "separation / 2.0"
29
+
30
+ == 0.1.2 2008-02-07
31
+
32
+ * 1 performance improvement
33
+ * refactored Distances to be more intelligent about precalculated values
34
+ * 1 bugfix:
35
+ * correct degenerate case of Hierclust::Point.points
36
+
37
+ == 0.1.1 2008-02-04
38
+
39
+ * 1 minor enhancement:
40
+ * add method for returning flattened list of points in a cluster
41
+ * 1 bugfix:
42
+ * correct intermittent failure of Clusterer spec
43
+
44
+ == 0.1.0 2008-02-01
45
+
46
+ * 1 major enhancement:
47
+ * Initial release
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008 Brandt Kurowski <brandt@kurowski.net>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,28 @@
1
+ = Hierclust
2
+
3
+ A simple hierarchical clustering library for spatial data.
4
+
5
+ == Example
6
+
7
+ require 'hierclust'
8
+ points = (1..6).map { Hierclust::Point.new(rand(10), rand(10)) }
9
+ clusterer = Hierclust::Clusterer.new(points)
10
+ puts clusterer.clusters # => [[[(4, 9), (4, 8)], (9, 6)], [[(1, 4), (3, 1)], (6, 3)]]
11
+
12
+ == Contact for this fork
13
+
14
+ Ben Woosley <ben.woosley@gmail.com>
15
+
16
+ == Note on Patches/Pull Requests
17
+
18
+ * Fork the project.
19
+ * Make your feature addition or bug fix.
20
+ * Add tests for it. This is important so I don't break it in a
21
+ future version unintentionally.
22
+ * Commit, do not mess with rakefile, version, or history.
23
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
24
+ * Send me a pull request. Bonus points for topic branches.
25
+
26
+ == Copyright
27
+
28
+ Copyright (c) 2010 Brandt Kurowski. See LICENSE for details.
@@ -0,0 +1,49 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "Empact-hierclust"
8
+ gem.summary = %Q{performs hierarchical clustering in N dimensions}
9
+ gem.description = %Q{performs hierarchical clustering on points in Euclidian space}
10
+ gem.email = "ben.woosley@gmail.com"
11
+ gem.homepage = "http://github.com/Empact/hierclust"
12
+ gem.authors = ["Brandt Kurowski", "Ben Woosley"]
13
+ gem.rubyforge_project = "hierclust"
14
+ gem.add_development_dependency "rspec", ">= 1.2.9"
15
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
16
+ end
17
+ Jeweler::GemcutterTasks.new
18
+ Jeweler::RubyforgeTasks.new do |rubyforge|
19
+ rubyforge.doc_task = "rdoc"
20
+ end
21
+ rescue LoadError
22
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
23
+ end
24
+
25
+ require 'spec/rake/spectask'
26
+ Spec::Rake::SpecTask.new(:spec) do |spec|
27
+ spec.libs << 'lib' << 'spec'
28
+ spec.spec_files = FileList['spec/**/*_spec.rb']
29
+ end
30
+
31
+ Spec::Rake::SpecTask.new(:rcov) do |spec|
32
+ spec.libs << 'lib' << 'spec'
33
+ spec.pattern = 'spec/**/*_spec.rb'
34
+ spec.rcov = true
35
+ end
36
+
37
+ task :spec => :check_dependencies
38
+
39
+ task :default => :spec
40
+
41
+ require 'rake/rdoctask'
42
+ Rake::RDocTask.new do |rdoc|
43
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
44
+
45
+ rdoc.rdoc_dir = 'rdoc'
46
+ rdoc.title = "hierclust #{version}"
47
+ rdoc.rdoc_files.include('README*')
48
+ rdoc.rdoc_files.include('lib/**/*.rb')
49
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.0
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ begin
4
+ $LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
5
+ require 'hierclust'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'hierclust'
9
+ end
10
+
11
+ points = (1..20).map { Hierclust::Point.new(rand(800), rand(600)) }
12
+ clusterer = Hierclust::Clusterer.new(points)
13
+
14
+ print %Q{<?xml version="1.0" standalone="no"?>
15
+ <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
16
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
17
+ <svg width="800" height="600" version="1.1"
18
+ xmlns="http://www.w3.org/2000/svg">
19
+ }
20
+
21
# Recursively prints SVG markup for +cluster+ to standard output.
#
# A Hierclust::Cluster is drawn as an unfilled black circle centred on the
# cluster's first two coordinates with the cluster's radius, followed by the
# markup for each of its items. Anything else is treated as a single point
# and drawn as a small red dot at its first two coordinates.
def plot(cluster)
  if cluster.kind_of? Hierclust::Cluster
    # stroke-width must be a number: use the count of direct items rather
    # than interpolating the items array itself into the attribute.
    print %Q{
  <circle cx="#{cluster.coordinates[0]}" cy="#{cluster.coordinates[1]}" r="#{cluster.radius}"
  fill="none" stroke="black" stroke-width="#{cluster.items.size}"/>
  }
    cluster.items.each { |item| plot(item) }
  else
    print %Q{
  <circle cx="#{cluster.coordinates[0]}" cy="#{cluster.coordinates[1]}" r="2"
  fill="red" stroke="none"/>
  }
  end
end
35
+
36
+ clusterer.clusters.each {|cluster| plot cluster}
37
+
38
+ print %Q{
39
+ <rect x="1" y="1" width="798" height="598"
40
+ fill="none" stroke="grey" stroke-width="2" />
41
+ </svg>
42
+ }
@@ -0,0 +1,6 @@
1
+ $:.unshift File.dirname(__FILE__)
2
+
3
+ require 'hierclust/point'
4
+ require 'hierclust/cluster'
5
+ require 'hierclust/distances'
6
+ require 'hierclust/clusterer'
@@ -0,0 +1,61 @@
1
module Hierclust
  # A Cluster represents a collection of Points. A Cluster has its own
  # coordinates that are the mean of the coordinates of its points.
  # Because a Cluster has coordinates, it can act as a Point and therefore
  # be included in other Clusters.
  class Cluster < Point
    # An array of items in this cluster
    attr_reader :items

    # Create a Cluster for the given set of +items+.
    def initialize(items)
      @items = items
    end

    # Replace the items of this Cluster with +items+.
    #
    # The memoized coordinates are discarded so that they are recalculated
    # from the new items on the next call to #coordinates.
    def items=(items)
      @coordinates = nil
      @items = items
    end

    # Returns the average coordinates of all points in this Cluster, one
    # value per dimension, or +nil+ if the Cluster contains no points.
    #
    # The result is memoized until the items of this Cluster change.
    def coordinates
      return nil if size == 0
      return @coordinates if @coordinates

      per_point = points.map { |point| point.coordinates }
      # Items may themselves be empty Clusters, leaving nothing to average.
      return nil if per_point.empty?

      # Regroup [[x1, y1, ...], [x2, y2, ...]] into one array per dimension.
      per_axis = per_point.first.zip(*per_point[1..-1])
      @coordinates = per_axis.map do |values|
        values.inject(0.0) { |sum, value| sum + value } / values.size
      end
    end

    # Add an +item+ to this Cluster. Returns the updated array of items.
    def <<(item)
      @coordinates = nil
      @items << item
    end

    # Returns the number of items in this Cluster.
    def size
      @items.size
    end

    # Returns the distance from the center of this Cluster to the edge.
    #
    # Returns +nil+ for an empty Cluster and 0 for a Cluster with a single
    # item. For two items the radius is half of the distance between them
    # plus their own radii.
    #
    # Raises RuntimeError for Clusters with more than two items, for which
    # no radius calculation is implemented.
    def radius
      return nil if @items.empty?
      return 0 if @items.size == 1
      if @items.size > 2
        raise "radius not implemented for clusters with more than two items"
      end

      first, second = @items
      (first.distance_to(second) + first.radius + second.radius) / 2.0
    end

    # Returns a flat list of all the points contained in either this cluster
    # or any of the clusters it contains.
    def points
      @items.map { |item| item.points }.flatten
    end

    # Returns +true+ if this Cluster includes the given +item+, otherwise
    # returns +false+. Only direct items are considered.
    def include?(item)
      @items.include? item
    end

    # Returns a legible representation of this Cluster and its items.
    def to_s
      "[#{@items.join(', ')}]"
    end
  end
end
@@ -0,0 +1,70 @@
1
module Hierclust
  # Performs Hierarchical Clustering on a set of Points. Clustering ends
  # when the hierarchy is complete, or earlier once the remaining clusters
  # are further apart than a given minimum distance.
  class Clusterer
    # The Distances most recently calculated for the items being clustered
    attr_reader :distances

    # Create a new Clusterer for the given +data+.
    #
    # +separation+, when given, stops the clustering process as soon as the
    # closest pair of items is more than +separation+ units apart.
    #
    # +resolution+, when given together with +separation+, enables a
    # "pre-clustering" pass: points are first bucketed into clusters purely
    # by their coordinates divided by +resolution+, and those buckets are
    # then hierarchically clustered as normal.
    def initialize(data, separation = nil, resolution = nil)
      @separation = separation
      @resolution = resolution
      @data = precluster(data)
      @distances = Distances.new(@data)
    end

    # Calculates and returns the set of clusters.
    def clusters
      return @data if separated?

      until @data.length <= 1
        @distances = Distances.new(@data)
        return @data if separated?
        @data = find_cluster
      end
      @data
    end

    private

    # True when a minimum separation was requested and the closest pair in
    # the current distances is already further apart than that.
    def separated?
      @separation && @distances.separation > @separation
    end

    # Performs one clustering step: wraps the nearest pair in a new Cluster
    # and keeps every other item alongside it. Two or fewer remaining items
    # collapse into a single Cluster.
    def find_cluster
      return [] if @data.length == 0
      return [Cluster.new(@data)] if @data.length <= 2

      closest_pair = @distances.nearest
      remainder = @distances.outliers
      [Cluster.new(closest_pair), *remainder]
    end

    # Buckets +points+ into grid cells of size +resolution+ and returns one
    # Cluster per occupied cell.
    def precluster(points)
      # preclustering is only applicable given lower bound on resolution
      # can't precluster w/ no min separation given
      return points.dup unless @resolution && @separation

      # if no separation is asked for, it's all one cluster
      return [Cluster.new(points)] if @separation == 0

      cells = {}
      points.each do |point|
        cell = point.coordinates.map { |coord| (coord / @resolution).floor }
        cells[cell] ||= Cluster.new([])
        cells[cell] << point
      end
      cells.values
    end
  end
end
@@ -0,0 +1,47 @@
1
module Hierclust
  # Represents the pair-wise distances between a set of items.
  #
  # After construction:
  # * +nearest+ is the closest pair of items ([] when there are fewer than
  #   two items),
  # * +separation+ is the distance between that pair (0 when there is no
  #   pair),
  # * +outliers+ is every item that is not part of the nearest pair.
  class Distances
    attr_reader :nearest, :outliers, :separation

    # Create a new Distances for the given +items+. Every item must respond
    # to +distance_to+. The +items+ array itself is not modified.
    def initialize(items)
      @items = items
      @nearest = []
      # nil means "no pair examined yet". A numeric sentinel such as 0 would
      # be indistinguishable from a genuine zero distance between two
      # coincident items and would let a later, larger distance replace it.
      closest = nil
      @items.each_with_index do |origin, index|
        @items[(index + 1)..-1].each do |other|
          distance = origin.distance_to(other)
          if closest.nil? || distance < closest
            closest = distance
            @nearest = [origin, other]
          end
        end
      end
      @separation = closest || 0
      @outliers = @items - @nearest
    end

    # Design notes kept from the original implementation:
    #
    # old idea
    #
    #   1 calculate all distances
    #   2 update distances when a new cluster is created from two existing points
    #   3 keep distances sorted by separation so that we always know which is shortest
    #
    # new idea
    #
    #   don't worry about the lower level clusters
    #   don't worry about the higher level clusters
    #   just form clusters of the desired separation
    #   start by dividing the points into a grid of 0.5 * sep
    #   and put all points in the same grid cells together
    #   ...
    #   and then do regular hierarchical clustering! we should be fine at that point.
    #   sweet....
  end
end