Empact-hierclust 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +21 -0
- data/Empact-hierclust.gemspec +82 -0
- data/History.txt +47 -0
- data/LICENSE +20 -0
- data/README.rdoc +28 -0
- data/Rakefile +49 -0
- data/VERSION +1 -0
- data/examples/visualize.rb +42 -0
- data/lib/hierclust.rb +6 -0
- data/lib/hierclust/cluster.rb +61 -0
- data/lib/hierclust/clusterer.rb +70 -0
- data/lib/hierclust/distances.rb +47 -0
- data/lib/hierclust/point.rb +53 -0
- data/log/debug.log +0 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +74 -0
- data/spec/hierclust/cluster_spec.rb +90 -0
- data/spec/hierclust/clusterer_spec.rb +208 -0
- data/spec/hierclust/distances_spec.rb +44 -0
- data/spec/hierclust/point_spec.rb +22 -0
- data/spec/hierclust_spec.rb +4 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/rspec.rake +21 -0
- data/tasks/website.rake +17 -0
- data/website/index.txt +82 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +138 -0
- data/website/template.rhtml +48 -0
- metadata +121 -0
data/.gitignore
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{Empact-hierclust}
|
8
|
+
s.version = "0.2.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Brandt Kurowski", "Ben Woosley"]
|
12
|
+
s.date = %q{2010-11-01}
|
13
|
+
s.description = %q{performs hierarchical clustering on points in Euclidian space}
|
14
|
+
s.email = %q{ben.woosley@gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".gitignore",
|
21
|
+
"Empact-hierclust.gemspec",
|
22
|
+
"History.txt",
|
23
|
+
"LICENSE",
|
24
|
+
"README.rdoc",
|
25
|
+
"Rakefile",
|
26
|
+
"VERSION",
|
27
|
+
"examples/visualize.rb",
|
28
|
+
"lib/hierclust.rb",
|
29
|
+
"lib/hierclust/cluster.rb",
|
30
|
+
"lib/hierclust/clusterer.rb",
|
31
|
+
"lib/hierclust/distances.rb",
|
32
|
+
"lib/hierclust/point.rb",
|
33
|
+
"log/debug.log",
|
34
|
+
"script/destroy",
|
35
|
+
"script/generate",
|
36
|
+
"script/txt2html",
|
37
|
+
"spec/hierclust/cluster_spec.rb",
|
38
|
+
"spec/hierclust/clusterer_spec.rb",
|
39
|
+
"spec/hierclust/distances_spec.rb",
|
40
|
+
"spec/hierclust/point_spec.rb",
|
41
|
+
"spec/hierclust_spec.rb",
|
42
|
+
"spec/spec.opts",
|
43
|
+
"spec/spec_helper.rb",
|
44
|
+
"tasks/deployment.rake",
|
45
|
+
"tasks/environment.rake",
|
46
|
+
"tasks/rspec.rake",
|
47
|
+
"tasks/website.rake",
|
48
|
+
"website/index.txt",
|
49
|
+
"website/javascripts/rounded_corners_lite.inc.js",
|
50
|
+
"website/stylesheets/screen.css",
|
51
|
+
"website/template.rhtml"
|
52
|
+
]
|
53
|
+
s.homepage = %q{http://github.com/Empact/hierclust}
|
54
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
55
|
+
s.require_paths = ["lib"]
|
56
|
+
s.rubyforge_project = %q{hierclust}
|
57
|
+
s.rubygems_version = %q{1.3.7}
|
58
|
+
s.summary = %q{performs hierarchical clustering in N dimensions}
|
59
|
+
s.test_files = [
|
60
|
+
"spec/hierclust/cluster_spec.rb",
|
61
|
+
"spec/hierclust/clusterer_spec.rb",
|
62
|
+
"spec/hierclust/distances_spec.rb",
|
63
|
+
"spec/hierclust/point_spec.rb",
|
64
|
+
"spec/hierclust_spec.rb",
|
65
|
+
"spec/spec_helper.rb",
|
66
|
+
"examples/visualize.rb"
|
67
|
+
]
|
68
|
+
|
69
|
+
if s.respond_to? :specification_version then
|
70
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
71
|
+
s.specification_version = 3
|
72
|
+
|
73
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
74
|
+
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
75
|
+
else
|
76
|
+
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
77
|
+
end
|
78
|
+
else
|
79
|
+
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
data/History.txt
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
== 0.2.0 2010-11-01
|
2
|
+
|
3
|
+
* 1 major enhancement:
|
4
|
+
* Clustering & Points now support N dimensions (up from 2)
|
5
|
+
* 1 minor enhancement:
|
6
|
+
* Switch to jeweler
|
7
|
+
|
8
|
+
== 0.1.5 2008-03-21
|
9
|
+
|
10
|
+
* 1 minor enhancement:
|
11
|
+
* added cluster radius
|
12
|
+
* 1 new example script:
|
13
|
+
* demonstrates SVG rendering of points and clusters
|
14
|
+
|
15
|
+
== 0.1.4 2008-02-13
|
16
|
+
|
17
|
+
* 1 minor enhancement:
|
18
|
+
* gave linear-time preclustering an independent "resolution" parameter
|
19
|
+
* 1 bugfix:
|
20
|
+
* corrected cluster coordinate calculation
|
21
|
+
|
22
|
+
== 0.1.3 2008-02-10
|
23
|
+
|
24
|
+
* 1 performance improvement
|
25
|
+
* added linear-time preclustering based on minimum separation distance
|
26
|
+
* 1 major change:
|
27
|
+
* when minimum separation is given, the clusterer will no longer calculate
|
28
|
+
and return clusters smaller than "separation / 2.0"
|
29
|
+
|
30
|
+
== 0.1.2 2008-02-07
|
31
|
+
|
32
|
+
* 1 performance improvement
|
33
|
+
* refactored Distances to be more intelligent about precalculated values
|
34
|
+
* 1 bugfix:
|
35
|
+
* correct degenerate case of Hierclust::Point.points
|
36
|
+
|
37
|
+
== 0.1.1 2008-02-04
|
38
|
+
|
39
|
+
* 1 minor enhancement:
|
40
|
+
* add method for returning flattened list of points in a cluster
|
41
|
+
* 1 bugfix:
|
42
|
+
* correct intermittent failure of Clusterer spec
|
43
|
+
|
44
|
+
== 0.1.0 2008-02-01
|
45
|
+
|
46
|
+
* 1 major enhancement:
|
47
|
+
* Initial release
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2008 Brandt Kurowski <brandt@kurowski.net>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
= Hierclust
|
2
|
+
|
3
|
+
A simple hierarchical clustering library for spatial data.
|
4
|
+
|
5
|
+
== Example
|
6
|
+
|
7
|
+
require 'hierclust'
|
8
|
+
points = (1..6).map { Hierclust::Point.new(rand(10), rand(10)) }
|
9
|
+
clusterer = Hierclust::Clusterer.new(points)
|
10
|
+
puts clusterer.clusters # => [[[(4, 9), (4, 8)], (9, 6)], [[(1, 4), (3, 1)], (6, 3)]]
|
11
|
+
|
12
|
+
== Contact for this fork
|
13
|
+
|
14
|
+
Ben Woosley <ben.woosley@gmail.com>
|
15
|
+
|
16
|
+
== Note on Patches/Pull Requests
|
17
|
+
|
18
|
+
* Fork the project.
|
19
|
+
* Make your feature addition or bug fix.
|
20
|
+
* Add tests for it. This is important so I don't break it in a
|
21
|
+
future version unintentionally.
|
22
|
+
* Commit, do not mess with rakefile, version, or history.
|
23
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
24
|
+
* Send me a pull request. Bonus points for topic branches.
|
25
|
+
|
26
|
+
== Copyright
|
27
|
+
|
28
|
+
Copyright (c) 2010 Brandt Kurowski. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "Empact-hierclust"
|
8
|
+
gem.summary = %Q{performs hierarchical clustering in N dimensions}
|
9
|
+
gem.description = %Q{performs hierarchical clustering on points in Euclidian space}
|
10
|
+
gem.email = "ben.woosley@gmail.com"
|
11
|
+
gem.homepage = "http://github.com/Empact/hierclust"
|
12
|
+
gem.authors = ["Brandt Kurowski", "Ben Woosley"]
|
13
|
+
gem.rubyforge_project = "hierclust"
|
14
|
+
gem.add_development_dependency "rspec", ">= 1.2.9"
|
15
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
16
|
+
end
|
17
|
+
Jeweler::GemcutterTasks.new
|
18
|
+
Jeweler::RubyforgeTasks.new do |rubyforge|
|
19
|
+
rubyforge.doc_task = "rdoc"
|
20
|
+
end
|
21
|
+
rescue LoadError
|
22
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
23
|
+
end
|
24
|
+
|
25
|
+
require 'spec/rake/spectask'
|
26
|
+
Spec::Rake::SpecTask.new(:spec) do |spec|
|
27
|
+
spec.libs << 'lib' << 'spec'
|
28
|
+
spec.spec_files = FileList['spec/**/*_spec.rb']
|
29
|
+
end
|
30
|
+
|
31
|
+
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
32
|
+
spec.libs << 'lib' << 'spec'
|
33
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
34
|
+
spec.rcov = true
|
35
|
+
end
|
36
|
+
|
37
|
+
task :spec => :check_dependencies
|
38
|
+
|
39
|
+
task :default => :spec
|
40
|
+
|
41
|
+
require 'rake/rdoctask'
|
42
|
+
Rake::RDocTask.new do |rdoc|
|
43
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
44
|
+
|
45
|
+
rdoc.rdoc_dir = 'rdoc'
|
46
|
+
rdoc.title = "hierclust #{version}"
|
47
|
+
rdoc.rdoc_files.include('README*')
|
48
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
49
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.2.0
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
begin
|
4
|
+
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
|
5
|
+
require 'hierclust'
|
6
|
+
rescue LoadError
|
7
|
+
require 'rubygems'
|
8
|
+
require 'hierclust'
|
9
|
+
end
|
10
|
+
|
11
|
+
points = (1..20).map { Hierclust::Point.new(rand(800), rand(600)) }
|
12
|
+
clusterer = Hierclust::Clusterer.new(points)
|
13
|
+
|
14
|
+
print %Q{<?xml version="1.0" standalone="no"?>
|
15
|
+
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
|
16
|
+
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
17
|
+
<svg width="800" height="600" version="1.1"
|
18
|
+
xmlns="http://www.w3.org/2000/svg">
|
19
|
+
}
|
20
|
+
|
21
|
+
# Recursively prints SVG <circle> elements for +cluster+ to standard output.
#
# A Hierclust::Cluster is drawn as an unfilled black circle centred on its
# first two coordinates with its own radius, after which each of its items
# is plotted in turn. Anything else is treated as a leaf and drawn as a
# small red dot at its first two coordinates.
def plot(cluster)
  if cluster.kind_of? Hierclust::Cluster
    # stroke-width must be a number: interpolating the items array itself
    # (as this script used to) yields an invalid attribute value, so the
    # number of direct items is used instead.
    # NOTE(review): confirm this is the intended thickness metric.
    print %Q{
  <circle cx="#{cluster.coordinates[0]}" cy="#{cluster.coordinates[1]}" r="#{cluster.radius}"
    fill="none" stroke="black" stroke-width="#{cluster.items.size}"/>
}
    cluster.items.each { |item| plot(item) }
  else
    print %Q{
  <circle cx="#{cluster.coordinates[0]}" cy="#{cluster.coordinates[1]}" r="2"
    fill="red" stroke="none"/>
}
  end
end
|
35
|
+
|
36
|
+
clusterer.clusters.each {|cluster| plot cluster}
|
37
|
+
|
38
|
+
print %Q{
|
39
|
+
<rect x="1" y="1" width="798" height="598"
|
40
|
+
fill="none" stroke="grey" stroke-width="2" />
|
41
|
+
</svg>
|
42
|
+
}
|
data/lib/hierclust.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
module Hierclust
|
2
|
+
# A Cluster represents a collection of Points. A Cluster has its own
|
3
|
+
# coordinates that are the mean of the coordinates of its points.
|
4
|
+
# Because a Cluster has coordinates, it can act as a Point and therefore
|
5
|
+
# be included in other Clusters.
|
6
|
+
class Cluster < Point
|
7
|
+
# An array of items in this cluster
|
8
|
+
attr_accessor :items
|
9
|
+
|
10
|
+
# Create a Cluster for the given set of +items+.
|
11
|
+
def initialize(items)
|
12
|
+
@items = items
|
13
|
+
end
|
14
|
+
|
15
|
+
# Returns the average coordinates of all items in this Cluster.
|
16
|
+
def coordinates
|
17
|
+
return nil if size == 0
|
18
|
+
@coordinates ||= begin
|
19
|
+
coords = self.points.map {|p| p.coordinates }
|
20
|
+
coords = coords.shift.zip(*coords)
|
21
|
+
coords.map {|points| points.inject(0.0) {|sum, p| sum + p } / points.size }
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# Add an +item+ to this Cluster.
|
26
|
+
def <<(item)
|
27
|
+
@coordinates = nil
|
28
|
+
@items << item
|
29
|
+
end
|
30
|
+
|
31
|
+
# Returns the number of items in this Cluster.
|
32
|
+
def size
|
33
|
+
@items.size
|
34
|
+
end
|
35
|
+
|
36
|
+
# Returns the distance from the center of this Cluster to the edge.
#
# * no items   -> +nil+
# * one item   -> 0
# * two items  -> half of (the distance between the items plus both of
#   their radii)
#
# Raises a RuntimeError for more than two items. Previously the two-item
# formula returned unconditionally, which made that error unreachable and
# silently produced a radius based on the first two items only.
def radius
  case @items.size
  when 0 then nil
  when 1 then 0
  when 2
    first, second = @items
    (first.distance_to(second) + first.radius + second.radius) / 2.0
  else
    raise "radius not implemented for clusters with more than two items"
  end
end
|
43
|
+
|
44
|
+
# Returns a flat list of all the points contained in either this cluster
|
45
|
+
# or any of the clusters it contains.
|
46
|
+
def points
|
47
|
+
@items.map {|item| item.points}.flatten
|
48
|
+
end
|
49
|
+
|
50
|
+
# Returns +true+ if this Cluster includes the given +item+, otherwise
|
51
|
+
# returns +false+.
|
52
|
+
def include?(item)
|
53
|
+
@items.include? item
|
54
|
+
end
|
55
|
+
|
56
|
+
# Returns a legible representation of this Cluster and its items.
|
57
|
+
def to_s
|
58
|
+
"[#{@items.join(', ')}]"
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
module Hierclust
|
2
|
+
# Performs hierarchical clustering over a set of Points. Clustering ends
# when a single top-level item remains, or earlier once the closest pair of
# remaining items is farther apart than a requested minimum separation.
class Clusterer
  # The Distances most recently computed for the items being clustered
  attr_reader :distances

  # Builds a Clusterer for +data+.
  #
  # +separation+, when given, stops clustering as soon as the nearest
  # remaining items are more than +separation+ units apart.
  #
  # +resolution+, when given together with +separation+, enables a
  # "pre-clustering" pass: points are first bucketed purely by coordinates
  # into grid cells +resolution+ units wide, and the resulting clusters are
  # then hierarchically clustered as usual.
  def initialize(data, separation = nil, resolution = nil)
    @separation = separation
    @resolution = resolution
    @data = precluster(data)
    @distances = Distances.new(@data)
  end

  # Runs the clustering and returns the resulting top-level items.
  def clusters
    return @data if separation_reached?

    until @data.length <= 1
      @distances = Distances.new(@data)
      break if separation_reached?
      @data = find_cluster
    end
    @data
  end

  private

  # True when a minimum separation was requested and the closest pair in
  # the current distances is already farther apart than that.
  def separation_reached?
    @separation && @distances.separation > @separation
  end

  # Performs one merge step: the nearest pair becomes a new Cluster and all
  # remaining items are carried over unchanged.
  def find_cluster
    return [] if @data.empty?
    return [Cluster.new(@data)] if @data.length <= 2

    [Cluster.new(@distances.nearest), *@distances.outliers]
  end

  # Buckets +points+ into grid cells of side +@resolution+ and returns one
  # Cluster per occupied cell.
  def precluster(points)
    # Pre-clustering needs both a resolution and a minimum separation;
    # without them the points are used as they are (copied).
    return points.dup unless @resolution && @separation

    # A separation of zero means everything belongs to one cluster.
    return [Cluster.new(points)] if @separation == 0

    cells = {}
    points.each do |point|
      cell = point.coordinates.map { |coord| (coord / @resolution).floor }
      (cells[cell] ||= Cluster.new([])) << point
    end
    cells.values
  end
end
|
70
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Hierclust
|
2
|
+
# Represents the pair-wise distances between a set of items.
#
# Every item must respond to +distance_to(other)+. On construction all
# pairs are examined once and the closest pair is recorded.
#
# Design notes kept from the original source:
#
# old idea
#
#   1 calculate all distances
#   2 update distances when a new cluster is created from two existing points
#   3 keep distances sorted by separation so that we always know which is shortest
#
# new idea
#
#   don't worry about the lower level clusters
#   don't worry about the higher level clusters
#   just form clusters of the desired separation
#   start by dividing the points into a grid of 0.5 * sep
#   and put all points in the same grid cells together
#   ...
#   and then do regular hierarchical clustering! we should be fine at that point.
#   sweet....
class Distances
  # nearest::    the closest pair as [origin, other]; [] with fewer than two items
  # outliers::   the given items minus the nearest pair
  # separation:: the distance between the nearest pair; 0 with fewer than two items
  attr_reader :nearest, :outliers, :separation

  # Create a new Distances for the given +items+
  def initialize(items)
    @items = items
    @separation = 0
    @nearest = []
    # combination(2) visits each unordered pair once, in the same order as
    # the original shift-and-scan loop.
    @items.combination(2) do |origin, other|
      distance = origin.distance_to(other)
      # "No pair recorded yet" is detected through @nearest rather than
      # @separation == 0, so a genuinely coincident pair (distance 0) is
      # kept instead of being overwritten by the next, farther pair.
      if @nearest.empty? || distance < @separation
        @separation = distance
        @nearest = [origin, other]
      end
    end
    @outliers = @items - @nearest
  end
end
|
47
|
+
end
|