Empact-hierclust 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +21 -0
- data/Empact-hierclust.gemspec +82 -0
- data/History.txt +47 -0
- data/LICENSE +20 -0
- data/README.rdoc +28 -0
- data/Rakefile +49 -0
- data/VERSION +1 -0
- data/examples/visualize.rb +42 -0
- data/lib/hierclust.rb +6 -0
- data/lib/hierclust/cluster.rb +61 -0
- data/lib/hierclust/clusterer.rb +70 -0
- data/lib/hierclust/distances.rb +47 -0
- data/lib/hierclust/point.rb +53 -0
- data/log/debug.log +0 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +74 -0
- data/spec/hierclust/cluster_spec.rb +90 -0
- data/spec/hierclust/clusterer_spec.rb +208 -0
- data/spec/hierclust/distances_spec.rb +44 -0
- data/spec/hierclust/point_spec.rb +22 -0
- data/spec/hierclust_spec.rb +4 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/rspec.rake +21 -0
- data/tasks/website.rake +17 -0
- data/website/index.txt +82 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +138 -0
- data/website/template.rhtml +48 -0
- metadata +121 -0
data/.gitignore
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{Empact-hierclust}
|
8
|
+
s.version = "0.2.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Brandt Kurowski", "Ben Woosley"]
|
12
|
+
s.date = %q{2010-11-01}
|
13
|
+
s.description = %q{performs hierarchical clustering on points in Euclidian space}
|
14
|
+
s.email = %q{ben.woosley@gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".gitignore",
|
21
|
+
"Empact-hierclust.gemspec",
|
22
|
+
"History.txt",
|
23
|
+
"LICENSE",
|
24
|
+
"README.rdoc",
|
25
|
+
"Rakefile",
|
26
|
+
"VERSION",
|
27
|
+
"examples/visualize.rb",
|
28
|
+
"lib/hierclust.rb",
|
29
|
+
"lib/hierclust/cluster.rb",
|
30
|
+
"lib/hierclust/clusterer.rb",
|
31
|
+
"lib/hierclust/distances.rb",
|
32
|
+
"lib/hierclust/point.rb",
|
33
|
+
"log/debug.log",
|
34
|
+
"script/destroy",
|
35
|
+
"script/generate",
|
36
|
+
"script/txt2html",
|
37
|
+
"spec/hierclust/cluster_spec.rb",
|
38
|
+
"spec/hierclust/clusterer_spec.rb",
|
39
|
+
"spec/hierclust/distances_spec.rb",
|
40
|
+
"spec/hierclust/point_spec.rb",
|
41
|
+
"spec/hierclust_spec.rb",
|
42
|
+
"spec/spec.opts",
|
43
|
+
"spec/spec_helper.rb",
|
44
|
+
"tasks/deployment.rake",
|
45
|
+
"tasks/environment.rake",
|
46
|
+
"tasks/rspec.rake",
|
47
|
+
"tasks/website.rake",
|
48
|
+
"website/index.txt",
|
49
|
+
"website/javascripts/rounded_corners_lite.inc.js",
|
50
|
+
"website/stylesheets/screen.css",
|
51
|
+
"website/template.rhtml"
|
52
|
+
]
|
53
|
+
s.homepage = %q{http://github.com/Empact/hierclust}
|
54
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
55
|
+
s.require_paths = ["lib"]
|
56
|
+
s.rubyforge_project = %q{hierclust}
|
57
|
+
s.rubygems_version = %q{1.3.7}
|
58
|
+
s.summary = %q{performs hierarchical clustering in N dimensions}
|
59
|
+
s.test_files = [
|
60
|
+
"spec/hierclust/cluster_spec.rb",
|
61
|
+
"spec/hierclust/clusterer_spec.rb",
|
62
|
+
"spec/hierclust/distances_spec.rb",
|
63
|
+
"spec/hierclust/point_spec.rb",
|
64
|
+
"spec/hierclust_spec.rb",
|
65
|
+
"spec/spec_helper.rb",
|
66
|
+
"examples/visualize.rb"
|
67
|
+
]
|
68
|
+
|
69
|
+
if s.respond_to? :specification_version then
|
70
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
71
|
+
s.specification_version = 3
|
72
|
+
|
73
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
74
|
+
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
75
|
+
else
|
76
|
+
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
77
|
+
end
|
78
|
+
else
|
79
|
+
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
data/History.txt
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
== 0.2.0 2010-11-01
|
2
|
+
|
3
|
+
* 1 major enhancement:
|
4
|
+
* Clustering & Points now support N dimensions (up from 2)
|
5
|
+
* 1 minor enhancement:
|
6
|
+
* Switch to jeweler
|
7
|
+
|
8
|
+
== 0.1.5 2008-03-21
|
9
|
+
|
10
|
+
* 1 minor enhancement:
|
11
|
+
* added cluster radius
|
12
|
+
* 1 new example script:
|
13
|
+
* demonstrates SVG rendering of points and clusters
|
14
|
+
|
15
|
+
== 0.1.4 2008-02-13
|
16
|
+
|
17
|
+
* 1 minor enhancement:
|
18
|
+
* gave linear-time preclustering an independent "resolution" parameter
|
19
|
+
* 1 bugfix:
|
20
|
+
* corrected cluster coordinate calculation
|
21
|
+
|
22
|
+
== 0.1.3 2008-02-10
|
23
|
+
|
24
|
+
* 1 performance improvement
|
25
|
+
* added linear-time preclustering based on minimum separation distance
|
26
|
+
* 1 major change:
|
27
|
+
* when minimum separation is given, the clusterer will no longer calculate
|
28
|
+
and return clusters smaller than "separation / 2.0"
|
29
|
+
|
30
|
+
== 0.1.2 2008-02-07
|
31
|
+
|
32
|
+
* 1 performance improvement
|
33
|
+
* refactored Distances to be more intelligent about precalculated values
|
34
|
+
* 1 bugfix:
|
35
|
+
* correct degenerate case of Hierclust::Point.points
|
36
|
+
|
37
|
+
== 0.1.1 2008-02-04
|
38
|
+
|
39
|
+
* 1 minor enhancement:
|
40
|
+
* add method for returning flattened list of points in a cluster
|
41
|
+
* 1 bugfix:
|
42
|
+
* correct intermittent failure of Clusterer spec
|
43
|
+
|
44
|
+
== 0.1.0 2008-02-01
|
45
|
+
|
46
|
+
* 1 major enhancement:
|
47
|
+
* Initial release
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2008 Brandt Kurowski <brandt@kurowski.net>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
= Hierclust
|
2
|
+
|
3
|
+
A simple hierarchical clustering library for spatial data.
|
4
|
+
|
5
|
+
== Example
|
6
|
+
|
7
|
+
require 'hierclust'
|
8
|
+
points = (1..6).map { Hierclust::Point.new(rand(10), rand(10)) }
|
9
|
+
clusterer = Hierclust::Clusterer.new(points)
|
10
|
+
puts clusterer.clusters # => [[[(4, 9), (4, 8)], (9, 6)], [[(1, 4), (3, 1)], (6, 3)]]
|
11
|
+
|
12
|
+
== Contact for this fork
|
13
|
+
|
14
|
+
Ben Woosley <ben.woosley@gmail.com>
|
15
|
+
|
16
|
+
== Note on Patches/Pull Requests
|
17
|
+
|
18
|
+
* Fork the project.
|
19
|
+
* Make your feature addition or bug fix.
|
20
|
+
* Add tests for it. This is important so I don't break it in a
|
21
|
+
future version unintentionally.
|
22
|
+
* Commit, do not mess with rakefile, version, or history.
|
23
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
24
|
+
* Send me a pull request. Bonus points for topic branches.
|
25
|
+
|
26
|
+
== Copyright
|
27
|
+
|
28
|
+
Copyright (c) 2010 Brandt Kurowski. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Build, test and documentation tasks for the Empact-hierclust gem.

require 'rubygems'
require 'rake'

# Packaging/release tasks are provided by Jeweler. Jeweler is optional: when it
# is not installed we print a hint and carry on so the remaining tasks still load.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "Empact-hierclust"
    gem.summary = %Q{performs hierarchical clustering in N dimensions}
    gem.description = %Q{performs hierarchical clustering on points in Euclidian space}
    gem.email = "ben.woosley@gmail.com"
    gem.homepage = "http://github.com/Empact/hierclust"
    gem.authors = ["Brandt Kurowski", "Ben Woosley"]
    gem.rubyforge_project = "hierclust"
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
  Jeweler::RubyforgeTasks.new do |rubyforge|
    rubyforge.doc_task = "rdoc"
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'spec/rake/spectask'

# Runs every spec under spec/.
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Runs every spec under spec/ with rcov coverage enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# check_dependencies is only defined when Jeweler loaded above. Adding it as a
# prerequisite unconditionally makes `rake spec` (and the default task) abort
# with an unknown-task error on machines without jeweler, which contradicts the
# rescue above that deliberately tolerates a missing jeweler.
task :spec => :check_dependencies if Rake::Task.task_defined?('check_dependencies')

task :default => :spec

require 'rake/rdoctask'

# Generates RDoc for the README and the library sources into rdoc/.
Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "hierclust #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.2.0
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Example: clusters 20 random points and prints the resulting hierarchy to
# standard output as an SVG document. Points are drawn as small red dots and
# every cluster as a black circle around its coordinates.

# Prefer the library from this checkout; fall back to an installed gem.
begin
  $LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
  require 'hierclust'
rescue LoadError
  require 'rubygems'
  require 'hierclust'
end

points = (1..20).map { Hierclust::Point.new(rand(800), rand(600)) }
clusterer = Hierclust::Clusterer.new(points)

print %Q{<?xml version="1.0" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg width="800" height="600" version="1.1"
xmlns="http://www.w3.org/2000/svg">
}

# Prints the SVG for +cluster+ and, recursively, for everything it contains.
# Only the first two coordinates are used, so the example assumes 2-D points.
def plot(cluster)
  if cluster.kind_of? Hierclust::Cluster
    # stroke-width must be a number. The items array itself was interpolated
    # here before, which produced an invalid attribute value; the number of
    # direct items is used instead.
    print %Q{
<circle cx="#{cluster.coordinates[0]}" cy="#{cluster.coordinates[1]}" r="#{cluster.radius}"
fill="none" stroke="black" stroke-width="#{cluster.size}"/>
}
    cluster.items.each {|item| plot(item)}
  else
    print %Q{
<circle cx="#{cluster.coordinates[0]}" cy="#{cluster.coordinates[1]}" r="2"
fill="red" stroke="none"/>
}
  end
end

clusterer.clusters.each {|cluster| plot cluster}

# Frame around the drawing area, then close the document.
print %Q{
<rect x="1" y="1" width="798" height="598"
fill="none" stroke="grey" stroke-width="2" />
</svg>
}
|
data/lib/hierclust.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
module Hierclust
  # A Cluster represents a collection of Points. A Cluster has its own
  # coordinates that are the mean of the coordinates of its points.
  # Because a Cluster has coordinates, it can act as a Point and therefore
  # be included in other Clusters.
  class Cluster < Point
    # An array of items in this cluster
    attr_accessor :items

    # Create a Cluster for the given set of +items+. The array is stored as
    # given (not copied), so items added with << end up in the caller's array.
    def initialize(items)
      @items = items
    end

    # Returns the average coordinates of all points in this Cluster (one value
    # per dimension), or +nil+ when the Cluster has no items. The result is
    # cached until an item is added with <<.
    def coordinates
      return nil if size == 0
      @coordinates ||= begin
        per_point = self.points.map {|p| p.coordinates }
        # transpose: one array of values per dimension
        per_dimension = per_point.shift.zip(*per_point)
        per_dimension.map {|values| values.inject(0.0) {|sum, v| sum + v } / values.size }
      end
    end

    # Add an +item+ to this Cluster.
    def <<(item)
      @coordinates = nil # cached mean is stale once the membership changes
      @items << item
    end

    # Returns the number of items in this Cluster.
    def size
      @items.size
    end

    # Returns the distance from the center of this Cluster to the edge.
    #
    # * no items: +nil+
    # * one item: 0
    # * two items: half of the distance between them plus their radii
    # * more items: the largest "distance from this Cluster's coordinates to
    #   the item, plus the item's radius", i.e. a radius that covers every item
    #
    # Previously the two-item formula was returned unconditionally, so larger
    # clusters (such as the ones built by preclustering) silently ignored all
    # but their first two items and the "not implemented" raise that followed
    # could never run.
    def radius
      case @items.size
      when 0
        nil
      when 1
        0
      when 2
        first, second = @items
        (first.distance_to(second) + first.radius + second.radius) / 2.0
      else
        @items.map {|item| distance_to(item) + item.radius }.max
      end
    end

    # Returns a flat list of all the points contained in either this cluster
    # or any of the clusters it contains.
    def points
      @items.map {|item| item.points}.flatten
    end

    # Returns +true+ if this Cluster includes the given +item+, otherwise
    # returns +false+. Only direct items are considered.
    def include?(item)
      @items.include? item
    end

    # Returns a legible representation of this Cluster and its items.
    def to_s
      "[#{@items.join(', ')}]"
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
module Hierclust
  # Hierarchically clusters a set of Points. Clustering ends when a single
  # top-level cluster remains or, if a minimum separation was requested, as
  # soon as the closest remaining items are farther apart than that.
  class Clusterer
    # The Distances most recently computed for the items being clustered
    attr_reader :distances

    # Create a new Clusterer for the given data.
    #
    # +separation+ (optional) stops the clustering process once all the
    # items are more than +separation+ units apart.
    #
    # +resolution+ (optional, only used together with +separation+) is the
    # cell size of a grid: points falling into the same grid cell are put into
    # one cluster up front, purely based on their coordinates, and those
    # clusters are then hierarchically clustered as normal.
    def initialize(data, separation = nil, resolution = nil)
      @separation = separation
      @resolution = resolution
      @data = precluster(data)
      @distances = Distances.new(@data)
    end

    # Calculates and returns the set of clusters.
    def clusters
      return @data if separated?
      until @data.length <= 1
        @distances = Distances.new(@data)
        return @data if separated?
        @data = find_cluster
      end
      @data
    end

    private

    # Truthy when a minimum separation was requested and the closest pair of
    # remaining items is already farther apart than that.
    def separated?
      @separation && @distances.separation > @separation
    end

    # Performs one clustering step: merges the nearest pair into a Cluster and
    # returns it together with the untouched remaining items.
    def find_cluster
      return [] if @data.empty?
      return [Cluster.new(@data)] if @data.length <= 2
      [Cluster.new(@distances.nearest), *@distances.outliers]
    end

    # Groups +points+ into grid cells of size +resolution+.
    def precluster(points)
      # without both a resolution and a minimum separation there is nothing
      # to precluster; work on a copy of the input
      return points.dup unless @resolution && @separation
      # a separation of zero means everything belongs to one cluster
      return [Cluster.new(points)] if @separation == 0

      cells = {}
      points.each do |point|
        cell = point.coordinates.map {|coord| (coord / @resolution).floor }
        cells[cell] ||= Cluster.new([])
        cells[cell] << point
      end
      cells.values
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Hierclust
  # Represents the pair-wise distances between a set of items.
  #
  # After construction:
  # * +nearest+ holds the two items with the smallest pair-wise distance
  #   (the first such pair on ties; empty when fewer than two items are given)
  # * +outliers+ holds the items that are not part of +nearest+
  # * +separation+ is the distance between the +nearest+ pair (0 when there
  #   is no pair)
  class Distances
    attr_reader :nearest, :outliers, :separation

    # Create a new Distances for the given +items+. Every item must respond
    # to +distance_to+.
    def initialize(items)
      @items = items
      @nearest = []
      # nil means "no pair seen yet". Using 0 for that (as before) made a
      # genuine distance of 0 between duplicate points look unset, so the
      # next, farther pair replaced the true nearest pair.
      smallest = nil
      @items.combination(2) do |origin, other|
        distance = origin.distance_to(other)
        if smallest.nil? || distance < smallest
          smallest = distance
          @nearest = [origin, other]
        end
      end
      @separation = smallest || 0
      @outliers = @items - @nearest
    end

    # Design notes kept from the original source:
    #
    # old idea
    #
    # 1 calculate all distances
    # 2 update distances when a new cluster is created from two existing points
    # 3 keep distances sorted by separation so that we always know which is shortest
    #
    # new idea
    #
    # don't worry about the lower level clusters
    # don't worry about the higher level clusters
    # just form clusters of the desired separation
    # start by dividing the points into a grid of 0.5 * sep
    # and put all points in the same grid cells together
    # ...
    # and then do regular hierarchical clustering! we should be fine at that point.
    # sweet....
  end
end
|