agglomerative_clustering 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +3 -1
- data/README.md +7 -2
- data/agglomerative_clustering.gemspec +2 -2
- data/lib/agglomerative_clustering.rb +1 -0
- data/lib/agglomerative_clustering/distance_matrix.rb +37 -0
- data/lib/agglomerative_clustering/linkage/average.rb +0 -3
- data/lib/agglomerative_clustering/linkage/base.rb +0 -15
- data/lib/agglomerative_clustering/linkage/center.rb +1 -5
- data/lib/agglomerative_clustering/linkage/complete.rb +0 -4
- data/lib/agglomerative_clustering/linkage/single.rb +0 -3
- data/lib/agglomerative_clustering/point.rb +1 -3
- data/lib/agglomerative_clustering/set.rb +50 -18
- data/lib/agglomerative_clustering/version.rb +1 -1
- data/spec/lib/agglomerative_clustering/distance_matrix_spec.rb +18 -0
- data/spec/lib/agglomerative_clustering/set_spec.rb +12 -18
- metadata +6 -8
- data/LICENSE +0 -22
- data/spec/lib/agglomerative_clustering/linkage/base_spec.rb +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6368eb116e76afdf46d5dd16ffeaed04a3d590e8
|
4
|
+
data.tar.gz: 39447a07d51c37d5743dc910bb71ff011bfcd6d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 198911398132736c1cf06117e48e4f0026c94271586e8f172bf72733828f8bf1b218e03659b148663c56b2f8a3ca793e45b335527e90bc51577663221ddb4202
|
7
|
+
data.tar.gz: 5b04529399622524c8d7b36cd8b387c7dae73c83700e0e63455091e6357b39745bb78df0d01e6df0f8e29f0d264f2df89b23a79d9b744f2df393ceb10062249a
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,12 @@
|
|
2
2
|
|
3
3
|
Hierarchical Agglomerative Clustering Algorithm
|
4
4
|
|
5
|
-
Input Set of 3 dimensional points, group into nearest k clusters
|
5
|
+
Takes a set of 3-dimensional points as input and groups them into the k nearest clusters based on Euclidean distance.
|
6
|
+
Currently the clustering algorithm supports 4 different types of linkage:
|
7
|
+
* Single Linkage (Distance between clusters is based on nearest points)
|
8
|
+
* Complete Linkage (Distance between clusters is based on farthest points)
|
9
|
+
* Average Linkage (Distance between clusters is based on average distance of points)
|
10
|
+
* Center Linkage (Distance between clusters is based on center of cluster)
|
6
11
|
|
7
12
|
## Installation
|
8
13
|
|
@@ -21,8 +26,8 @@ Or install it yourself as:
|
|
21
26
|
$ gem install agglomerative_clustering
|
22
27
|
|
23
28
|
## Usage
|
29
|
+
Please see cluster.rb for a sample until I have a chance to write something up here
|
24
30
|
|
25
|
-
TODO: Write usage instructions here
|
26
31
|
|
27
32
|
## Contributing
|
28
33
|
|
@@ -7,9 +7,9 @@ Gem::Specification.new do |spec|
|
|
7
7
|
spec.name = "agglomerative_clustering"
|
8
8
|
spec.version = AgglomerativeClustering::VERSION
|
9
9
|
spec.authors = ["Bryan Mulvihill"]
|
10
|
-
spec.email = ["
|
10
|
+
spec.email = ["mulvihill.bryan@gmail.com"]
|
11
11
|
spec.summary = %q{Ruby Agglomerative Clustering Algorithm}
|
12
|
-
spec.homepage = ""
|
12
|
+
spec.homepage = "https://github.com/bmulvihill/agglomerative_clustering"
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
15
15
|
spec.files = `git ls-files -z`.split("\x0")
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require "agglomerative_clustering/version"
|
2
2
|
require "agglomerative_clustering/euclidean_distance"
|
3
|
+
require "agglomerative_clustering/distance_matrix"
|
3
4
|
require "agglomerative_clustering/linkage/base"
|
4
5
|
require "agglomerative_clustering/linkage/single"
|
5
6
|
require "agglomerative_clustering/linkage/complete"
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module AgglomerativeClustering
  # Mutable table of pairwise distances. The values are kept as an array of
  # row arrays so that a row/column pair can be added or removed in place;
  # Matrix objects are only produced as return values.
  class DistanceMatrix

    # matrix - object whose #to_a returns an array of row arrays
    #          (the specs pass a stdlib Matrix).
    def initialize matrix
      @matrix_array = matrix.to_a
    end

    # Builds and returns a new Matrix from the current rows.
    # An empty table yields an empty 0x0 Matrix.
    def matrix
      # With no rows, `matrix_array.first` is nil and `.size` would raise.
      return Matrix.empty(0, 0) if matrix_array.empty?

      Matrix.build(matrix_array.size, matrix_array.first.size) do |row, column|
        matrix_array[row][column]
      end
    end

    # Writes one line per row (each row in Array#inspect form) to stdout.
    def print_matrix
      puts matrix.to_a.map(&:inspect)
    end

    # Deletes row +index+ and then entry +index+ of every remaining row.
    # Returns the resulting table as a Matrix.
    def remove_edge index
      matrix_array.delete_at(index)
      matrix_array.each { |row| row.delete_at(index) }
      Matrix.rows(matrix_array)
    end

    # Appends weights[i] to existing row i, then appends +weights+ as a new
    # last row. Returns the resulting table as a Matrix.
    #
    # weights - Array with one entry per existing row plus one final entry
    #           for the new row's own column.
    def add_edge weights
      matrix_array.each_with_index { |row, index| row << weights[index] }
      # Store a copy: remove_edge mutates rows in place and must not reach
      # back into the caller's array.
      matrix_array << weights.dup
      Matrix.rows(matrix_array)
    end

    private

    # Backing array of rows; falls back to an empty array when unset.
    def matrix_array
      @matrix_array ||= []
    end

  end
end
|
@@ -2,21 +2,6 @@ module AgglomerativeClustering
|
|
2
2
|
module Linkage
|
3
3
|
class Base
|
4
4
|
include EuclideanDistance
|
5
|
-
|
6
|
-
def cluster(clusters)
|
7
|
-
min_cluster_dist = 1.0/0
|
8
|
-
clusters.each_with_index do |cluster1, index|
|
9
|
-
clusters[index + 1..clusters.size].each do |cluster2|
|
10
|
-
distance = calculate_distance(cluster1, cluster2)
|
11
|
-
if distance < min_cluster_dist
|
12
|
-
min_cluster_dist = distance
|
13
|
-
@clusters_to_merge = [cluster1, cluster2]
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
17
|
-
clusters_to_merge
|
18
|
-
end
|
19
|
-
|
20
5
|
end
|
21
6
|
end
|
22
7
|
end
|
@@ -10,11 +10,7 @@ module AgglomerativeClustering
|
|
10
10
|
# Returns the coordinate-wise mean of all points in +cluster+ as an Array of
# Floats.
#
# cluster - object whose #points is a non-empty list of points; each point
#           must respond to #zip (presumably an Array-like [x, y, z] --
#           confirm against Point).
def center_point cluster
  points = cluster.points
  count = points.size.to_f
  # zip groups the i-th coordinate of every point into one tuple; summing the
  # whole tuple (not just its first two entries) keeps the mean correct for
  # clusters of any size, including a single point.
  points.first.zip(*points[1..-1]).map do |coordinates|
    coordinates.inject(:+) / count
  end
end
|
13
|
-
|
14
|
-
def clusters_to_merge
|
15
|
-
@clusters_to_merge ||= []
|
16
|
-
end
|
17
|
-
|
13
|
+
|
18
14
|
end
|
19
15
|
end
|
20
16
|
end
|
@@ -2,15 +2,17 @@ require 'matrix'
|
|
2
2
|
module AgglomerativeClustering
|
3
3
|
class Set
|
4
4
|
include EuclideanDistance
|
5
|
-
attr_reader :
|
5
|
+
attr_reader :linkage
|
6
6
|
|
7
7
|
def initialize(linkage)
|
8
8
|
@linkage = linkage
|
9
|
-
|
9
|
+
end
|
10
|
+
|
11
|
+
def points
|
12
|
+
@points ||= []
|
10
13
|
end
|
11
14
|
|
12
15
|
def push point
|
13
|
-
point.index = points.size
|
14
16
|
points << point
|
15
17
|
end
|
16
18
|
|
@@ -22,23 +24,28 @@ module AgglomerativeClustering
|
|
22
24
|
@distance_matrix ||= build_distance_matrix
|
23
25
|
end
|
24
26
|
|
25
|
-
def print_distance_matrix
|
26
|
-
puts distance_matrix.to_a.map(&:inspect)
|
27
|
-
end
|
28
|
-
|
29
27
|
def cluster total_clusters
|
30
|
-
clusters_to_merge =[]
|
31
28
|
while clusters.size > total_clusters
|
32
|
-
|
33
|
-
merge_clusters(clusters_to_merge)
|
29
|
+
merge_clusters(shortest_distance)
|
34
30
|
end
|
35
31
|
clusters
|
36
32
|
end
|
37
33
|
|
38
|
-
def merge_clusters
|
39
|
-
|
40
|
-
|
41
|
-
|
34
|
+
# Merges the two clusters at the given positions into one, removes both
# originals, and appends the merged cluster.
#
# indexes - two-element Array [index1, index2] of positions in +clusters+;
#           the pair may be given in either order.
#
# Returns whatever add_cluster returns for the merged cluster.
def merge_clusters indexes
  index1, index2 = indexes
  new_cluster = clusters[index1].merge(clusters[index2])
  # Remove the higher position first so the lower one is still valid,
  # regardless of the order in which the pair was supplied.
  [index1, index2].sort.reverse_each { |index| remove_cluster(index) }
  add_cluster(new_cluster)
end
|
41
|
+
|
42
|
+
# Computes the linkage distance from the cluster at position +new_cluster+
# to every cluster (itself included) and adds those distances to the
# distance matrix as a new edge.
#
# new_cluster - position in +clusters+ of the cluster being added
#               (an index, despite the name).
#
# Returns the distance matrix object.
def update_distance_matrix new_cluster
  weights = clusters.map do |other|
    linkage.calculate_distance(clusters[new_cluster], other)
  end
  distance_matrix.add_edge(weights)
  distance_matrix
end
|
43
50
|
|
44
51
|
def outliers
|
@@ -46,10 +53,10 @@ module AgglomerativeClustering
|
|
46
53
|
end
|
47
54
|
|
48
55
|
def find_outliers percentage_of_clusters, distance
|
49
|
-
distance_matrix.each_with_index do |index, row, column|
|
56
|
+
distance_matrix.matrix.each_with_index do |index, row, column|
|
50
57
|
count_hash[row] ||= 0
|
51
|
-
count_hash[row] += 1 if distance_matrix[row, column] > distance
|
52
|
-
set_outliers << points[row] if count_hash[row]/(distance_matrix.row_count - 1) > percentage_of_clusters/100
|
58
|
+
count_hash[row] += 1 if distance_matrix.matrix[row, column] > distance
|
59
|
+
set_outliers << points[row] if count_hash[row]/(distance_matrix.matrix.row_count - 1) > percentage_of_clusters/100
|
53
60
|
end
|
54
61
|
points.reject! { |point| outliers.include?(point) }
|
55
62
|
outliers
|
@@ -57,6 +64,30 @@ module AgglomerativeClustering
|
|
57
64
|
|
58
65
|
private
|
59
66
|
|
67
|
+
# Appends +new_cluster+ to +clusters+ and updates the distance matrix for
# the position it now occupies (the last one).
#
# Returns +new_cluster+.
def add_cluster new_cluster
  clusters.push(new_cluster)
  last_position = clusters.size - 1
  update_distance_matrix(last_position)
  new_cluster
end
|
72
|
+
|
73
|
+
# Deletes the cluster at position +index+ and removes the same position
# from the distance matrix so the two stay aligned.
#
# Returns the result of the distance matrix's remove_edge call.
def remove_cluster index
  clusters.delete_at(index)
  distance_matrix.remove_edge(index)
end
|
77
|
+
|
78
|
+
# Finds the pair of distinct clusters with the smallest distance between
# them.
#
# Returns [row, column] of the first smallest off-diagonal entry in
# row-major order, or [] when the matrix has no off-diagonal entries.
def shortest_distance
  min_cluster_dist = Float::INFINITY
  indexes = []
  # distance_matrix.matrix builds a new Matrix on each call, so take one
  # snapshot and use the element yielded by each_with_index rather than
  # looking every cell up again.
  distance_matrix.matrix.each_with_index do |distance, row, column|
    # Skip only the diagonal (a cluster against itself). A zero distance
    # between two different clusters is a valid, closest-possible pair.
    next if row == column
    next unless distance < min_cluster_dist

    min_cluster_dist = distance
    indexes = [row, column]
  end
  indexes
end
|
90
|
+
|
60
91
|
def set_outliers
|
61
92
|
@set_outliers ||= []
|
62
93
|
end
|
@@ -66,9 +97,10 @@ module AgglomerativeClustering
|
|
66
97
|
end
|
67
98
|
|
68
99
|
def build_distance_matrix
|
69
|
-
Matrix.build(points.size, points.size) do |row, column|
|
100
|
+
m = Matrix.build(points.size, points.size) do |row, column|
|
70
101
|
euclidean_distance(points[row], points[column]).round(2)
|
71
102
|
end
|
103
|
+
DistanceMatrix.new(m)
|
72
104
|
end
|
73
105
|
|
74
106
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
describe AgglomerativeClustering::DistanceMatrix do
|
2
|
+
|
3
|
+
context '#remove_edge' do
|
4
|
+
it 'will remove edges from the distance matrix' do
|
5
|
+
matrix = AgglomerativeClustering::DistanceMatrix.new(Matrix.empty)
|
6
|
+
matrix.add_edge([1,2])
|
7
|
+
matrix.add_edge([2,2,3])
|
8
|
+
expect(matrix.remove_edge(0)).to eql(Matrix[[2,3]])
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
context '#add_edge' do
|
13
|
+
it 'will add an edges to the distance matrix' do
|
14
|
+
matrix = AgglomerativeClustering::DistanceMatrix.new(Matrix.rows([[1,2,3]]))
|
15
|
+
expect(matrix.add_edge([4,5,6,7])).to eql(Matrix[[1,2,3,4],[4,5,6,7]])
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -2,9 +2,9 @@ describe AgglomerativeClustering::Set do
|
|
2
2
|
|
3
3
|
before do
|
4
4
|
@set = FactoryGirl.build(:set)
|
5
|
-
@point1 = FactoryGirl.build(:point, x:
|
6
|
-
@point2 = FactoryGirl.build(:point, x:
|
7
|
-
@point3 = FactoryGirl.build(:point, x:
|
5
|
+
@point1 = FactoryGirl.build(:point, x:1, y:2, z:3)
|
6
|
+
@point2 = FactoryGirl.build(:point, x:2, y:4, z:1)
|
7
|
+
@point3 = FactoryGirl.build(:point, x:4, y:2, z:2)
|
8
8
|
@point4 = FactoryGirl.build(:point, x:5, y:2, z:3)
|
9
9
|
@set.push(@point1)
|
10
10
|
@set.push(@point2)
|
@@ -14,36 +14,30 @@ describe AgglomerativeClustering::Set do
|
|
14
14
|
|
15
15
|
context '#cluster' do
|
16
16
|
it 'will return clusters of points based on requested number of clusters' do
|
17
|
-
expect(@set.cluster(
|
17
|
+
expect(@set.cluster(2).size).to eql(2)
|
18
18
|
end
|
19
19
|
|
20
20
|
it 'will cluster points that are closest to each other' do
|
21
|
-
@point5 = FactoryGirl.build(:point, x:
|
22
|
-
@point6 = FactoryGirl.build(:point, x:
|
21
|
+
@point5 = FactoryGirl.build(:point, x:6, y:2, z:4)
|
22
|
+
@point6 = FactoryGirl.build(:point, x:7, y:3, z:4)
|
23
23
|
@point7 = FactoryGirl.build(:point, x:15, y:20, z:21)
|
24
|
-
@point8 = FactoryGirl.build(:point, x:
|
25
|
-
@point9 = FactoryGirl.build(:point, x:
|
24
|
+
@point8 = FactoryGirl.build(:point, x:16, y:21, z:21)
|
25
|
+
@point9 = FactoryGirl.build(:point, x:18, y:22, z:21)
|
26
26
|
@set.push(@point5)
|
27
27
|
@set.push(@point6)
|
28
28
|
@set.push(@point7)
|
29
29
|
@set.push(@point8)
|
30
30
|
@set.push(@point9)
|
31
31
|
clusters = @set.cluster(3)
|
32
|
-
clusters
|
33
|
-
|
34
|
-
|
35
|
-
clusters[1].points.each do |point|
|
36
|
-
expect([@point3, @point4, @point5, @point6].include?(point)).to be true
|
37
|
-
end
|
38
|
-
clusters[2].points.each do |point|
|
39
|
-
expect([@point7, @point8, @point9].include?(point)).to be true
|
40
|
-
end
|
32
|
+
points = clusters.map(&:points).each {|cluster| cluster.sort_by!(&:x) }
|
33
|
+
expect([[@point1, @point2],[@point3, @point4, @point5, @point6], [@point7, @point8, @point9]] - points).to eql([])
|
34
|
+
|
41
35
|
end
|
42
36
|
end
|
43
37
|
|
44
38
|
context '#merge_clusters' do
|
45
39
|
it 'will merge two clusters into one and update the distance matrix' do
|
46
|
-
expect(@set.merge_clusters([
|
40
|
+
expect(@set.merge_clusters([0,1]).points).to eql([@point1, @point2])
|
47
41
|
end
|
48
42
|
end
|
49
43
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: agglomerative_clustering
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bryan Mulvihill
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
description:
|
56
56
|
email:
|
57
|
-
-
|
57
|
+
- mulvihill.bryan@gmail.com
|
58
58
|
executables: []
|
59
59
|
extensions: []
|
60
60
|
extra_rdoc_files: []
|
@@ -63,7 +63,6 @@ files:
|
|
63
63
|
- ".rspec"
|
64
64
|
- Gemfile
|
65
65
|
- Gemfile.lock
|
66
|
-
- LICENSE
|
67
66
|
- LICENSE.txt
|
68
67
|
- README.md
|
69
68
|
- Rakefile
|
@@ -71,6 +70,7 @@ files:
|
|
71
70
|
- cluster.rb
|
72
71
|
- lib/agglomerative_clustering.rb
|
73
72
|
- lib/agglomerative_clustering/cluster.rb
|
73
|
+
- lib/agglomerative_clustering/distance_matrix.rb
|
74
74
|
- lib/agglomerative_clustering/euclidean_distance.rb
|
75
75
|
- lib/agglomerative_clustering/linkage/average.rb
|
76
76
|
- lib/agglomerative_clustering/linkage/base.rb
|
@@ -80,21 +80,19 @@ files:
|
|
80
80
|
- lib/agglomerative_clustering/point.rb
|
81
81
|
- lib/agglomerative_clustering/set.rb
|
82
82
|
- lib/agglomerative_clustering/version.rb
|
83
|
-
- outliers.csv
|
84
|
-
- points.csv
|
85
83
|
- spec/factories/lib/agglomerative_clustering/cluster.rb
|
86
84
|
- spec/factories/lib/agglomerative_clustering/point.rb
|
87
85
|
- spec/factories/lib/agglomerative_clustering/set.rb
|
88
86
|
- spec/lib/agglomerative_clustering/cluster_spec.rb
|
87
|
+
- spec/lib/agglomerative_clustering/distance_matrix_spec.rb
|
89
88
|
- spec/lib/agglomerative_clustering/euclidean_distance_spec.rb
|
90
89
|
- spec/lib/agglomerative_clustering/linkage/average_spec.rb
|
91
|
-
- spec/lib/agglomerative_clustering/linkage/base_spec.rb
|
92
90
|
- spec/lib/agglomerative_clustering/linkage/center_spec.rb
|
93
91
|
- spec/lib/agglomerative_clustering/linkage/complete_spec.rb
|
94
92
|
- spec/lib/agglomerative_clustering/linkage/single_spec.rb
|
95
93
|
- spec/lib/agglomerative_clustering/set_spec.rb
|
96
94
|
- spec/spec_helper.rb
|
97
|
-
homepage:
|
95
|
+
homepage: https://github.com/bmulvihill/agglomerative_clustering
|
98
96
|
licenses:
|
99
97
|
- MIT
|
100
98
|
metadata: {}
|
@@ -123,9 +121,9 @@ test_files:
|
|
123
121
|
- spec/factories/lib/agglomerative_clustering/point.rb
|
124
122
|
- spec/factories/lib/agglomerative_clustering/set.rb
|
125
123
|
- spec/lib/agglomerative_clustering/cluster_spec.rb
|
124
|
+
- spec/lib/agglomerative_clustering/distance_matrix_spec.rb
|
126
125
|
- spec/lib/agglomerative_clustering/euclidean_distance_spec.rb
|
127
126
|
- spec/lib/agglomerative_clustering/linkage/average_spec.rb
|
128
|
-
- spec/lib/agglomerative_clustering/linkage/base_spec.rb
|
129
127
|
- spec/lib/agglomerative_clustering/linkage/center_spec.rb
|
130
128
|
- spec/lib/agglomerative_clustering/linkage/complete_spec.rb
|
131
129
|
- spec/lib/agglomerative_clustering/linkage/single_spec.rb
|
data/LICENSE
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
The MIT License (MIT)
|
2
|
-
|
3
|
-
Copyright (c) 2014 Bryan Mulvihill
|
4
|
-
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
7
|
-
in the Software without restriction, including without limitation the rights
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
10
|
-
furnished to do so, subject to the following conditions:
|
11
|
-
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
13
|
-
copies or substantial portions of the Software.
|
14
|
-
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
-
SOFTWARE.
|
22
|
-
|
@@ -1,13 +0,0 @@
|
|
1
|
-
describe AgglomerativeClustering::Linkage::Base do
|
2
|
-
|
3
|
-
context '#cluster' do
|
4
|
-
it 'will return the clusters where min distance is closest' do
|
5
|
-
single_linkage = AgglomerativeClustering::Linkage::Single.new
|
6
|
-
set = FactoryGirl.build(:set)
|
7
|
-
set.push(FactoryGirl.build(:point))
|
8
|
-
set.push(FactoryGirl.build(:point))
|
9
|
-
expect(single_linkage.cluster(set.clusters)).to eql([set.clusters[0], set.clusters[1]])
|
10
|
-
end
|
11
|
-
end
|
12
|
-
|
13
|
-
end
|