agglomerative_clustering 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -1
- data/README.md +7 -2
- data/agglomerative_clustering.gemspec +2 -2
- data/lib/agglomerative_clustering.rb +1 -0
- data/lib/agglomerative_clustering/distance_matrix.rb +37 -0
- data/lib/agglomerative_clustering/linkage/average.rb +0 -3
- data/lib/agglomerative_clustering/linkage/base.rb +0 -15
- data/lib/agglomerative_clustering/linkage/center.rb +1 -5
- data/lib/agglomerative_clustering/linkage/complete.rb +0 -4
- data/lib/agglomerative_clustering/linkage/single.rb +0 -3
- data/lib/agglomerative_clustering/point.rb +1 -3
- data/lib/agglomerative_clustering/set.rb +50 -18
- data/lib/agglomerative_clustering/version.rb +1 -1
- data/spec/lib/agglomerative_clustering/distance_matrix_spec.rb +18 -0
- data/spec/lib/agglomerative_clustering/set_spec.rb +12 -18
- metadata +6 -8
- data/LICENSE +0 -22
- data/spec/lib/agglomerative_clustering/linkage/base_spec.rb +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6368eb116e76afdf46d5dd16ffeaed04a3d590e8
|
4
|
+
data.tar.gz: 39447a07d51c37d5743dc910bb71ff011bfcd6d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 198911398132736c1cf06117e48e4f0026c94271586e8f172bf72733828f8bf1b218e03659b148663c56b2f8a3ca793e45b335527e90bc51577663221ddb4202
|
7
|
+
data.tar.gz: 5b04529399622524c8d7b36cd8b387c7dae73c83700e0e63455091e6357b39745bb78df0d01e6df0f8e29f0d264f2df89b23a79d9b744f2df393ceb10062249a
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,12 @@
|
|
2
2
|
|
3
3
|
Hierarchical Agglomerative Clustering Algorithm
|
4
4
|
|
5
|
-
Input Set of 3 dimensional points, group into nearest k clusters
|
5
|
+
Input Set of 3 dimensional points, group into nearest k clusters based on Euclidean Distance.
|
6
|
+
Currently the Clustering Algorithm supports 4 different types of Linkage
|
7
|
+
* Single Linkage (Distance between clusters is based on nearest points)
|
8
|
+
* Complete Linkage (Distance between clusters is based on farthest points)
|
9
|
+
* Average Linkage (Distance between clusters is based on average distance of points)
|
10
|
+
* Center Linkage (Distance between clusters is based on center of cluster)
|
6
11
|
|
7
12
|
## Installation
|
8
13
|
|
@@ -21,8 +26,8 @@ Or install it yourself as:
|
|
21
26
|
$ gem install agglomerative_clustering
|
22
27
|
|
23
28
|
## Usage
|
29
|
+
Please see cluster.rb for a sample until I have a chance to write something up here
|
24
30
|
|
25
|
-
TODO: Write usage instructions here
|
26
31
|
|
27
32
|
## Contributing
|
28
33
|
|
@@ -7,9 +7,9 @@ Gem::Specification.new do |spec|
|
|
7
7
|
spec.name = "agglomerative_clustering"
|
8
8
|
spec.version = AgglomerativeClustering::VERSION
|
9
9
|
spec.authors = ["Bryan Mulvihill"]
|
10
|
-
spec.email = ["
|
10
|
+
spec.email = ["mulvihill.bryan@gmail.com"]
|
11
11
|
spec.summary = %q{Ruby Agglomerative Clustering Algorithm}
|
12
|
-
spec.homepage = ""
|
12
|
+
spec.homepage = "https://github.com/bmulvihill/agglomerative_clustering"
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
15
15
|
spec.files = `git ls-files -z`.split("\x0")
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require "agglomerative_clustering/version"
|
2
2
|
require "agglomerative_clustering/euclidean_distance"
|
3
|
+
require "agglomerative_clustering/distance_matrix"
|
3
4
|
require "agglomerative_clustering/linkage/base"
|
4
5
|
require "agglomerative_clustering/linkage/single"
|
5
6
|
require "agglomerative_clustering/linkage/complete"
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module AgglomerativeClustering
|
2
|
+
class DistanceMatrix
|
3
|
+
|
4
|
+
def initialize matrix
|
5
|
+
@matrix_array = matrix.to_a
|
6
|
+
end
|
7
|
+
|
8
|
+
def matrix
|
9
|
+
Matrix.build(matrix_array.size, matrix_array.first.size) do |row, column|
|
10
|
+
matrix_array[row][column]
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def print_matrix
|
15
|
+
puts matrix.to_a.map(&:inspect)
|
16
|
+
end
|
17
|
+
|
18
|
+
def remove_edge index
|
19
|
+
matrix_array.delete_at(index)
|
20
|
+
matrix_array.each { |row| row.delete_at(index) }
|
21
|
+
Matrix.rows(matrix_array)
|
22
|
+
end
|
23
|
+
|
24
|
+
def add_edge weights
|
25
|
+
matrix_array.each_with_index { |row, index| row << weights[index] }
|
26
|
+
matrix_array << weights
|
27
|
+
Matrix.rows(matrix_array)
|
28
|
+
end
|
29
|
+
|
30
|
+
private
|
31
|
+
|
32
|
+
def matrix_array
|
33
|
+
@matrix_array ||= []
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
end
|
@@ -2,21 +2,6 @@ module AgglomerativeClustering
|
|
2
2
|
module Linkage
|
3
3
|
class Base
|
4
4
|
include EuclideanDistance
|
5
|
-
|
6
|
-
def cluster(clusters)
|
7
|
-
min_cluster_dist = 1.0/0
|
8
|
-
clusters.each_with_index do |cluster1, index|
|
9
|
-
clusters[index + 1..clusters.size].each do |cluster2|
|
10
|
-
distance = calculate_distance(cluster1, cluster2)
|
11
|
-
if distance < min_cluster_dist
|
12
|
-
min_cluster_dist = distance
|
13
|
-
@clusters_to_merge = [cluster1, cluster2]
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
17
|
-
clusters_to_merge
|
18
|
-
end
|
19
|
-
|
20
5
|
end
|
21
6
|
end
|
22
7
|
end
|
@@ -10,11 +10,7 @@ module AgglomerativeClustering
|
|
10
10
|
def center_point cluster
|
11
11
|
cluster.points.first.zip(*cluster.points[1..cluster.points.size-1]).map { |a,b| (a + b)/cluster.points.size.to_f }
|
12
12
|
end
|
13
|
-
|
14
|
-
def clusters_to_merge
|
15
|
-
@clusters_to_merge ||= []
|
16
|
-
end
|
17
|
-
|
13
|
+
|
18
14
|
end
|
19
15
|
end
|
20
16
|
end
|
@@ -2,15 +2,17 @@ require 'matrix'
|
|
2
2
|
module AgglomerativeClustering
|
3
3
|
class Set
|
4
4
|
include EuclideanDistance
|
5
|
-
attr_reader :
|
5
|
+
attr_reader :linkage
|
6
6
|
|
7
7
|
def initialize(linkage)
|
8
8
|
@linkage = linkage
|
9
|
-
|
9
|
+
end
|
10
|
+
|
11
|
+
def points
|
12
|
+
@points ||= []
|
10
13
|
end
|
11
14
|
|
12
15
|
def push point
|
13
|
-
point.index = points.size
|
14
16
|
points << point
|
15
17
|
end
|
16
18
|
|
@@ -22,23 +24,28 @@ module AgglomerativeClustering
|
|
22
24
|
@distance_matrix ||= build_distance_matrix
|
23
25
|
end
|
24
26
|
|
25
|
-
def print_distance_matrix
|
26
|
-
puts distance_matrix.to_a.map(&:inspect)
|
27
|
-
end
|
28
|
-
|
29
27
|
def cluster total_clusters
|
30
|
-
clusters_to_merge =[]
|
31
28
|
while clusters.size > total_clusters
|
32
|
-
|
33
|
-
merge_clusters(clusters_to_merge)
|
29
|
+
merge_clusters(shortest_distance)
|
34
30
|
end
|
35
31
|
clusters
|
36
32
|
end
|
37
33
|
|
38
|
-
def merge_clusters
|
39
|
-
|
40
|
-
|
41
|
-
|
34
|
+
def merge_clusters indexes
|
35
|
+
index1, index2 = indexes
|
36
|
+
new_cluster = clusters[index1].merge(clusters[index2])
|
37
|
+
remove_cluster(index1)
|
38
|
+
remove_cluster(index2 - 1)
|
39
|
+
add_cluster(new_cluster)
|
40
|
+
end
|
41
|
+
|
42
|
+
def update_distance_matrix new_cluster
|
43
|
+
distances = []
|
44
|
+
clusters.each do |cluster|
|
45
|
+
distances << linkage.calculate_distance(clusters[new_cluster], cluster)
|
46
|
+
end
|
47
|
+
distance_matrix.add_edge(distances)
|
48
|
+
distance_matrix
|
42
49
|
end
|
43
50
|
|
44
51
|
def outliers
|
@@ -46,10 +53,10 @@ module AgglomerativeClustering
|
|
46
53
|
end
|
47
54
|
|
48
55
|
def find_outliers percentage_of_clusters, distance
|
49
|
-
distance_matrix.each_with_index do |index, row, column|
|
56
|
+
distance_matrix.matrix.each_with_index do |index, row, column|
|
50
57
|
count_hash[row] ||= 0
|
51
|
-
count_hash[row] += 1 if distance_matrix[row, column] > distance
|
52
|
-
set_outliers << points[row] if count_hash[row]/(distance_matrix.row_count - 1) > percentage_of_clusters/100
|
58
|
+
count_hash[row] += 1 if distance_matrix.matrix[row, column] > distance
|
59
|
+
set_outliers << points[row] if count_hash[row]/(distance_matrix.matrix.row_count - 1) > percentage_of_clusters/100
|
53
60
|
end
|
54
61
|
points.reject! { |point| outliers.include?(point) }
|
55
62
|
outliers
|
@@ -57,6 +64,30 @@ module AgglomerativeClustering
|
|
57
64
|
|
58
65
|
private
|
59
66
|
|
67
|
+
def add_cluster new_cluster
|
68
|
+
clusters << new_cluster
|
69
|
+
update_distance_matrix(clusters.size - 1)
|
70
|
+
new_cluster
|
71
|
+
end
|
72
|
+
|
73
|
+
def remove_cluster index
|
74
|
+
clusters.delete_at(index)
|
75
|
+
distance_matrix.remove_edge(index)
|
76
|
+
end
|
77
|
+
|
78
|
+
def shortest_distance
|
79
|
+
min_cluster_dist = 1.0/0
|
80
|
+
indexes = []
|
81
|
+
distance_matrix.matrix.each_with_index do |index, row, column|
|
82
|
+
distance = distance_matrix.matrix[row, column]
|
83
|
+
if distance < min_cluster_dist && distance != 0
|
84
|
+
min_cluster_dist = distance
|
85
|
+
indexes = [row, column]
|
86
|
+
end
|
87
|
+
end
|
88
|
+
indexes
|
89
|
+
end
|
90
|
+
|
60
91
|
def set_outliers
|
61
92
|
@set_outliers ||= []
|
62
93
|
end
|
@@ -66,9 +97,10 @@ module AgglomerativeClustering
|
|
66
97
|
end
|
67
98
|
|
68
99
|
def build_distance_matrix
|
69
|
-
Matrix.build(points.size, points.size) do |row, column|
|
100
|
+
m = Matrix.build(points.size, points.size) do |row, column|
|
70
101
|
euclidean_distance(points[row], points[column]).round(2)
|
71
102
|
end
|
103
|
+
DistanceMatrix.new(m)
|
72
104
|
end
|
73
105
|
|
74
106
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
describe AgglomerativeClustering::DistanceMatrix do
|
2
|
+
|
3
|
+
context '#remove_edge' do
|
4
|
+
it 'will remove edges from the distance matrix' do
|
5
|
+
matrix = AgglomerativeClustering::DistanceMatrix.new(Matrix.empty)
|
6
|
+
matrix.add_edge([1,2])
|
7
|
+
matrix.add_edge([2,2,3])
|
8
|
+
expect(matrix.remove_edge(0)).to eql(Matrix[[2,3]])
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
context '#add_edge' do
|
13
|
+
it 'will add an edges to the distance matrix' do
|
14
|
+
matrix = AgglomerativeClustering::DistanceMatrix.new(Matrix.rows([[1,2,3]]))
|
15
|
+
expect(matrix.add_edge([4,5,6,7])).to eql(Matrix[[1,2,3,4],[4,5,6,7]])
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -2,9 +2,9 @@ describe AgglomerativeClustering::Set do
|
|
2
2
|
|
3
3
|
before do
|
4
4
|
@set = FactoryGirl.build(:set)
|
5
|
-
@point1 = FactoryGirl.build(:point, x:
|
6
|
-
@point2 = FactoryGirl.build(:point, x:
|
7
|
-
@point3 = FactoryGirl.build(:point, x:
|
5
|
+
@point1 = FactoryGirl.build(:point, x:1, y:2, z:3)
|
6
|
+
@point2 = FactoryGirl.build(:point, x:2, y:4, z:1)
|
7
|
+
@point3 = FactoryGirl.build(:point, x:4, y:2, z:2)
|
8
8
|
@point4 = FactoryGirl.build(:point, x:5, y:2, z:3)
|
9
9
|
@set.push(@point1)
|
10
10
|
@set.push(@point2)
|
@@ -14,36 +14,30 @@ describe AgglomerativeClustering::Set do
|
|
14
14
|
|
15
15
|
context '#cluster' do
|
16
16
|
it 'will return clusters of points based on requested number of clusters' do
|
17
|
-
expect(@set.cluster(
|
17
|
+
expect(@set.cluster(2).size).to eql(2)
|
18
18
|
end
|
19
19
|
|
20
20
|
it 'will cluster points that are closest to each other' do
|
21
|
-
@point5 = FactoryGirl.build(:point, x:
|
22
|
-
@point6 = FactoryGirl.build(:point, x:
|
21
|
+
@point5 = FactoryGirl.build(:point, x:6, y:2, z:4)
|
22
|
+
@point6 = FactoryGirl.build(:point, x:7, y:3, z:4)
|
23
23
|
@point7 = FactoryGirl.build(:point, x:15, y:20, z:21)
|
24
|
-
@point8 = FactoryGirl.build(:point, x:
|
25
|
-
@point9 = FactoryGirl.build(:point, x:
|
24
|
+
@point8 = FactoryGirl.build(:point, x:16, y:21, z:21)
|
25
|
+
@point9 = FactoryGirl.build(:point, x:18, y:22, z:21)
|
26
26
|
@set.push(@point5)
|
27
27
|
@set.push(@point6)
|
28
28
|
@set.push(@point7)
|
29
29
|
@set.push(@point8)
|
30
30
|
@set.push(@point9)
|
31
31
|
clusters = @set.cluster(3)
|
32
|
-
clusters
|
33
|
-
|
34
|
-
|
35
|
-
clusters[1].points.each do |point|
|
36
|
-
expect([@point3, @point4, @point5, @point6].include?(point)).to be true
|
37
|
-
end
|
38
|
-
clusters[2].points.each do |point|
|
39
|
-
expect([@point7, @point8, @point9].include?(point)).to be true
|
40
|
-
end
|
32
|
+
points = clusters.map(&:points).each {|cluster| cluster.sort_by!(&:x) }
|
33
|
+
expect([[@point1, @point2],[@point3, @point4, @point5, @point6], [@point7, @point8, @point9]] - points).to eql([])
|
34
|
+
|
41
35
|
end
|
42
36
|
end
|
43
37
|
|
44
38
|
context '#merge_clusters' do
|
45
39
|
it 'will merge two clusters into one and update the distance matrix' do
|
46
|
-
expect(@set.merge_clusters([
|
40
|
+
expect(@set.merge_clusters([0,1]).points).to eql([@point1, @point2])
|
47
41
|
end
|
48
42
|
end
|
49
43
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: agglomerative_clustering
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bryan Mulvihill
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
description:
|
56
56
|
email:
|
57
|
-
-
|
57
|
+
- mulvihill.bryan@gmail.com
|
58
58
|
executables: []
|
59
59
|
extensions: []
|
60
60
|
extra_rdoc_files: []
|
@@ -63,7 +63,6 @@ files:
|
|
63
63
|
- ".rspec"
|
64
64
|
- Gemfile
|
65
65
|
- Gemfile.lock
|
66
|
-
- LICENSE
|
67
66
|
- LICENSE.txt
|
68
67
|
- README.md
|
69
68
|
- Rakefile
|
@@ -71,6 +70,7 @@ files:
|
|
71
70
|
- cluster.rb
|
72
71
|
- lib/agglomerative_clustering.rb
|
73
72
|
- lib/agglomerative_clustering/cluster.rb
|
73
|
+
- lib/agglomerative_clustering/distance_matrix.rb
|
74
74
|
- lib/agglomerative_clustering/euclidean_distance.rb
|
75
75
|
- lib/agglomerative_clustering/linkage/average.rb
|
76
76
|
- lib/agglomerative_clustering/linkage/base.rb
|
@@ -80,21 +80,19 @@ files:
|
|
80
80
|
- lib/agglomerative_clustering/point.rb
|
81
81
|
- lib/agglomerative_clustering/set.rb
|
82
82
|
- lib/agglomerative_clustering/version.rb
|
83
|
-
- outliers.csv
|
84
|
-
- points.csv
|
85
83
|
- spec/factories/lib/agglomerative_clustering/cluster.rb
|
86
84
|
- spec/factories/lib/agglomerative_clustering/point.rb
|
87
85
|
- spec/factories/lib/agglomerative_clustering/set.rb
|
88
86
|
- spec/lib/agglomerative_clustering/cluster_spec.rb
|
87
|
+
- spec/lib/agglomerative_clustering/distance_matrix_spec.rb
|
89
88
|
- spec/lib/agglomerative_clustering/euclidean_distance_spec.rb
|
90
89
|
- spec/lib/agglomerative_clustering/linkage/average_spec.rb
|
91
|
-
- spec/lib/agglomerative_clustering/linkage/base_spec.rb
|
92
90
|
- spec/lib/agglomerative_clustering/linkage/center_spec.rb
|
93
91
|
- spec/lib/agglomerative_clustering/linkage/complete_spec.rb
|
94
92
|
- spec/lib/agglomerative_clustering/linkage/single_spec.rb
|
95
93
|
- spec/lib/agglomerative_clustering/set_spec.rb
|
96
94
|
- spec/spec_helper.rb
|
97
|
-
homepage:
|
95
|
+
homepage: https://github.com/bmulvihill/agglomerative_clustering
|
98
96
|
licenses:
|
99
97
|
- MIT
|
100
98
|
metadata: {}
|
@@ -123,9 +121,9 @@ test_files:
|
|
123
121
|
- spec/factories/lib/agglomerative_clustering/point.rb
|
124
122
|
- spec/factories/lib/agglomerative_clustering/set.rb
|
125
123
|
- spec/lib/agglomerative_clustering/cluster_spec.rb
|
124
|
+
- spec/lib/agglomerative_clustering/distance_matrix_spec.rb
|
126
125
|
- spec/lib/agglomerative_clustering/euclidean_distance_spec.rb
|
127
126
|
- spec/lib/agglomerative_clustering/linkage/average_spec.rb
|
128
|
-
- spec/lib/agglomerative_clustering/linkage/base_spec.rb
|
129
127
|
- spec/lib/agglomerative_clustering/linkage/center_spec.rb
|
130
128
|
- spec/lib/agglomerative_clustering/linkage/complete_spec.rb
|
131
129
|
- spec/lib/agglomerative_clustering/linkage/single_spec.rb
|
data/LICENSE
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
The MIT License (MIT)
|
2
|
-
|
3
|
-
Copyright (c) 2014 Bryan Mulvihill
|
4
|
-
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
7
|
-
in the Software without restriction, including without limitation the rights
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
10
|
-
furnished to do so, subject to the following conditions:
|
11
|
-
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
13
|
-
copies or substantial portions of the Software.
|
14
|
-
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
-
SOFTWARE.
|
22
|
-
|
@@ -1,13 +0,0 @@
|
|
1
|
-
describe AgglomerativeClustering::Linkage::Base do
|
2
|
-
|
3
|
-
context '#cluster' do
|
4
|
-
it 'will return the clusters where min distance is closest' do
|
5
|
-
single_linkage = AgglomerativeClustering::Linkage::Single.new
|
6
|
-
set = FactoryGirl.build(:set)
|
7
|
-
set.push(FactoryGirl.build(:point))
|
8
|
-
set.push(FactoryGirl.build(:point))
|
9
|
-
expect(single_linkage.cluster(set.clusters)).to eql([set.clusters[0], set.clusters[1]])
|
10
|
-
end
|
11
|
-
end
|
12
|
-
|
13
|
-
end
|