agglomerative_clustering 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/cluster.rb +4 -0
- data/lib/agglomerative_clustering/distance_matrix.rb +23 -6
- data/lib/agglomerative_clustering/set.rb +17 -31
- data/lib/agglomerative_clustering/silhouette_coefficient.rb +38 -0
- data/lib/agglomerative_clustering/version.rb +1 -1
- data/lib/agglomerative_clustering.rb +1 -0
- data/spec/lib/agglomerative_clustering/distance_matrix_spec.rb +8 -1
- data/spec/lib/agglomerative_clustering/set_spec.rb +0 -6
- data/spec/lib/agglomerative_clustering/silhouette_coefficient_spec.rb +24 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 03755a284a11c8225365f9b22df38da433e07994
|
4
|
+
data.tar.gz: 6996369a3b16c5541cd694a8315c2c40606b91e3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1061fb7aae2bc9c6cee7ba054593331a6e9db4c5388d895f821aad46b6d05d86721f61c947ce5377095ff7dc3adf8f93494ce9a3697f44357521d5ecc97e8523
|
7
|
+
data.tar.gz: f481ee1c97df283e3a354ed447319fbf9041d5507ac677269288e9c7802eb743510e016a6a0fca9a5232eb0a98420a1c393e1a08c9a52a0dfac91fcc21d91fd7
|
data/cluster.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'matrix'
|
1
2
|
module AgglomerativeClustering
|
2
3
|
class DistanceMatrix
|
3
4
|
|
@@ -6,9 +7,7 @@ module AgglomerativeClustering
|
|
6
7
|
end
|
7
8
|
|
8
9
|
def matrix
|
9
|
-
|
10
|
-
matrix_array[row][column]
|
11
|
-
end
|
10
|
+
@matrix ||= build_matrix
|
12
11
|
end
|
13
12
|
|
14
13
|
def print_matrix
|
@@ -18,20 +17,38 @@ module AgglomerativeClustering
|
|
18
17
|
# Removes the node at +index+ from the distance data by deleting its row and
# its column from the backing rows array.
#
# The memoized Matrix is discarded afterwards so that the next call to
# #matrix rebuilds it from the updated rows instead of returning a stale
# copy that still contains the removed node.
#
# index - Integer position of the row and column to delete.
#
# Returns the backing Array of rows after the removal.
def remove_edge index
  matrix_array.delete_at(index)
  matrix_array.each { |row| row.delete_at(index) }
  # Invalidate lazily; #matrix re-creates the Matrix on demand.
  @matrix = nil
  matrix_array
end
|
23
21
|
|
24
22
|
def add_edge weights
|
25
23
|
matrix_array.each_with_index { |row, index| row << weights[index] }
|
26
24
|
matrix_array << weights
|
27
|
-
|
25
|
+
@matrix = build_matrix
|
26
|
+
end
|
27
|
+
|
28
|
+
# Finds the pair of distinct nodes with the smallest distance between them.
#
# Every cell of #matrix is visited in row-major order. Diagonal cells
# (row == column) are ignored, and the first cell holding the smallest value
# wins, so ties resolve to the earliest [row, column] pair.
#
# Returns a two-element Array [row, column], or an empty Array when the
# matrix has no off-diagonal cells.
def shortest_distance
  closest = []
  min_dist = Float::INFINITY
  # Matrix#each_with_index yields the cell value first, then its coordinates.
  matrix.each_with_index do |distance, row, column|
    next if row == column
    next unless distance < min_dist
    min_dist = distance
    closest = [row, column]
  end
  closest
end
|
29
|
-
|
40
|
+
|
30
41
|
private
|
31
42
|
|
32
43
|
def matrix_array
|
33
44
|
@matrix_array ||= []
|
34
45
|
end
|
35
46
|
|
47
|
+
# Builds a Matrix snapshot from the current rows in matrix_array.
#
# The column count is taken from the first row. When there are no rows the
# result is an empty 0x0 Matrix rather than a NoMethodError from asking a
# missing first row for its size.
#
# Returns a new Matrix.
def build_matrix
  row_count = matrix_array.size
  column_count = matrix_array.empty? ? 0 : matrix_array.first.size
  Matrix.build(row_count, column_count) do |row, column|
    matrix_array[row][column]
  end
end
|
52
|
+
|
36
53
|
end
|
37
54
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'matrix'
|
2
1
|
module AgglomerativeClustering
|
3
2
|
class Set
|
4
3
|
include EuclideanDistance
|
@@ -26,11 +25,27 @@ module AgglomerativeClustering
|
|
26
25
|
|
27
26
|
def cluster total_clusters
|
28
27
|
while clusters.size > total_clusters
|
29
|
-
merge_clusters(shortest_distance)
|
28
|
+
merge_clusters(distance_matrix.shortest_distance)
|
30
29
|
end
|
31
30
|
clusters
|
32
31
|
end
|
33
32
|
|
33
|
+
def outliers
|
34
|
+
set_outliers.uniq
|
35
|
+
end
|
36
|
+
|
37
|
+
# Flags points that lie farther than +distance+ from more than
# +percentage_of_clusters+ percent of the other points, removes them from
# points and returns them.
#
# The ratio and the threshold are computed with Float arithmetic. Integer
# division would truncate both sides to 0 or 1, so a point would only be
# flagged when it is far from every other point, regardless of the
# percentage given.
#
# With a single row the ratio is 0 / 0.0 (NaN), which never exceeds the
# threshold, so a lone point is not reported as an outlier.
#
# percentage_of_clusters - Numeric percentage (0-100) of other points that
#                          must be farther away than +distance+.
# distance               - Numeric distance threshold.
#
# Returns the Array of unique outlier points.
def find_outliers percentage_of_clusters, distance
  distances = distance_matrix.matrix
  neighbour_count = (distances.row_count - 1).to_f
  threshold = percentage_of_clusters / 100.0
  distances.each_with_index do |cell, row, _column|
    count_hash[row] ||= 0
    count_hash[row] += 1 if cell > distance
    # May push the same point more than once; #outliers de-duplicates.
    set_outliers << points[row] if count_hash[row] / neighbour_count > threshold
  end
  found = outliers
  points.reject! { |point| found.include?(point) }
  found
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
34
49
|
def merge_clusters indexes
|
35
50
|
index1, index2 = indexes
|
36
51
|
new_cluster = clusters[index1].merge(clusters[index2])
|
@@ -48,22 +63,6 @@ module AgglomerativeClustering
|
|
48
63
|
distance_matrix
|
49
64
|
end
|
50
65
|
|
51
|
-
def outliers
|
52
|
-
set_outliers.uniq
|
53
|
-
end
|
54
|
-
|
55
|
-
def find_outliers percentage_of_clusters, distance
|
56
|
-
distance_matrix.matrix.each_with_index do |index, row, column|
|
57
|
-
count_hash[row] ||= 0
|
58
|
-
count_hash[row] += 1 if distance_matrix.matrix[row, column] > distance
|
59
|
-
set_outliers << points[row] if count_hash[row]/(distance_matrix.matrix.row_count - 1) > percentage_of_clusters/100
|
60
|
-
end
|
61
|
-
points.reject! { |point| outliers.include?(point) }
|
62
|
-
outliers
|
63
|
-
end
|
64
|
-
|
65
|
-
private
|
66
|
-
|
67
66
|
def add_cluster new_cluster
|
68
67
|
clusters << new_cluster
|
69
68
|
update_distance_matrix(clusters.size - 1)
|
@@ -75,19 +74,6 @@ module AgglomerativeClustering
|
|
75
74
|
distance_matrix.remove_edge(index)
|
76
75
|
end
|
77
76
|
|
78
|
-
def shortest_distance
|
79
|
-
min_cluster_dist = 1.0/0
|
80
|
-
indexes = []
|
81
|
-
distance_matrix.matrix.each_with_index do |index, row, column|
|
82
|
-
distance = distance_matrix.matrix[row, column]
|
83
|
-
if distance < min_cluster_dist && distance != 0
|
84
|
-
min_cluster_dist = distance
|
85
|
-
indexes = [row, column]
|
86
|
-
end
|
87
|
-
end
|
88
|
-
indexes
|
89
|
-
end
|
90
|
-
|
91
77
|
def set_outliers
|
92
78
|
@set_outliers ||= []
|
93
79
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module AgglomerativeClustering
  # Scores how well the points of one cluster fit that cluster compared with
  # the closest other cluster (silhouette coefficient).
  class SilhouetteCoefficient
    include EuclideanDistance
    attr_reader :main_cluster

    # main_cluster - the cluster to score; its #points are read.
    def initialize main_cluster
      @main_cluster = main_cluster
    end

    # Computes the mean silhouette value over the points of main_cluster.
    #
    # For each point, a is the mean distance to the other members of
    # main_cluster (see #calculate_a1), b is the smallest mean distance to
    # the points of any other cluster, and the silhouette is
    # (b - a) / max(a, b).
    #
    # b is derived from that point's own distances only; averages computed
    # for earlier points are not carried over into the minimum.
    #
    # clusters - Array of clusters; main_cluster itself is excluded from the
    #            comparison if present.
    #
    # Returns the mean silhouette value.
    # Raises ArgumentError when there is no cluster other than main_cluster.
    def measure clusters
      rivals = clusters - [main_cluster]
      if rivals.empty?
        raise ArgumentError, 'clusters must contain at least one cluster other than main_cluster'
      end

      silhouettes = main_cluster.points.map do |point|
        a1 = calculate_a1(point)
        b1 = rivals.map { |cluster| average_distance(point, cluster.points) }.min
        silhouette(a1, b1)
      end
      silhouettes.inject(:+) / silhouettes.size
    end

    # Mean distance from +point+ to the other members of main_cluster.
    #
    # The sum runs over every member, including +point+ itself (distance 0),
    # and is divided by the member count minus one, so +point+ is expected to
    # belong to main_cluster.
    #
    # Returns 0.0 when main_cluster has fewer than two points, because there
    # is nothing to average over.
    def calculate_a1 point
      members = main_cluster.points
      return 0.0 if members.size < 2

      total = members.map { |member| euclidean_distance(point, member) }.inject(:+)
      total / (members.size - 1)
    end

    private

    # Mean distance from +point+ to each of +points+.
    def average_distance point, points
      distances = points.map { |other| euclidean_distance(point, other) }
      distances.inject(:+) / distances.size
    end

    # Silhouette value for one point given its a and b distances.
    def silhouette a1, b1
      # A point alone in its cluster gets 0 by the usual silhouette convention.
      return 0.0 if main_cluster.points.size == 1

      scale = [a1, b1].max
      # All distances are zero (coincident points): avoid 0 / 0.
      return 0.0 if scale.zero?

      (b1 - a1) / scale
    end
  end
end
|
@@ -5,7 +5,7 @@ describe AgglomerativeClustering::DistanceMatrix do
|
|
5
5
|
matrix = AgglomerativeClustering::DistanceMatrix.new(Matrix.empty)
|
6
6
|
matrix.add_edge([1,2])
|
7
7
|
matrix.add_edge([2,2,3])
|
8
|
-
expect(matrix.remove_edge(0)).to eql(
|
8
|
+
expect(matrix.remove_edge(0)).to eql([[2,3]])
|
9
9
|
end
|
10
10
|
end
|
11
11
|
|
@@ -15,4 +15,11 @@ describe AgglomerativeClustering::DistanceMatrix do
|
|
15
15
|
expect(matrix.add_edge([4,5,6,7])).to eql(Matrix[[1,2,3,4],[4,5,6,7]])
|
16
16
|
end
|
17
17
|
end
|
18
|
+
|
19
|
+
context '#shortest_distance' do
|
20
|
+
it 'will return the indexes of the shortest distances' do
|
21
|
+
matrix = AgglomerativeClustering::DistanceMatrix.new(Matrix.rows([[0,2,3],[2,0,3]]))
|
22
|
+
expect(matrix.shortest_distance).to eql([0,1])
|
23
|
+
end
|
24
|
+
end
|
18
25
|
end
|
@@ -35,12 +35,6 @@ describe AgglomerativeClustering::Set do
|
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
38
|
-
context '#merge_clusters' do
|
39
|
-
it 'will merge two clusters into one and update the distance matrix' do
|
40
|
-
expect(@set.merge_clusters([0,1]).points).to eql([@point1, @point2])
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
38
|
context '#find_outliers' do
|
45
39
|
it 'will return a list of outliers' do
|
46
40
|
outlier1 = FactoryGirl.build(:point, x:100, y:200, z:300)
|
@@ -0,0 +1,24 @@
|
|
1
|
+
describe AgglomerativeClustering::SilhouetteCoefficient do
|
2
|
+
|
3
|
+
context '#measure' do
|
4
|
+
it 'will return the average silhoutte coefficient of a cluster' do
|
5
|
+
p1 = FactoryGirl.build(:point, x:1, y:1, z:1)
|
6
|
+
p2 = FactoryGirl.build(:point, x:3, y:3, z:3)
|
7
|
+
p3 = FactoryGirl.build(:point, x:17, y:17, z:17)
|
8
|
+
p4 = FactoryGirl.build(:point, x:16, y:16, z:16)
|
9
|
+
p5 = FactoryGirl.build(:point, x:18, y:18, z:18)
|
10
|
+
p6 = FactoryGirl.build(:point, x:2, y:2, z:2)
|
11
|
+
cluster1 = AgglomerativeClustering::Cluster.new(p1)
|
12
|
+
cluster2 = AgglomerativeClustering::Cluster.new(p2)
|
13
|
+
cluster3 = AgglomerativeClustering::Cluster.new(p3)
|
14
|
+
cluster4 = AgglomerativeClustering::Cluster.new(p4)
|
15
|
+
cluster5 = AgglomerativeClustering::Cluster.new(p5)
|
16
|
+
cluster6 = AgglomerativeClustering::Cluster.new(p6)
|
17
|
+
cluster1.merge(cluster2).merge(cluster3)
|
18
|
+
cluster4.merge(cluster5).merge(cluster6)
|
19
|
+
sc = AgglomerativeClustering::SilhouetteCoefficient.new(cluster1)
|
20
|
+
clusters = [cluster1, cluster4]
|
21
|
+
expect(sc.measure(clusters).round(4)).to eql(-0.0893)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: agglomerative_clustering
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bryan Mulvihill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -79,6 +79,7 @@ files:
|
|
79
79
|
- lib/agglomerative_clustering/linkage/single.rb
|
80
80
|
- lib/agglomerative_clustering/point.rb
|
81
81
|
- lib/agglomerative_clustering/set.rb
|
82
|
+
- lib/agglomerative_clustering/silhouette_coefficient.rb
|
82
83
|
- lib/agglomerative_clustering/version.rb
|
83
84
|
- spec/factories/lib/agglomerative_clustering/cluster.rb
|
84
85
|
- spec/factories/lib/agglomerative_clustering/point.rb
|
@@ -91,6 +92,7 @@ files:
|
|
91
92
|
- spec/lib/agglomerative_clustering/linkage/complete_spec.rb
|
92
93
|
- spec/lib/agglomerative_clustering/linkage/single_spec.rb
|
93
94
|
- spec/lib/agglomerative_clustering/set_spec.rb
|
95
|
+
- spec/lib/agglomerative_clustering/silhouette_coefficient_spec.rb
|
94
96
|
- spec/spec_helper.rb
|
95
97
|
homepage: https://github.com/bmulvihill/agglomerative_clustering
|
96
98
|
licenses:
|
@@ -128,4 +130,5 @@ test_files:
|
|
128
130
|
- spec/lib/agglomerative_clustering/linkage/complete_spec.rb
|
129
131
|
- spec/lib/agglomerative_clustering/linkage/single_spec.rb
|
130
132
|
- spec/lib/agglomerative_clustering/set_spec.rb
|
133
|
+
- spec/lib/agglomerative_clustering/silhouette_coefficient_spec.rb
|
131
134
|
- spec/spec_helper.rb
|