agglomerative_clustering 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/cluster.rb +4 -0
- data/lib/agglomerative_clustering/distance_matrix.rb +23 -6
- data/lib/agglomerative_clustering/set.rb +17 -31
- data/lib/agglomerative_clustering/silhouette_coefficient.rb +38 -0
- data/lib/agglomerative_clustering/version.rb +1 -1
- data/lib/agglomerative_clustering.rb +1 -0
- data/spec/lib/agglomerative_clustering/distance_matrix_spec.rb +8 -1
- data/spec/lib/agglomerative_clustering/set_spec.rb +0 -6
- data/spec/lib/agglomerative_clustering/silhouette_coefficient_spec.rb +24 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 03755a284a11c8225365f9b22df38da433e07994
|
4
|
+
data.tar.gz: 6996369a3b16c5541cd694a8315c2c40606b91e3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1061fb7aae2bc9c6cee7ba054593331a6e9db4c5388d895f821aad46b6d05d86721f61c947ce5377095ff7dc3adf8f93494ce9a3697f44357521d5ecc97e8523
|
7
|
+
data.tar.gz: f481ee1c97df283e3a354ed447319fbf9041d5507ac677269288e9c7802eb743510e016a6a0fca9a5232eb0a98420a1c393e1a08c9a52a0dfac91fcc21d91fd7
|
data/cluster.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'matrix'
|
1
2
|
module AgglomerativeClustering
|
2
3
|
class DistanceMatrix
|
3
4
|
|
@@ -6,9 +7,7 @@ module AgglomerativeClustering
|
|
6
7
|
end
|
7
8
|
|
8
9
|
def matrix
|
9
|
-
|
10
|
-
matrix_array[row][column]
|
11
|
-
end
|
10
|
+
@matrix ||= build_matrix
|
12
11
|
end
|
13
12
|
|
14
13
|
def print_matrix
|
@@ -18,20 +17,38 @@ module AgglomerativeClustering
|
|
18
17
|
def remove_edge index
|
19
18
|
matrix_array.delete_at(index)
|
20
19
|
matrix_array.each { |row| row.delete_at(index) }
|
21
|
-
Matrix.rows(matrix_array)
|
22
20
|
end
|
23
21
|
|
24
22
|
def add_edge weights
|
25
23
|
matrix_array.each_with_index { |row, index| row << weights[index] }
|
26
24
|
matrix_array << weights
|
27
|
-
|
25
|
+
@matrix = build_matrix
|
26
|
+
end
|
27
|
+
|
28
|
+
def shortest_distance
|
29
|
+
min_dist = 1.0/0
|
30
|
+
indexes = []
|
31
|
+
matrix.each_with_index do |index, row, column|
|
32
|
+
distance = matrix[row, column]
|
33
|
+
if distance < min_dist && (row != column)
|
34
|
+
min_dist = distance
|
35
|
+
indexes = [row, column]
|
36
|
+
end
|
37
|
+
end
|
38
|
+
indexes
|
28
39
|
end
|
29
|
-
|
40
|
+
|
30
41
|
private
|
31
42
|
|
32
43
|
def matrix_array
|
33
44
|
@matrix_array ||= []
|
34
45
|
end
|
35
46
|
|
47
|
+
def build_matrix
|
48
|
+
Matrix.build(matrix_array.size, matrix_array.first.size) do |row, column|
|
49
|
+
matrix_array[row][column]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
36
53
|
end
|
37
54
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'matrix'
|
2
1
|
module AgglomerativeClustering
|
3
2
|
class Set
|
4
3
|
include EuclideanDistance
|
@@ -26,11 +25,27 @@ module AgglomerativeClustering
|
|
26
25
|
|
27
26
|
def cluster total_clusters
|
28
27
|
while clusters.size > total_clusters
|
29
|
-
merge_clusters(shortest_distance)
|
28
|
+
merge_clusters(distance_matrix.shortest_distance)
|
30
29
|
end
|
31
30
|
clusters
|
32
31
|
end
|
33
32
|
|
33
|
+
def outliers
|
34
|
+
set_outliers.uniq
|
35
|
+
end
|
36
|
+
|
37
|
+
def find_outliers percentage_of_clusters, distance
|
38
|
+
distance_matrix.matrix.each_with_index do |index, row, column|
|
39
|
+
count_hash[row] ||= 0
|
40
|
+
count_hash[row] += 1 if distance_matrix.matrix[row, column] > distance
|
41
|
+
set_outliers << points[row] if count_hash[row]/(distance_matrix.matrix.row_count - 1) > percentage_of_clusters/100
|
42
|
+
end
|
43
|
+
points.reject! { |point| outliers.include?(point) }
|
44
|
+
outliers
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
|
34
49
|
def merge_clusters indexes
|
35
50
|
index1, index2 = indexes
|
36
51
|
new_cluster = clusters[index1].merge(clusters[index2])
|
@@ -48,22 +63,6 @@ module AgglomerativeClustering
|
|
48
63
|
distance_matrix
|
49
64
|
end
|
50
65
|
|
51
|
-
def outliers
|
52
|
-
set_outliers.uniq
|
53
|
-
end
|
54
|
-
|
55
|
-
def find_outliers percentage_of_clusters, distance
|
56
|
-
distance_matrix.matrix.each_with_index do |index, row, column|
|
57
|
-
count_hash[row] ||= 0
|
58
|
-
count_hash[row] += 1 if distance_matrix.matrix[row, column] > distance
|
59
|
-
set_outliers << points[row] if count_hash[row]/(distance_matrix.matrix.row_count - 1) > percentage_of_clusters/100
|
60
|
-
end
|
61
|
-
points.reject! { |point| outliers.include?(point) }
|
62
|
-
outliers
|
63
|
-
end
|
64
|
-
|
65
|
-
private
|
66
|
-
|
67
66
|
def add_cluster new_cluster
|
68
67
|
clusters << new_cluster
|
69
68
|
update_distance_matrix(clusters.size - 1)
|
@@ -75,19 +74,6 @@ module AgglomerativeClustering
|
|
75
74
|
distance_matrix.remove_edge(index)
|
76
75
|
end
|
77
76
|
|
78
|
-
def shortest_distance
|
79
|
-
min_cluster_dist = 1.0/0
|
80
|
-
indexes = []
|
81
|
-
distance_matrix.matrix.each_with_index do |index, row, column|
|
82
|
-
distance = distance_matrix.matrix[row, column]
|
83
|
-
if distance < min_cluster_dist && distance != 0
|
84
|
-
min_cluster_dist = distance
|
85
|
-
indexes = [row, column]
|
86
|
-
end
|
87
|
-
end
|
88
|
-
indexes
|
89
|
-
end
|
90
|
-
|
91
77
|
def set_outliers
|
92
78
|
@set_outliers ||= []
|
93
79
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module AgglomerativeClustering
|
2
|
+
class SilhouetteCoefficient
|
3
|
+
include EuclideanDistance
|
4
|
+
attr_reader :main_cluster
|
5
|
+
|
6
|
+
def initialize main_cluster
|
7
|
+
@main_cluster = main_cluster
|
8
|
+
end
|
9
|
+
|
10
|
+
def measure clusters
|
11
|
+
silhouettes = []
|
12
|
+
average_distances = []
|
13
|
+
main_cluster.points.each do |point1|
|
14
|
+
a1 = calculate_a1(point1)
|
15
|
+
(clusters - [main_cluster]).each do |cluster|
|
16
|
+
distances = []
|
17
|
+
cluster.points.each do |point2|
|
18
|
+
distances << euclidean_distance(point1, point2)
|
19
|
+
end
|
20
|
+
average_distances << distances.inject(:+)/distances.size
|
21
|
+
end
|
22
|
+
b1 = average_distances.min
|
23
|
+
s1 = (b1 - a1)/[a1,b1].max
|
24
|
+
silhouettes << s1
|
25
|
+
end
|
26
|
+
silhouettes.inject(:+) / silhouettes.size
|
27
|
+
end
|
28
|
+
|
29
|
+
def calculate_a1 point
|
30
|
+
distances = []
|
31
|
+
main_cluster.points.each do |point1|
|
32
|
+
distances << euclidean_distance(point, point1)
|
33
|
+
end
|
34
|
+
distances.inject(:+)/(distances.size - 1)
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
@@ -5,7 +5,7 @@ describe AgglomerativeClustering::DistanceMatrix do
|
|
5
5
|
matrix = AgglomerativeClustering::DistanceMatrix.new(Matrix.empty)
|
6
6
|
matrix.add_edge([1,2])
|
7
7
|
matrix.add_edge([2,2,3])
|
8
|
-
expect(matrix.remove_edge(0)).to eql(Matrix[[2,3]])
|
8
|
+
expect(matrix.remove_edge(0)).to eql([[2,3]])
|
9
9
|
end
|
10
10
|
end
|
11
11
|
|
@@ -15,4 +15,11 @@ describe AgglomerativeClustering::DistanceMatrix do
|
|
15
15
|
expect(matrix.add_edge([4,5,6,7])).to eql(Matrix[[1,2,3,4],[4,5,6,7]])
|
16
16
|
end
|
17
17
|
end
|
18
|
+
|
19
|
+
context '#shortest_distance' do
|
20
|
+
it 'will return the indexes of the shortest distances' do
|
21
|
+
matrix = AgglomerativeClustering::DistanceMatrix.new(Matrix.rows([[0,2,3],[2,0,3]]))
|
22
|
+
expect(matrix.shortest_distance).to eql([0,1])
|
23
|
+
end
|
24
|
+
end
|
18
25
|
end
|
@@ -35,12 +35,6 @@ describe AgglomerativeClustering::Set do
|
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
38
|
-
context '#merge_clusters' do
|
39
|
-
it 'will merge two clusters into one and update the distance matrix' do
|
40
|
-
expect(@set.merge_clusters([0,1]).points).to eql([@point1, @point2])
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
38
|
context '#find_outliers' do
|
45
39
|
it 'will return a list of outliers' do
|
46
40
|
outlier1 = FactoryGirl.build(:point, x:100, y:200, z:300)
|
@@ -0,0 +1,24 @@
|
|
1
|
+
describe AgglomerativeClustering::SilhouetteCoefficient do
|
2
|
+
|
3
|
+
context '#measure' do
|
4
|
+
it 'will return the average silhoutte coefficient of a cluster' do
|
5
|
+
p1 = FactoryGirl.build(:point, x:1, y:1, z:1)
|
6
|
+
p2 = FactoryGirl.build(:point, x:3, y:3, z:3)
|
7
|
+
p3 = FactoryGirl.build(:point, x:17, y:17, z:17)
|
8
|
+
p4 = FactoryGirl.build(:point, x:16, y:16, z:16)
|
9
|
+
p5 = FactoryGirl.build(:point, x:18, y:18, z:18)
|
10
|
+
p6 = FactoryGirl.build(:point, x:2, y:2, z:2)
|
11
|
+
cluster1 = AgglomerativeClustering::Cluster.new(p1)
|
12
|
+
cluster2 = AgglomerativeClustering::Cluster.new(p2)
|
13
|
+
cluster3 = AgglomerativeClustering::Cluster.new(p3)
|
14
|
+
cluster4 = AgglomerativeClustering::Cluster.new(p4)
|
15
|
+
cluster5 = AgglomerativeClustering::Cluster.new(p5)
|
16
|
+
cluster6 = AgglomerativeClustering::Cluster.new(p6)
|
17
|
+
cluster1.merge(cluster2).merge(cluster3)
|
18
|
+
cluster4.merge(cluster5).merge(cluster6)
|
19
|
+
sc = AgglomerativeClustering::SilhouetteCoefficient.new(cluster1)
|
20
|
+
clusters = [cluster1, cluster4]
|
21
|
+
expect(sc.measure(clusters).round(4)).to eql(-0.0893)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: agglomerative_clustering
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bryan Mulvihill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -79,6 +79,7 @@ files:
|
|
79
79
|
- lib/agglomerative_clustering/linkage/single.rb
|
80
80
|
- lib/agglomerative_clustering/point.rb
|
81
81
|
- lib/agglomerative_clustering/set.rb
|
82
|
+
- lib/agglomerative_clustering/silhouette_coefficient.rb
|
82
83
|
- lib/agglomerative_clustering/version.rb
|
83
84
|
- spec/factories/lib/agglomerative_clustering/cluster.rb
|
84
85
|
- spec/factories/lib/agglomerative_clustering/point.rb
|
@@ -91,6 +92,7 @@ files:
|
|
91
92
|
- spec/lib/agglomerative_clustering/linkage/complete_spec.rb
|
92
93
|
- spec/lib/agglomerative_clustering/linkage/single_spec.rb
|
93
94
|
- spec/lib/agglomerative_clustering/set_spec.rb
|
95
|
+
- spec/lib/agglomerative_clustering/silhouette_coefficient_spec.rb
|
94
96
|
- spec/spec_helper.rb
|
95
97
|
homepage: https://github.com/bmulvihill/agglomerative_clustering
|
96
98
|
licenses:
|
@@ -128,4 +130,5 @@ test_files:
|
|
128
130
|
- spec/lib/agglomerative_clustering/linkage/complete_spec.rb
|
129
131
|
- spec/lib/agglomerative_clustering/linkage/single_spec.rb
|
130
132
|
- spec/lib/agglomerative_clustering/set_spec.rb
|
133
|
+
- spec/lib/agglomerative_clustering/silhouette_coefficient_spec.rb
|
131
134
|
- spec/spec_helper.rb
|