agglomerative_clustering 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/cluster_example.rb +93 -0
- data/lib/agglomerative_clustering/cluster.rb +2 -0
- data/lib/agglomerative_clustering/linkage/center.rb +3 -2
- data/lib/agglomerative_clustering/set.rb +4 -1
- data/lib/agglomerative_clustering/silhouette_coefficient.rb +9 -5
- data/lib/agglomerative_clustering/version.rb +1 -1
- data/spec/lib/agglomerative_clustering/cluster_spec.rb +1 -1
- data/spec/lib/agglomerative_clustering/linkage/single_spec.rb +6 -6
- data/spec/lib/agglomerative_clustering/silhouette_coefficient_spec.rb +1 -1
- metadata +3 -3
- data/cluster.rb +0 -62
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3c8adc349d03e25a7a982a0479069f510eb6a97a
|
4
|
+
data.tar.gz: 91cfe58dd5b19999699d1cf331bcd12b0b2451db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ecefeb4ecb8f02f10da56b95cb883c6752631a28fc0212fb8e8f72f3b57d361c7597179ba9bf97c4a804f0540f7a81393a0c0263048994c3b14bca63bc6e364
|
7
|
+
data.tar.gz: 8b979ffcc0778884639e02ae3651792582802efd31e58ccd8d2964d1d56b30b94dd00953389a3df55654287412d4bd8ef64321a65ab3af3386570106fdce6969
|
data/cluster_example.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'agglomerative_clustering'
|
3
|
+
|
4
|
+
puts "Enter a total number of points"
|
5
|
+
points = gets.chomp.to_i
|
6
|
+
puts "Please enter a P value (Percentage of points to determine outliers)"
|
7
|
+
percentage = gets.chomp.to_f
|
8
|
+
puts "Please enter a d value (Distance of points to determine outliers)"
|
9
|
+
distance = gets.chomp.to_f
|
10
|
+
puts "Please enter a number of clusters"
|
11
|
+
k = gets.chomp.to_i
|
12
|
+
puts "Enter y to enter a center point for each pseudo cluster (Or press enter for random)"
|
13
|
+
center_points = []
|
14
|
+
cp_input = gets.chomp
|
15
|
+
if cp_input == 'y'
|
16
|
+
for l in 0..k-1
|
17
|
+
center_points << gets.chomp.split(',').map(&:to_i)
|
18
|
+
end
|
19
|
+
else
|
20
|
+
for i in 0..k-1
|
21
|
+
center_points << [Random.rand(-100..100), Random.rand(-100..100), Random.rand(-100..100)]
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
puts "Processing.."
|
26
|
+
min_set = AgglomerativeClustering::Set.new(AgglomerativeClustering::Linkage::Single.new)
|
27
|
+
max_set = AgglomerativeClustering::Set.new(AgglomerativeClustering::Linkage::Complete.new)
|
28
|
+
average_set = AgglomerativeClustering::Set.new(AgglomerativeClustering::Linkage::Average.new)
|
29
|
+
center_set = AgglomerativeClustering::Set.new(AgglomerativeClustering::Linkage::Center.new)
|
30
|
+
sets = [min_set, max_set, average_set, center_set]
|
31
|
+
|
32
|
+
# Create set of 500 points based on provided parameters
|
33
|
+
radius = 25
|
34
|
+
for clusters in 0..k-1
|
35
|
+
cp = center_points[clusters]
|
36
|
+
for i in 0..(points/k-1)
|
37
|
+
x = cp[0] + (Random.rand(-radius..radius))
|
38
|
+
y = cp[1] + (Random.rand(-radius..radius))
|
39
|
+
z = cp[2] + (Random.rand(-radius..radius))
|
40
|
+
p = Point.new(x,y,z)
|
41
|
+
sets.each {|set| set.push(p)}
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
# Find and Remove Outliers in the Set
|
46
|
+
sets.each do |set|
|
47
|
+
set.find_outliers(percentage, distance)
|
48
|
+
end
|
49
|
+
|
50
|
+
# Output outliers to the console
|
51
|
+
if min_set.outliers.any?
|
52
|
+
puts 'Outliers Removed from Set:'
|
53
|
+
min_set.outliers.each do |outlier|
|
54
|
+
puts outlier
|
55
|
+
end
|
56
|
+
else
|
57
|
+
puts "There are no outliers where #{percentage}% of the points lie at a distance greater than #{distance}"
|
58
|
+
end
|
59
|
+
|
60
|
+
# Run the clustering algorithms
|
61
|
+
min_clusters = min_set.cluster(k)
|
62
|
+
max_clusters = max_set.cluster(k)
|
63
|
+
average_clusters = average_set.cluster(k)
|
64
|
+
center_clusters = center_set.cluster(k)
|
65
|
+
|
66
|
+
# Outputs points on each cluster
|
67
|
+
puts "=====Minimum Linkage Clusters====="
|
68
|
+
min_clusters.each_with_index{|c, i| puts "Cluster #{i + 1}:\n #{c.points}" }
|
69
|
+
puts "=====Maximum Linkage Clusters====="
|
70
|
+
max_clusters.each_with_index{|c, i| puts "Cluster #{i + 1}:\n #{c.points}" }
|
71
|
+
puts "=====Average Linkage Clusters====="
|
72
|
+
average_clusters.each_with_index{|c, i| puts "Cluster #{i + 1}:\n #{c.points}" }
|
73
|
+
puts "=====Center Linkage Clusters======"
|
74
|
+
center_clusters.each_with_index{|c, i| puts "Cluster #{i + 1}:\n #{c.points}" }
|
75
|
+
|
76
|
+
# Output Silhouette Coefficients for the clusters in each set
|
77
|
+
puts 'Minimum Linkage Silhouette Coefficients: '
|
78
|
+
s1 = min_clusters.map { |c| AgglomerativeClustering::SilhouetteCoefficient.new(c).measure(min_clusters) }
|
79
|
+
p s1
|
80
|
+
puts 'Maximum Linkage Silhouette Coefficients: '
|
81
|
+
s2 = max_clusters.map { |c| AgglomerativeClustering::SilhouetteCoefficient.new(c).measure(max_clusters) }
|
82
|
+
p s2
|
83
|
+
puts 'Average Linkage Silhouette Coefficients: '
|
84
|
+
s3 = average_clusters.map { |cluster| AgglomerativeClustering::SilhouetteCoefficient.new(cluster).measure(average_clusters) }
|
85
|
+
p s3
|
86
|
+
puts 'Center Linkage Silhouette Coefficients: '
|
87
|
+
s4 = center_clusters.map { |cluster| AgglomerativeClustering::SilhouetteCoefficient.new(cluster).measure(center_clusters) }
|
88
|
+
p s4
|
89
|
+
|
90
|
+
# output best average silhouette coefficient
|
91
|
+
type_hash = {0 => 'Minimum', 1 => 'Maximum', 2 => 'Average', 3 => 'Center'}
|
92
|
+
all = [s1,s2,s3,s4].map { |s| s.inject(:+)/s.size }
|
93
|
+
puts "The best Average Silhouette Coefficient is #{all.max} from #{type_hash[all.index(all.max)]} Linkage"
|
@@ -8,9 +8,10 @@ module AgglomerativeClustering
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def center_point cluster
|
11
|
-
cluster.points.first
|
11
|
+
return cluster.points.first if cluster.points.size == 1
|
12
|
+
cluster.points.first.zip(*cluster.points[1..cluster.points.size-1]).map { |a| a.inject(:+).to_f/a.size.to_f }
|
12
13
|
end
|
13
|
-
|
14
|
+
|
14
15
|
end
|
15
16
|
end
|
16
17
|
end
|
@@ -38,9 +38,12 @@ module AgglomerativeClustering
|
|
38
38
|
distance_matrix.matrix.each_with_index do |index, row, column|
|
39
39
|
count_hash[row] ||= 0
|
40
40
|
count_hash[row] += 1 if distance_matrix.matrix[row, column] > distance
|
41
|
-
|
41
|
+
if count_hash[row]/(distance_matrix.matrix.row_count - 1) > percentage_of_clusters/100
|
42
|
+
set_outliers << points[row]
|
43
|
+
end
|
42
44
|
end
|
43
45
|
points.reject! { |point| outliers.include?(point) }
|
46
|
+
@distance_matrix = build_distance_matrix
|
44
47
|
outliers
|
45
48
|
end
|
46
49
|
|
@@ -7,6 +7,8 @@ module AgglomerativeClustering
|
|
7
7
|
@main_cluster = main_cluster
|
8
8
|
end
|
9
9
|
|
10
|
+
# Measures the silhouette coefficient of a cluster compared to all other clusters
|
11
|
+
# Returns the average silhouette coefficient of a cluster
|
10
12
|
def measure clusters
|
11
13
|
silhouettes = []
|
12
14
|
average_distances = []
|
@@ -15,23 +17,25 @@ module AgglomerativeClustering
|
|
15
17
|
(clusters - [main_cluster]).each do |cluster|
|
16
18
|
distances = []
|
17
19
|
cluster.points.each do |point2|
|
18
|
-
distances << euclidean_distance(point1, point2)
|
20
|
+
distances << euclidean_distance(point1, point2).round(2)
|
19
21
|
end
|
20
22
|
average_distances << distances.inject(:+)/distances.size
|
21
23
|
end
|
22
|
-
b1 = average_distances.min
|
24
|
+
b1 = average_distances.min || 0
|
23
25
|
s1 = (b1 - a1)/[a1,b1].max
|
24
26
|
silhouettes << s1
|
25
27
|
end
|
26
|
-
silhouettes.inject(:+) / silhouettes.size
|
28
|
+
(silhouettes.inject(:+) / silhouettes.size).round(2)
|
27
29
|
end
|
28
30
|
|
31
|
+
# Calculates the a1 value of a cluster
|
29
32
|
def calculate_a1 point
|
30
33
|
distances = []
|
31
34
|
main_cluster.points.each do |point1|
|
32
|
-
distances << euclidean_distance(point, point1)
|
35
|
+
distances << euclidean_distance(point, point1).round(2)
|
33
36
|
end
|
34
|
-
distances.
|
37
|
+
return 0 if distances.size == 1
|
38
|
+
(distances.inject(:+)/(distances.size - 1)).round(2)
|
35
39
|
end
|
36
40
|
|
37
41
|
end
|
@@ -5,7 +5,7 @@ describe AgglomerativeClustering::Cluster do
|
|
5
5
|
cluster1 = FactoryGirl.build(:cluster)
|
6
6
|
cluster2 = FactoryGirl.build(:cluster)
|
7
7
|
points = cluster1.points + cluster2.points
|
8
|
-
expect(cluster1.merge(cluster2).points).to eql(points)
|
8
|
+
expect(cluster1.merge(cluster2).points).to eql(points.sort_by{|p| [p.x, p.y, p.z]})
|
9
9
|
end
|
10
10
|
end
|
11
11
|
end
|
@@ -3,17 +3,17 @@ describe AgglomerativeClustering::Linkage::Single do
|
|
3
3
|
context '#calculate_distance' do
|
4
4
|
it 'will calculate distance between clusters based on the min distnace between points' do
|
5
5
|
single_linkage = AgglomerativeClustering::Linkage::Single.new
|
6
|
-
|
7
|
-
|
8
|
-
cluster1 = AgglomerativeClustering::Cluster.new(
|
6
|
+
min_point1 = FactoryGirl.build(:point, x: 2, y: 2, z: 2)
|
7
|
+
min_point2 = FactoryGirl.build(:point, x: 5, y: 5, z: 5)
|
8
|
+
cluster1 = AgglomerativeClustering::Cluster.new(min_point1)
|
9
9
|
cluster2 = AgglomerativeClustering::Cluster.new(FactoryGirl.build(:point, x: 1, y: 1, z: 1))
|
10
10
|
cluster3 = AgglomerativeClustering::Cluster.new(FactoryGirl.build(:point, x: 7, y: 7, z: 7))
|
11
|
-
cluster4 = AgglomerativeClustering::Cluster.new(
|
11
|
+
cluster4 = AgglomerativeClustering::Cluster.new(min_point2)
|
12
12
|
cluster1 = cluster1.merge(cluster2)
|
13
13
|
cluster3 = cluster3.merge(cluster4)
|
14
|
-
min_distance = single_linkage.euclidean_distance(
|
14
|
+
min_distance = single_linkage.euclidean_distance(min_point1, min_point2)
|
15
15
|
expect(single_linkage.calculate_distance(cluster1, cluster3)).to eql(min_distance)
|
16
16
|
end
|
17
17
|
end
|
18
|
-
|
18
|
+
|
19
19
|
end
|
@@ -18,7 +18,7 @@ describe AgglomerativeClustering::SilhouetteCoefficient do
|
|
18
18
|
cluster4.merge(cluster5).merge(cluster6)
|
19
19
|
sc = AgglomerativeClustering::SilhouetteCoefficient.new(cluster1)
|
20
20
|
clusters = [cluster1, cluster4]
|
21
|
-
expect(sc.measure(clusters)
|
21
|
+
expect(sc.measure(clusters)).to eql(-0.09)
|
22
22
|
end
|
23
23
|
end
|
24
24
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: agglomerative_clustering
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bryan Mulvihill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-11-
|
11
|
+
date: 2014-11-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -67,7 +67,7 @@ files:
|
|
67
67
|
- README.md
|
68
68
|
- Rakefile
|
69
69
|
- agglomerative_clustering.gemspec
|
70
|
-
- cluster.rb
|
70
|
+
- cluster_example.rb
|
71
71
|
- lib/agglomerative_clustering.rb
|
72
72
|
- lib/agglomerative_clustering/cluster.rb
|
73
73
|
- lib/agglomerative_clustering/distance_matrix.rb
|
data/cluster.rb
DELETED
@@ -1,62 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'agglomerative_clustering'
|
3
|
-
|
4
|
-
set = AgglomerativeClustering::Set.new(AgglomerativeClustering::Linkage::Single.new)
|
5
|
-
for i in 0..99
|
6
|
-
x = Random.rand(0..100)
|
7
|
-
y = Random.rand(0..100)
|
8
|
-
z = Random.rand(0..100)
|
9
|
-
p = Point.new(x,y,z)
|
10
|
-
set.push(p)
|
11
|
-
end
|
12
|
-
for i in 100..199
|
13
|
-
x = Random.rand(200..299)
|
14
|
-
y = Random.rand(200..299)
|
15
|
-
z = Random.rand(200..299)
|
16
|
-
p = Point.new(x,y,z)
|
17
|
-
set.push(p)
|
18
|
-
end
|
19
|
-
for i in 200..299
|
20
|
-
x = Random.rand(400..499)
|
21
|
-
y = Random.rand(400..499)
|
22
|
-
z = Random.rand(400..499)
|
23
|
-
p = Point.new(x,y,z)
|
24
|
-
set.push(p)
|
25
|
-
end
|
26
|
-
|
27
|
-
percentage = 80
|
28
|
-
distance = 150
|
29
|
-
|
30
|
-
open('points.csv', 'w') do |f|
|
31
|
-
set.points.each do |point|
|
32
|
-
f << "#{point.x},#{point.y},#{point.z}\n"
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
|
-
open('outliers.csv', 'w') do |f|
|
37
|
-
set.find_outliers(percentage, distance).each do |point|
|
38
|
-
f << "#{point.x},#{point.y},#{point.z}\n"
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
if set.outliers.any?
|
43
|
-
puts 'Outliers Removed from Set:'
|
44
|
-
set.outliers.each do |outlier|
|
45
|
-
puts outlier
|
46
|
-
end
|
47
|
-
else
|
48
|
-
puts "There are no outliers where #{percentage}% of the points lie at a distance greater than #{distance}"
|
49
|
-
end
|
50
|
-
|
51
|
-
clusters = set.cluster(3)
|
52
|
-
clusters.each_with_index do |cluster, index|
|
53
|
-
open("cluster#{index}.csv", 'w') do |f|
|
54
|
-
cluster.points.each do |point|
|
55
|
-
f << "#{point.x},#{point.y},#{point.z}\n"
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
puts 'Silhouette Coefficient of First Cluster: '
|
61
|
-
sc = AgglomerativeClustering::SilhouetteCoefficient.new(clusters[0])
|
62
|
-
puts sc.measure(clusters)
|