agglomerative_clustering 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 03755a284a11c8225365f9b22df38da433e07994
4
- data.tar.gz: 6996369a3b16c5541cd694a8315c2c40606b91e3
3
+ metadata.gz: 3c8adc349d03e25a7a982a0479069f510eb6a97a
4
+ data.tar.gz: 91cfe58dd5b19999699d1cf331bcd12b0b2451db
5
5
  SHA512:
6
- metadata.gz: 1061fb7aae2bc9c6cee7ba054593331a6e9db4c5388d895f821aad46b6d05d86721f61c947ce5377095ff7dc3adf8f93494ce9a3697f44357521d5ecc97e8523
7
- data.tar.gz: f481ee1c97df283e3a354ed447319fbf9041d5507ac677269288e9c7802eb743510e016a6a0fca9a5232eb0a98420a1c393e1a08c9a52a0dfac91fcc21d91fd7
6
+ metadata.gz: 7ecefeb4ecb8f02f10da56b95cb883c6752631a28fc0212fb8e8f72f3b57d361c7597179ba9bf97c4a804f0540f7a81393a0c0263048994c3b14bca63bc6e364
7
+ data.tar.gz: 8b979ffcc0778884639e02ae3651792582802efd31e58ccd8d2964d1d56b30b94dd00953389a3df55654287412d4bd8ef64321a65ab3af3386570106fdce6969
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env ruby
2
+ require 'agglomerative_clustering'
3
+
4
+ puts "Enter a total number of points"
5
+ points = gets.chomp.to_i
6
+ puts "Please enter a P value (Percentage of points to determine outliers)"
7
+ percentage = gets.chomp.to_f
8
+ puts "Please enter a d value (Distance of points to determine outliers)"
9
+ distance = gets.chomp.to_f
10
+ puts "Please enter a number of clusters"
11
+ k = gets.chomp.to_i
12
+ puts "Enter y to enter a center point for each pseudo cluster (Or press enter for random)"
13
+ center_points = []
14
+ cp_input = gets.chomp
15
+ if cp_input == 'y'
16
+ for l in 0..k-1
17
+ center_points << gets.chomp.split(',').map(&:to_i)
18
+ end
19
+ else
20
+ for i in 0..k-1
21
+ center_points << [Random.rand(-100..100), Random.rand(-100..100), Random.rand(-100..100)]
22
+ end
23
+ end
24
+
25
+ puts "Processing.."
26
+ min_set = AgglomerativeClustering::Set.new(AgglomerativeClustering::Linkage::Single.new)
27
+ max_set = AgglomerativeClustering::Set.new(AgglomerativeClustering::Linkage::Complete.new)
28
+ average_set = AgglomerativeClustering::Set.new(AgglomerativeClustering::Linkage::Average.new)
29
+ center_set = AgglomerativeClustering::Set.new(AgglomerativeClustering::Linkage::Center.new)
30
+ sets = [min_set, max_set, average_set, center_set]
31
+
32
+ # Create set of 500 points based on provided parameters
33
+ radius = 25
34
+ for clusters in 0..k-1
35
+ cp = center_points[clusters]
36
+ for i in 0..(points/k-1)
37
+ x = cp[0] + (Random.rand(-radius..radius))
38
+ y = cp[1] + (Random.rand(-radius..radius))
39
+ z = cp[2] + (Random.rand(-radius..radius))
40
+ p = Point.new(x,y,z)
41
+ sets.each {|set| set.push(p)}
42
+ end
43
+ end
44
+
45
+ # Find and Remove Outliers in the Set
46
+ sets.each do |set|
47
+ set.find_outliers(percentage, distance)
48
+ end
49
+
50
+ # Output outliers to the console
51
+ if min_set.outliers.any?
52
+ puts 'Outliers Removed from Set:'
53
+ min_set.outliers.each do |outlier|
54
+ puts outlier
55
+ end
56
+ else
57
+ puts "There are no outliers where #{percentage}% of the points lie at a distance greater than #{distance}"
58
+ end
59
+
60
+ # Run the clustering algorithms
61
+ min_clusters = min_set.cluster(k)
62
+ max_clusters = max_set.cluster(k)
63
+ average_clusters = average_set.cluster(k)
64
+ center_clusters = center_set.cluster(k)
65
+
66
+ # Outputs points on each cluster
67
+ puts "=====Minimum Linkage Clusters====="
68
+ min_clusters.each_with_index{|c, i| puts "Cluster #{i + 1}:\n #{c.points}" }
69
+ puts "=====Maximum Linkage Clusters====="
70
+ max_clusters.each_with_index{|c, i| puts "Cluster #{i + 1}:\n #{c.points}" }
71
+ puts "=====Average Linkage Clusters====="
72
+ average_clusters.each_with_index{|c, i| puts "Cluster #{i + 1}:\n #{c.points}" }
73
+ puts "=====Center Linkage Clusters======"
74
+ center_clusters.each_with_index{|c, i| puts "Cluster #{i + 1}:\n #{c.points}" }
75
+
76
+ # Output Silhouette Coefficients for the clusters in each set
77
+ puts 'Minimum Linkage Silhouette Coefficients: '
78
+ s1 = min_clusters.map { |c| AgglomerativeClustering::SilhouetteCoefficient.new(c).measure(min_clusters) }
79
+ p s1
80
+ puts 'Maximum Linkage Silhouette Coefficients: '
81
+ s2 = max_clusters.map { |c| AgglomerativeClustering::SilhouetteCoefficient.new(c).measure(max_clusters) }
82
+ p s2
83
+ puts 'Average Linkage Silhouette Coefficients: '
84
+ s3 = average_clusters.map { |cluster| AgglomerativeClustering::SilhouetteCoefficient.new(cluster).measure(average_clusters) }
85
+ p s3
86
+ puts 'Center Linkage Silhouette Coefficients: '
87
+ s4 = center_clusters.map { |cluster| AgglomerativeClustering::SilhouetteCoefficient.new(cluster).measure(center_clusters) }
88
+ p s4
89
+
90
+ # output best average silhouette coefficient
91
+ type_hash = {0 => 'Minimum', 1 => 'Maximum', 2 => 'Average', 3 => 'Center'}
92
+ all = [s1,s2,s3,s4].map { |s| s.inject(:+)/s.size }
93
+ puts "The best Average Silhouette Coefficient is #{all.max} from #{type_hash[all.index(all.max)]} Linkage"
@@ -8,6 +8,8 @@ module AgglomerativeClustering
8
8
 
9
9
  def points
10
10
  @points ||= []
11
+ @points.sort_by!{|p| [p.x, p.y, p.z] } if @points.any?
12
+ @points
11
13
  end
12
14
 
13
15
  def merge(cluster)
@@ -8,9 +8,10 @@ module AgglomerativeClustering
8
8
  end
9
9
 
10
10
  def center_point cluster
11
- cluster.points.first.zip(*cluster.points[1..cluster.points.size-1]).map { |a,b| (a + b)/cluster.points.size.to_f }
11
+ return cluster.points.first if cluster.points.size == 1
12
+ cluster.points.first.zip(*cluster.points[1..cluster.points.size-1]).map { |a| a.inject(:+).to_f/a.size.to_f }
12
13
  end
13
-
14
+
14
15
  end
15
16
  end
16
17
  end
@@ -38,9 +38,12 @@ module AgglomerativeClustering
38
38
  distance_matrix.matrix.each_with_index do |index, row, column|
39
39
  count_hash[row] ||= 0
40
40
  count_hash[row] += 1 if distance_matrix.matrix[row, column] > distance
41
- set_outliers << points[row] if count_hash[row]/(distance_matrix.matrix.row_count - 1) > percentage_of_clusters/100
41
+ if count_hash[row]/(distance_matrix.matrix.row_count - 1) > percentage_of_clusters/100
42
+ set_outliers << points[row]
43
+ end
42
44
  end
43
45
  points.reject! { |point| outliers.include?(point) }
46
+ @distance_matrix = build_distance_matrix
44
47
  outliers
45
48
  end
46
49
 
@@ -7,6 +7,8 @@ module AgglomerativeClustering
7
7
  @main_cluster = main_cluster
8
8
  end
9
9
 
10
+ # Measures the silhouette coefficient of a cluster compared to all other clusters
11
+ # Returns the average silhouette coefficient of a cluster
10
12
  def measure clusters
11
13
  silhouettes = []
12
14
  average_distances = []
@@ -15,23 +17,25 @@ module AgglomerativeClustering
15
17
  (clusters - [main_cluster]).each do |cluster|
16
18
  distances = []
17
19
  cluster.points.each do |point2|
18
- distances << euclidean_distance(point1, point2)
20
+ distances << euclidean_distance(point1, point2).round(2)
19
21
  end
20
22
  average_distances << distances.inject(:+)/distances.size
21
23
  end
22
- b1 = average_distances.min
24
+ b1 = average_distances.min || 0
23
25
  s1 = (b1 - a1)/[a1,b1].max
24
26
  silhouettes << s1
25
27
  end
26
- silhouettes.inject(:+) / silhouettes.size
28
+ (silhouettes.inject(:+) / silhouettes.size).round(2)
27
29
  end
28
30
 
31
+ # Calculates the a1 value of a cluster
29
32
  def calculate_a1 point
30
33
  distances = []
31
34
  main_cluster.points.each do |point1|
32
- distances << euclidean_distance(point, point1)
35
+ distances << euclidean_distance(point, point1).round(2)
33
36
  end
34
- distances.inject(:+)/(distances.size - 1)
37
+ return 0 if distances.size == 1
38
+ (distances.inject(:+)/(distances.size - 1)).round(2)
35
39
  end
36
40
 
37
41
  end
@@ -1,3 +1,3 @@
1
1
  module AgglomerativeClustering
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -5,7 +5,7 @@ describe AgglomerativeClustering::Cluster do
5
5
  cluster1 = FactoryGirl.build(:cluster)
6
6
  cluster2 = FactoryGirl.build(:cluster)
7
7
  points = cluster1.points + cluster2.points
8
- expect(cluster1.merge(cluster2).points).to eql(points)
8
+ expect(cluster1.merge(cluster2).points).to eql(points.sort_by{|p| [p.x, p.y, p.z]})
9
9
  end
10
10
  end
11
11
  end
@@ -3,17 +3,17 @@ describe AgglomerativeClustering::Linkage::Single do
3
3
  context '#calculate_distance' do
4
4
  it 'will calculate distance between clusters based on the min distnace between points' do
5
5
  single_linkage = AgglomerativeClustering::Linkage::Single.new
6
- min_point = FactoryGirl.build(:point, x: 2, y: 2, z: 2)
7
- max_point = FactoryGirl.build(:point, x: 5, y: 5, z: 5)
8
- cluster1 = AgglomerativeClustering::Cluster.new(min_point)
6
+ min_point1 = FactoryGirl.build(:point, x: 2, y: 2, z: 2)
7
+ min_point2 = FactoryGirl.build(:point, x: 5, y: 5, z: 5)
8
+ cluster1 = AgglomerativeClustering::Cluster.new(min_point1)
9
9
  cluster2 = AgglomerativeClustering::Cluster.new(FactoryGirl.build(:point, x: 1, y: 1, z: 1))
10
10
  cluster3 = AgglomerativeClustering::Cluster.new(FactoryGirl.build(:point, x: 7, y: 7, z: 7))
11
- cluster4 = AgglomerativeClustering::Cluster.new(max_point)
11
+ cluster4 = AgglomerativeClustering::Cluster.new(min_point2)
12
12
  cluster1 = cluster1.merge(cluster2)
13
13
  cluster3 = cluster3.merge(cluster4)
14
- min_distance = single_linkage.euclidean_distance(min_point, max_point)
14
+ min_distance = single_linkage.euclidean_distance(min_point1, min_point2)
15
15
  expect(single_linkage.calculate_distance(cluster1, cluster3)).to eql(min_distance)
16
16
  end
17
17
  end
18
-
18
+
19
19
  end
@@ -18,7 +18,7 @@ describe AgglomerativeClustering::SilhouetteCoefficient do
18
18
  cluster4.merge(cluster5).merge(cluster6)
19
19
  sc = AgglomerativeClustering::SilhouetteCoefficient.new(cluster1)
20
20
  clusters = [cluster1, cluster4]
21
- expect(sc.measure(clusters).round(4)).to eql(-0.0893)
21
+ expect(sc.measure(clusters)).to eql(-0.09)
22
22
  end
23
23
  end
24
24
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: agglomerative_clustering
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Bryan Mulvihill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-02 00:00:00.000000000 Z
11
+ date: 2014-11-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -67,7 +67,7 @@ files:
67
67
  - README.md
68
68
  - Rakefile
69
69
  - agglomerative_clustering.gemspec
70
- - cluster.rb
70
+ - cluster_example.rb
71
71
  - lib/agglomerative_clustering.rb
72
72
  - lib/agglomerative_clustering/cluster.rb
73
73
  - lib/agglomerative_clustering/distance_matrix.rb
data/cluster.rb DELETED
@@ -1,62 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'agglomerative_clustering'
3
-
4
- set = AgglomerativeClustering::Set.new(AgglomerativeClustering::Linkage::Single.new)
5
- for i in 0..99
6
- x = Random.rand(0..100)
7
- y = Random.rand(0..100)
8
- z = Random.rand(0..100)
9
- p = Point.new(x,y,z)
10
- set.push(p)
11
- end
12
- for i in 100..199
13
- x = Random.rand(200..299)
14
- y = Random.rand(200..299)
15
- z = Random.rand(200..299)
16
- p = Point.new(x,y,z)
17
- set.push(p)
18
- end
19
- for i in 200..299
20
- x = Random.rand(400..499)
21
- y = Random.rand(400..499)
22
- z = Random.rand(400..499)
23
- p = Point.new(x,y,z)
24
- set.push(p)
25
- end
26
-
27
- percentage = 80
28
- distance = 150
29
-
30
- open('points.csv', 'w') do |f|
31
- set.points.each do |point|
32
- f << "#{point.x},#{point.y},#{point.z}\n"
33
- end
34
- end
35
-
36
- open('outliers.csv', 'w') do |f|
37
- set.find_outliers(percentage, distance).each do |point|
38
- f << "#{point.x},#{point.y},#{point.z}\n"
39
- end
40
- end
41
-
42
- if set.outliers.any?
43
- puts 'Outliers Removed from Set:'
44
- set.outliers.each do |outlier|
45
- puts outlier
46
- end
47
- else
48
- puts "There are no outliers where #{percentage}% of the points lie at a distance greater than #{distance}"
49
- end
50
-
51
- clusters = set.cluster(3)
52
- clusters.each_with_index do |cluster, index|
53
- open("cluster#{index}.csv", 'w') do |f|
54
- cluster.points.each do |point|
55
- f << "#{point.x},#{point.y},#{point.z}\n"
56
- end
57
- end
58
- end
59
-
60
- puts 'Silhouette Coefficient of First Cluster: '
61
- sc = AgglomerativeClustering::SilhouetteCoefficient.new(clusters[0])
62
- puts sc.measure(clusters)