same_same 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/.gitignore +18 -0
  2. data/Gemfile +4 -0
  3. data/Gemfile.lock +44 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +39 -0
  6. data/Rakefile +1 -0
  7. data/examples/dbscan_digg.rb +25 -0
  8. data/examples/dbscan_lines.rb +35 -0
  9. data/examples/rock_digg.rb +20 -0
  10. data/examples/rock_lines.rb +31 -0
  11. data/lib/same_same.rb +15 -0
  12. data/lib/same_same/cluster.rb +27 -0
  13. data/lib/same_same/cluster_similarity.rb +10 -0
  14. data/lib/same_same/cosine_distance.rb +27 -0
  15. data/lib/same_same/cosine_similarity.rb +22 -0
  16. data/lib/same_same/data_point.rb +12 -0
  17. data/lib/same_same/dbscan_algorithm.rb +135 -0
  18. data/lib/same_same/dbscan_clusters.rb +88 -0
  19. data/lib/same_same/dbscan_neighborhood.rb +68 -0
  20. data/lib/same_same/dbscan_numeric_vectors.rb +7 -0
  21. data/lib/same_same/dbscan_term_frequency_vectors.rb +7 -0
  22. data/lib/same_same/dendrogram.rb +28 -0
  23. data/lib/same_same/dendrogram_printer.rb +74 -0
  24. data/lib/same_same/jaquard_coefficient.rb +9 -0
  25. data/lib/same_same/link_matrix.rb +62 -0
  26. data/lib/same_same/merge_goodness_measure.rb +30 -0
  27. data/lib/same_same/rock_algorithm.rb +51 -0
  28. data/lib/same_same/rock_clusters.rb +68 -0
  29. data/lib/same_same/similarity_matrix.rb +20 -0
  30. data/lib/same_same/symmetrical_matrix.rb +39 -0
  31. data/lib/same_same/term_frequency_builder.rb +20 -0
  32. data/lib/same_same/version.rb +3 -0
  33. data/same_same.gemspec +23 -0
  34. data/spec/fixtures/digg_stories.csv +49 -0
  35. data/spec/fixtures/lines.csv +899 -0
  36. data/spec/same_same/dbscan_algorithm_spec.rb +72 -0
  37. data/spec/same_same/jaquard_coefficient_spec.rb +24 -0
  38. data/spec/same_same/link_matrix_spec.rb +29 -0
  39. data/spec/same_same/merge_goodness_measure_spec.rb +34 -0
  40. data/spec/same_same/rock_algorithm_spec.rb +71 -0
  41. data/spec/same_same/similarity_matrix_spec.rb +20 -0
  42. data/spec/same_same/symmetrical_matrix_spec.rb +69 -0
  43. metadata +144 -0
@@ -0,0 +1,88 @@
1
+ require 'set'
2
+
3
module SameSame
  # Bookkeeping for DBSCAN: records, per group id, the set of points that
  # currently belong to that group (noise, unclassified, or a numbered
  # cluster).
  class DbscanClusters
    # Group id of the noise points.
    NOISE_ID = -1

    # Group id of the points that have not been classified yet.
    UNCLASSIFIED_ID = 0

    attr_accessor :clusters, :last_id

    # Places every point of +unclassified+ into the unclassified group.
    def initialize(unclassified)
      # Looking up an unknown id creates (and stores) an empty Set for it.
      self.clusters = Hash.new { |groups, id| groups[id] = Set.new }
      self.last_id = 0
      assign_points(unclassified, UNCLASSIFIED_ID)
    end

    # Moves +p+ into the noise group.
    def assign_to_noise(p)
      assign_point(p, NOISE_ID)
    end

    # True when +p+ is currently in the unclassified group.
    def unclassified?(p)
      point_in_cluster?(p, UNCLASSIFIED_ID)
    end

    # True when +p+ is currently in the noise group.
    def noise?(p)
      point_in_cluster?(p, NOISE_ID)
    end

    # True when the group stored under +cluster_id+ contains +p+.
    def point_in_cluster?(p, cluster_id)
      clusters[cluster_id].include?(p)
    end

    # Assigns each of +points+ to +cluster_id+ (see #assign_point).
    def assign_points(points, cluster_id)
      points.each { |point| assign_point(point, cluster_id) }
    end

    # Moves +p+ to +cluster_id+.
    #
    # A point found in the noise or unclassified group is removed from it
    # first. A point found in neither may only be placed into the
    # unclassified group; any other target raises ArgumentError.
    def assign_point(p, cluster_id)
      # Noise is checked before unclassified; the second check only runs
      # when the first one fails.
      current_id =
        if noise?(p)
          NOISE_ID
        elsif unclassified?(p)
          UNCLASSIFIED_ID
        end

      if current_id
        remove_point_from_cluster(p, current_id)
      elsif cluster_id != UNCLASSIFIED_ID
        raise ArgumentError, "Trying to move point that has already been assigned to some other cluster. Point: #{p}, cluster_id=#{cluster_id}"
      end

      clusters[cluster_id] << p
    end

    # Returns an Array with one Cluster per non-empty group, named via
    # #cluster_name, in the order the groups were created.
    def to_clusters
      populated = clusters.reject { |_, points| points.empty? }
      populated.map { |id, points| Cluster.new(points, cluster_name(id)) }
    end

    # Human readable label for a group id.
    def cluster_name(id)
      return "Noise" if NOISE_ID == id
      return "Unclassified" if UNCLASSIFIED_ID == id

      "Cluster #{id}"
    end

    # Deletes +p+ from the group stored under +cluster_id+.
    #
    # Returns whether the point was a member before the deletion, or false
    # when no group is stored under that id.
    def remove_point_from_cluster(p, cluster_id)
      members = clusters[cluster_id]
      return false if members.nil?

      was_member = members.include?(p)
      members.delete(p)
      was_member
    end

    # Increments the id counter and returns the new value.
    def get_next_cluster_id
      self.last_id += 1
    end
  end
end
@@ -0,0 +1,68 @@
1
require 'set'

module SameSame
  # Precomputes the pairwise distances between a fixed list of points and
  # answers "which points are within eps of this one?" queries.
  class DbscanNeighborhood
    # Contains distances between points, indexed by position in +points+.
    attr_accessor :adjacency_matrix

    # Threshold value. Determines which points will be considered as
    # neighbors. Two points are neighbors if the distance between them does
    # not exceed threshold value.
    attr_accessor :eps

    # Used to cache the index of each point in the matrix.
    attr_accessor :index_mapping

    attr_accessor :points

    # Builds the index cache and the distance matrix.
    #
    # * :points - points to cluster (required)
    # * :eps - distance threshold value (required)
    # * :distance - object responding to distance(x, y) (required)
    # * :vector_calculator - object responding to vectors(p1, p2) and
    #   returning the pair handed to the distance; defaults to
    #   DbscanNumericVectors.new
    #
    # Raises KeyError when a required key is missing.
    def initialize(attrs = {})
      self.eps = attrs.fetch(:eps)
      self.points = attrs.fetch(:points)

      build_index_mapping

      vector_calculator = attrs[:vector_calculator] || DbscanNumericVectors.new
      distance = attrs.fetch(:distance)

      self.adjacency_matrix =
        calculate_adjacency_matrix(distance, points, vector_calculator)
    end

    # Returns a Set of all points whose distance to +p+ does not exceed eps.
    # The distance of a point to itself is stored as 0.0, so for a
    # non-negative eps the result includes +p+.
    def neighbors_of(p)
      i = index_mapping[p]
      Set.new.tap do |neighbors|
        # Walk every matrix column. index_mapping can hold fewer entries
        # than points when equal points collapse into one hash key, so its
        # size must not be used as the loop bound.
        points.size.times do |j|
          neighbors.add(points[j]) if adjacency_matrix.lookup(i, j) <= eps
        end
      end
    end

    private

    # Records the position of every point; for equal points the last
    # position wins.
    def build_index_mapping
      self.index_mapping = {}
      points.each_with_index do |p, i|
        index_mapping[p] = i
      end
      index_mapping
    end

    # Fills a SymmetricalMatrix with 0.0 on the diagonal and the distance of
    # each pair (i, j), i < j, computed from the vectors the calculator
    # returns for the two points.
    def calculate_adjacency_matrix(distance, points, vector_calculator)
      SymmetricalMatrix.new(points.size).tap do |m|
        points.size.times do |i|
          m.set(i, i, 0.0)
          ((i + 1)...points.size).each do |j|
            x, y = vector_calculator.vectors(points[i], points[j])
            m.set(i, j, distance.distance(x, y))
          end
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module SameSame
  # Vector calculator that hands the points' data through unchanged.
  class DbscanNumericVectors
    # Returns a two-element Array holding the +data+ of +p1+ and of +p2+,
    # in that order.
    def vectors(p1, p2)
      [p1, p2].map(&:data)
    end
  end
end
@@ -0,0 +1,7 @@
1
module SameSame
  # Vector calculator that delegates to TermFrequencyBuilder.
  class DbscanTermFrequencyVectors
    # Passes the +data+ of +p1+ and +p2+ (in that order) to
    # TermFrequencyBuilder.build_vectors and returns its result.
    def vectors(p1, p2)
      first_data = p1.data
      second_data = p2.data
      TermFrequencyBuilder.build_vectors(first_data, second_data)
    end
  end
end
@@ -0,0 +1,28 @@
1
module SameSame

  # Ordered list of clustering levels. Each level stores a name and a
  # snapshot of the clusters that existed when the level was added.
  class Dendrogram

    Level = Struct.new(:name, :clusters)

    attr_accessor :levels, :level_label

    # +name+ is stored as the label shared by all levels.
    def initialize(name)
      self.levels = []
      self.level_label = name
    end

    # Appends a level. Every cluster is dup'ed so that later changes to the
    # caller's cluster objects do not alter the recorded level.
    # Returns the levels Array.
    def add_level(name, clusters)
      levels << Level.new(name, clusters.map(&:dup))
    end

    # Returns the level at position +i+ (nil when there is none).
    def [](i)
      levels[i]
    end

    # True when the most recently added level has at least one cluster with
    # more than one element. False when no level has been added yet.
    def non_singleton_leaves?
      last_level = levels.last
      return false if last_level.nil?

      last_level.clusters.any? { |cluster| cluster.size > 1 }
    end

    # Original (misspelled) name, kept so existing callers keep working.
    alias_method :non_singelton_leaves?, :non_singleton_leaves?

  end

end
@@ -0,0 +1,74 @@
1
+ require 'colored'
2
+
3
module SameSame

  # Writes dendrogram levels and clusters to standard output. Colouring
  # relies on String#bold, #red and #cyan being available.
  class DendrogramPrinter

    # Prints the clusters of the last level of +dnd+.
    def print_last(dnd)
      print_clusters(dnd.levels.last.clusters)
    end

    # Prints every cluster that has more than one element: a "## name"
    # heading when the cluster has a name, followed by its datapoints.
    def print_clusters(clusters)
      clusters.each do |cluster|
        next unless cluster.size > 1

        puts "## #{cluster.name}" if cluster.name
        print_points(cluster.datapoints)
      end
    end

    # Prints every level of +dnd+: a header with the level name and the
    # cluster / ungrouped counts, then each cluster with more than one
    # element. After the last level the ungrouped datapoints are listed.
    def print(dnd)
      rule = "-" * 80
      final_index = dnd.levels.size - 1

      dnd.levels.each_with_index do |level, i|
        single_point_clusters = level.clusters.select { |cluster| cluster.size == 1 }
        ungrouped = single_point_clusters.map(&:datapoints).flatten
        cluster_count = level.clusters.size - single_point_clusters.size

        puts "",
             rule,
             "#{dnd.level_label}: #{level.name}",
             "",
             "Clusters: #{cluster_count}",
             "Ungrouped: #{ungrouped.size}",
             rule,
             ""

        level.clusters.select { |cluster| cluster.size > 1 }.each do |cluster|
          print_points(cluster.datapoints)
        end
        puts

        next unless i == final_index

        puts "FINAL UNGROUPED"
        print_points(ungrouped)
      end
    end

    # Splits +content+ on whitespace and re-joins it with single spaces.
    # Words whose downcased form is in +common_words+ are left as they are;
    # every other word is rendered bold red.
    def highlight_common(content, common_words)
      rendered = content.strip.split(/\s+/).map do |word|
        if common_words.include?(word.downcase)
          word
        else
          word.bold.red
        end
      end
      rendered.join(" ")
    end

    # Renders a datapoint id. A leading "<digits>:" prefix is shown in cyan
    # and excluded from the word highlighting.
    def formatted_datapoint_name(content, common_words)
      prefixed = /^(\d+:)(.*)/.match(content)
      return highlight_common(content, common_words) unless prefixed

      "#{prefixed[1].cyan} #{highlight_common(prefixed[2], common_words)}"
    end

    # Prints the ids of +datapoints+, framed by blank lines and sorted by id
    # with any leading "<digits>:" removed. The words shared by all ids are
    # passed on as the common words.
    def print_points(datapoints)
      puts
      all_terms = datapoints.map { |dp| dp.id.downcase.split(/\s+/) }
      # Start from every known word and intersect with each id's words.
      common_words = all_terms.inject(all_terms.flatten.uniq) { |shared, terms| shared & terms }
      ordered = datapoints.sort_by { |dp| dp.id.gsub(/^\d+:/, '') }
      ordered.each { |dp| puts formatted_datapoint_name(dp.id, common_words) }
      puts
    end
  end

end
@@ -0,0 +1,9 @@
1
module SameSame
  # Similarity of two collections as |intersection| / |union|.
  class JaquardCoefficient
    # Returns a Float: 0.0 when exactly one collection is empty, otherwise
    # the size of (x & y) divided by the size of (x | y).
    #
    # Raises ArgumentError when both collections are empty.
    def similarity(x, y)
      x_empty = x.empty?
      y_empty = y.empty?

      raise(ArgumentError, "both sets cannot be empty") if x_empty && y_empty
      return 0.0 if x_empty || y_empty

      shared = (x & y).size
      combined = (x | y).size
      shared.to_f / combined.to_f
    end
  end
end
@@ -0,0 +1,62 @@
1
module SameSame
  # Link counts between datapoints. Two datapoints are neighbours when
  # their similarity is at least +th+; the number of links between two
  # datapoints is the number of indices that are neighbours of both.
  class LinkMatrix
    attr_reader :links, :index_lookup

    # Expects :similarity_matrix (responding to lookup(x, y)), :datapoints
    # and :th. Raises KeyError when one of them is missing.
    def initialize(attrs = {})
      similarity_matrix, datapoints, th =
        [:similarity_matrix, :datapoints, :th].map { |key| attrs.fetch(key) }

      neighbours = calculate_neighbours(datapoints, similarity_matrix, th)

      @links = calculate_links(neighbours, datapoints)
      @index_lookup = calculate_index_lookup(datapoints)
    end

    # Sum of the link counts over every pair (p1, p2) with p1 taken from
    # +cluster1+ and p2 taken from +cluster2+.
    def count_links_between_clusters(cluster1, cluster2)
      cluster1.inject(0) do |total, p1|
        links_for_p1 = cluster2.inject(0) do |subtotal, p2|
          subtotal + number_of_links_between_points(p1, p2)
        end
        total + links_for_p1
      end
    end

    # Link count stored for the two datapoints, found through their indices.
    def number_of_links_between_points(datapoint1, datapoint2)
      row = index_lookup[datapoint1]
      column = index_lookup[datapoint2]
      links.lookup(row, column)
    end

    private

    # Maps each datapoint to its position in +datapoints+.
    def calculate_index_lookup(datapoints)
      index = {}
      datapoints.each_with_index { |point, position| index[point] = position }
      index
    end

    # Matrix whose cell (x, y) is produced by #number_of_links.
    def calculate_links(neighbours, datapoints)
      link_counter = ->(x, y) { number_of_links(neighbours, datapoints, x, y) }
      SymmetricalMatrix.new(neighbours.size, link_counter)
    end

    # Matrix whose cell (x, y) is 1 when the similarity of x and y reaches
    # the threshold and 0 otherwise.
    def calculate_neighbours(datapoints, similarity_matrix, th)
      neighbour_flag = ->(x, y) { similarity_matrix.lookup(x, y) >= th ? 1 : 0 }
      SymmetricalMatrix.new(datapoints.size, neighbour_flag)
    end

    #       0  1  2  3
    #     ------------
    # 0  |  Y  -  Y  -
    # 1  |  -  Y  -  -
    # 2  |  -  -  -  -
    # 3  |  -  Y  -  -
    #
    # Counts the indices i for which both (x, i) and (i, y) are flagged as
    # neighbours; the 0/1 flags are multiplied and the products summed.
    def number_of_links(neighbors, datapoints, x, y)
      datapoints.size.times.inject(0) do |total, i|
        total + neighbors.lookup(x, i) * neighbors.lookup(i, y)
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module SameSame

  # Goodness of merging two clusters: the number of links between them
  # divided by a normalising term derived from the two cluster sizes.
  class MergeGoodnessMeasure
    attr_reader :th, :p

    # th should be >= 0 and <= 1
    # 0 means all datapoints are neighbours
    # 1 means no datapoints are neighbours
    # (proximity)
    #
    # The exponent used by #g is p = 1 + 2 * f(th).
    def initialize( th )
      @th = th
      @p = 1.0 + 2.0 * f( th )
    end

    # Returns number_of_links / ((size_x + size_y)^p - size_x^p - size_y^p).
    #
    # * number_of_links - links counted between the two clusters
    # * size_x, size_y  - number of points in each cluster
    #
    # The result does not depend on the order in which the two sizes are
    # given.
    def g(number_of_links, size_x, size_y)
      combined = (size_x + size_y) ** p
      own_x = size_x ** p
      # Each cluster contributes its own term; the second one must be
      # built from size_y, not from size_x again.
      own_y = size_y ** p

      number_of_links / (combined - own_x - own_y)
    end

    private

    # f(th) = (1 - th) / (1 + th)
    def f( th )
      (1.0 - th) / (1.0 + th)
    end
  end

end
@@ -0,0 +1,51 @@
1
+ require 'same_same/dendrogram'
2
+ require 'same_same/link_matrix'
3
+ require 'same_same/merge_goodness_measure'
4
+ require 'same_same/rock_clusters'
5
+ require 'same_same/similarity_matrix'
6
+
7
module SameSame
  # Drives ROCK clustering: starts with one cluster per datapoint and keeps
  # merging until at most +k+ clusters remain or a merge attempt leaves the
  # cluster count unchanged.
  class RockAlgorithm
    attr_accessor :datapoints, :similarity_measure, :k, :th, :link_matrix, :similarity_matrix

    # Expects :datapoints, :k and :th (KeyError when missing) and an
    # optional :similarity_measure, which defaults to JaquardCoefficient.new.
    # Builds the similarity matrix from the datapoints' +data+ and the link
    # matrix from that.
    def initialize( attrs = {} )
      self.datapoints = attrs.fetch( :datapoints )
      self.similarity_measure = attrs[ :similarity_measure ] || JaquardCoefficient.new
      self.k = attrs.fetch( :k )
      self.th = attrs.fetch( :th )

      point_data = datapoints.map(&:data)
      self.similarity_matrix = SimilarityMatrix.new( similarity_measure, point_data )
      self.link_matrix = LinkMatrix.new(
        datapoints: datapoints,
        similarity_matrix: similarity_matrix,
        th: th
      )
    end

    # Runs the clustering and returns a Dendrogram labelled "Goodness".
    # The first level is named after Float::INFINITY and holds the initial
    # one-point clusters; every successful merge adds a level named after
    # the value returned by the merge.
    def cluster
      dendrogram = Dendrogram.new( "Goodness" )
      seeds = one_point_per_cluster
      dendrogram.add_level( Float::INFINITY.to_s, seeds )

      measure = MergeGoodnessMeasure.new( th )
      merger = RockClusters.new(
        link_matrix: link_matrix,
        clusters: seeds,
        goodness_measure: measure
      )

      remaining = merger.size
      while remaining > k
        before_merge = remaining
        merged_goodness = merger.merge_best_candidates
        remaining = merger.size

        # finish if there are no linked clusters to merge
        break if remaining == before_merge

        dendrogram.add_level( merged_goodness.to_s, merger.clusters )
      end

      dendrogram
    end

    # Returns an Array with a new single-point Cluster for every datapoint.
    def one_point_per_cluster
      datapoints.map { |point| Cluster.new( [point] ) }
    end
  end
end
@@ -0,0 +1,68 @@
1
+ require 'same_same/cluster_similarity'
2
+
3
module SameSame
  # Working set of clusters for ROCK. Clusters are stored under integer
  # keys; for every cluster the first-sorted linked neighbour is cached in
  # +closest_clusters+ and recomputed after each merge.
  class RockClusters
    attr_accessor :link_matrix, :clusters, :goodness_measure, :cluster_map, :closest_clusters

    # Expects :link_matrix and :goodness_measure (KeyError when missing) and
    # :clusters, an enumerable of the initial clusters.
    def initialize( attrs = {} )
      self.link_matrix = attrs.fetch(:link_matrix)
      self.goodness_measure = attrs.fetch(:goodness_measure)
      self.cluster_map = {}
      @last_key = -1

      attrs[:clusters].each { |initial| add_cluster(initial) }
      calculate_closest_clusters
    end

    # Merges the pair chosen by #find_most_similar_pair and returns the
    # goodness recorded for it. Returns nil, without merging, when no
    # cluster has a linked neighbour.
    def merge_best_candidates
      key1, similarity = find_most_similar_pair
      return unless key1

      merge_clusters(key1, similarity.cluster_key)
      similarity.goodness
    end

    # Returns [cluster_key, similarity] for the first entry of
    # +closest_clusters+ when ordered by ascending goodness, or [] when
    # there are no entries.
    #
    # NOTE(review): the ascending order means the entry with the LOWEST
    # goodness is returned - confirm this is the intended choice.
    def find_most_similar_pair
      ranked = closest_clusters.sort_by { |_, similarity| similarity.goodness }
      ranked.first || []
    end

    # Number of clusters currently held.
    def size
      cluster_map.size
    end

    # The clusters currently held, as an Array.
    def clusters
      cluster_map.values
    end

    # Stores +c+ under a fresh key and returns +c+.
    def add_cluster( c )
      cluster_map[next_key] = c
    end

    # Advances the key counter and returns the new key.
    def next_key
      @last_key += 1
    end

    # Rebuilds +closest_clusters+. For each cluster, every other cluster
    # with at least one link becomes a ClusterSimilarity candidate; the
    # candidate that sorts first is stored under the cluster's key.
    # Clusters without any linked neighbour get no entry.
    def calculate_closest_clusters
      self.closest_clusters = {}
      cluster_map.each do |cluster_key, cluster|
        candidates = cluster_map.map do |other_key, other_cluster|
          next if cluster_key == other_key

          link_count = link_matrix.count_links_between_clusters( cluster, other_cluster )
          next unless link_count > 0

          merge_goodness = goodness_measure.g( link_count, cluster.size, other_cluster.size )
          ClusterSimilarity.new( other_key, merge_goodness )
        end

        best = candidates.compact.sort.first
        closest_clusters[cluster_key] = best if best
      end
    end

    # Removes the clusters stored under +key1+ and +key2+, stores their
    # concatenation (via +) under a fresh key and refreshes the
    # closest-cluster cache. Returns the merged cluster.
    def merge_clusters(key1, key2)
      first_cluster = cluster_map.delete(key1)
      second_cluster = cluster_map.delete(key2)
      merged = add_cluster( first_cluster + second_cluster )
      calculate_closest_clusters
      merged
    end

  end
end