same_same 0.0.1

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (43)
  1. data/.gitignore +18 -0
  2. data/Gemfile +4 -0
  3. data/Gemfile.lock +44 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +39 -0
  6. data/Rakefile +1 -0
  7. data/examples/dbscan_digg.rb +25 -0
  8. data/examples/dbscan_lines.rb +35 -0
  9. data/examples/rock_digg.rb +20 -0
  10. data/examples/rock_lines.rb +31 -0
  11. data/lib/same_same.rb +15 -0
  12. data/lib/same_same/cluster.rb +27 -0
  13. data/lib/same_same/cluster_similarity.rb +10 -0
  14. data/lib/same_same/cosine_distance.rb +27 -0
  15. data/lib/same_same/cosine_similarity.rb +22 -0
  16. data/lib/same_same/data_point.rb +12 -0
  17. data/lib/same_same/dbscan_algorithm.rb +135 -0
  18. data/lib/same_same/dbscan_clusters.rb +88 -0
  19. data/lib/same_same/dbscan_neighborhood.rb +68 -0
  20. data/lib/same_same/dbscan_numeric_vectors.rb +7 -0
  21. data/lib/same_same/dbscan_term_frequency_vectors.rb +7 -0
  22. data/lib/same_same/dendrogram.rb +28 -0
  23. data/lib/same_same/dendrogram_printer.rb +74 -0
  24. data/lib/same_same/jaquard_coefficient.rb +9 -0
  25. data/lib/same_same/link_matrix.rb +62 -0
  26. data/lib/same_same/merge_goodness_measure.rb +30 -0
  27. data/lib/same_same/rock_algorithm.rb +51 -0
  28. data/lib/same_same/rock_clusters.rb +68 -0
  29. data/lib/same_same/similarity_matrix.rb +20 -0
  30. data/lib/same_same/symmetrical_matrix.rb +39 -0
  31. data/lib/same_same/term_frequency_builder.rb +20 -0
  32. data/lib/same_same/version.rb +3 -0
  33. data/same_same.gemspec +23 -0
  34. data/spec/fixtures/digg_stories.csv +49 -0
  35. data/spec/fixtures/lines.csv +899 -0
  36. data/spec/same_same/dbscan_algorithm_spec.rb +72 -0
  37. data/spec/same_same/jaquard_coefficient_spec.rb +24 -0
  38. data/spec/same_same/link_matrix_spec.rb +29 -0
  39. data/spec/same_same/merge_goodness_measure_spec.rb +34 -0
  40. data/spec/same_same/rock_algorithm_spec.rb +71 -0
  41. data/spec/same_same/similarity_matrix_spec.rb +20 -0
  42. data/spec/same_same/symmetrical_matrix_spec.rb +69 -0
  43. metadata +144 -0
data/lib/same_same/dbscan_clusters.rb
@@ -0,0 +1,88 @@
+ require 'set'
+
+ module SameSame
+   class DbscanClusters
+
+     # Identifies a set of Noise points.
+     NOISE_ID = -1
+
+     # Identifies a set of Unclassified points.
+     UNCLASSIFIED_ID = 0
+
+
+     attr_accessor :clusters, :last_id
+
+     def initialize( unclassified )
+       self.clusters = Hash.new {|hash, key|
+         hash[key] = Set.new
+       }
+       self.last_id = 0
+       assign_points( unclassified, UNCLASSIFIED_ID )
+     end
+
+     def assign_to_noise( p )
+       assign_point( p, NOISE_ID)
+     end
+
+     def unclassified?(p)
+       point_in_cluster?(p, UNCLASSIFIED_ID)
+     end
+
+     def noise?(p)
+       point_in_cluster?(p, NOISE_ID)
+     end
+
+     def point_in_cluster?( p, cluster_id)
+       clusters[cluster_id].include?( p )
+     end
+
+     def assign_points(points, cluster_id)
+       points.each {|p| assign_point( p, cluster_id)}
+     end
+
+     def assign_point( p, cluster_id)
+       # Remove point from the group that it currently belongs to...
+       if noise?(p)
+         remove_point_from_cluster(p, NOISE_ID)
+       elsif unclassified?(p)
+         remove_point_from_cluster(p, UNCLASSIFIED_ID)
+       else
+         if cluster_id != UNCLASSIFIED_ID
+           raise ArgumentError.new("Trying to move point that has already been assigned to some other cluster. Point: #{p}, cluster_id=#{cluster_id}")
+         end
+       end
+
+       clusters[cluster_id] << p
+     end
+
+     def to_clusters
+       [].tap do |all_clusters|
+         clusters.each do |id, points|
+           all_clusters << Cluster.new(points, cluster_name(id)) unless points.empty?
+         end
+       end
+     end
+
+
+     def cluster_name(id)
+       case id
+       when NOISE_ID then "Noise"
+       when UNCLASSIFIED_ID then "Unclassified"
+       else "Cluster #{id}"
+       end
+     end
+
+     def remove_point_from_cluster(p, cluster_id)
+       cluster = clusters[cluster_id]
+
+       return false if cluster.nil?
+       cluster.include?(p).tap do
+         cluster.delete p
+       end
+     end
+
+     def get_next_cluster_id
+       self.last_id = last_id + 1
+     end
+   end
+ end
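
For orientation (not part of the package files), DbscanClusters keeps a Hash of cluster id => Set of points and moves points between the Unclassified, Noise, and numbered buckets. A minimal lifecycle sketch, assuming points are plain hashable Ruby objects and that Cluster (see cluster.rb) exposes a #name reader as the printer code suggests:

    clusters = SameSame::DbscanClusters.new(%w[a b c])
    clusters.unclassified?("a")           # => true, nothing has been classified yet
    id = clusters.get_next_cluster_id     # => 1
    clusters.assign_points(%w[a b], id)   # move "a" and "b" into cluster 1
    clusters.assign_to_noise("c")         # mark "c" as noise
    clusters.to_clusters.map(&:name)      # => ["Cluster 1", "Noise"] (order not guaranteed)
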
data/lib/same_same/dbscan_neighborhood.rb
@@ -0,0 +1,68 @@
+ module SameSame
+   class DbscanNeighborhood
+     # Contains distances between points.
+     attr_accessor :adjacency_matrix
+
+     # Threshold value. Determines which points will be considered as
+     # neighbors. Two points are neighbors if the distance between them does
+     # not exceed the threshold value.
+     attr_accessor :eps
+
+     # Used to cache the index of each point in the matrix.
+     attr_accessor :index_mapping
+
+     attr_accessor :points
+
+     # Initializes the neighborhood with all data that it needs.
+     #
+     # * points   - points to cluster
+     # * eps      - distance threshold value
+     # * distance - distance measure responding to #distance(x, y)
+     def initialize(attrs = {})
+       self.eps = attrs.fetch(:eps)
+       self.points = attrs.fetch(:points)
+
+       build_index_mapping
+
+       vector_calculator = attrs[:vector_calculator] || DbscanNumericVectors.new
+       distance = attrs.fetch( :distance )
+       use_term_frequencies = attrs[:use_term_frequencies] || false
+
+       self.adjacency_matrix =
+         calculate_adjacency_matrix(distance, points, vector_calculator)
+     end
+
+     def neighbors_of( p )
+       Set.new.tap do |neighbors|
+         i = index_mapping[p]
+         (0..index_mapping.size-1).each do |j|
+           neighbors.add(points[j]) if adjacency_matrix.lookup(i,j) <= eps
+         end
+       end
+     end
+
+     private
+
+     def build_index_mapping
+       self.index_mapping = {}
+       points.each_with_index do |p,i|
+         index_mapping[p] = i
+       end
+       index_mapping
+     end
+
+     def calculate_adjacency_matrix(distance, points, vector_calculator)
+       SymmetricalMatrix.new( points.size ).tap do |m|
+         (0..points.size - 1).each do |i|
+           m.set(i,i, 0.0)
+           ((i+1)..(points.size - 1)).each do |j|
+             x, y = vector_calculator.vectors( points[i], points[j] )
+             d = distance.distance(x, y)
+             m.set(i,j,d)
+           end
+         end
+       end
+     end
+
+   end
+ end
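
A minimal sketch of building a neighborhood (not taken from the gem's examples); DataPoint.new(id, data) is an assumed constructor signature, and CosineDistance, which ships with the gem, is assumed to respond to #distance(x, y) exactly as calculate_adjacency_matrix calls it:

    points = [
      SameSame::DataPoint.new("p1", [1.0, 0.0]),
      SameSame::DataPoint.new("p2", [0.9, 0.1]),
      SameSame::DataPoint.new("p3", [0.0, 1.0])
    ]

    neighborhood = SameSame::DbscanNeighborhood.new(
      points:   points,
      eps:      0.2,
      distance: SameSame::CosineDistance.new
    )

    # Every point within eps of p1, including p1 itself (the diagonal is 0.0).
    neighborhood.neighbors_of(points.first)
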
data/lib/same_same/dbscan_numeric_vectors.rb
@@ -0,0 +1,7 @@
+ module SameSame
+   class DbscanNumericVectors
+     def vectors( p1, p2 )
+       [p1.data, p2.data]
+     end
+   end
+ end
data/lib/same_same/dbscan_term_frequency_vectors.rb
@@ -0,0 +1,7 @@
+ module SameSame
+   class DbscanTermFrequencyVectors
+     def vectors(p1, p2)
+       TermFrequencyBuilder.build_vectors( p1.data, p2.data )
+     end
+   end
+ end
data/lib/same_same/dendrogram.rb
@@ -0,0 +1,28 @@
+ module SameSame
+
+   class Dendrogram
+
+     Level = Struct.new(:name, :clusters)
+
+     attr_accessor :levels, :level_label
+
+     def initialize(name)
+       self.levels = []
+       self.level_label = name
+     end
+
+     def add_level( name, clusters )
+       self.levels << Level.new(name, clusters.map(&:dup))
+     end
+
+     def [](i)
+       levels[i]
+     end
+
+     def non_singelton_leaves?
+       levels.last.clusters.any? {|cluster| cluster.size > 1}
+     end
+
+   end
+
+ end
data/lib/same_same/dendrogram_printer.rb
@@ -0,0 +1,74 @@
+ require 'colored'
+
+ module SameSame
+
+   class DendrogramPrinter
+
+     def print_last(dnd)
+       level = dnd.levels.last
+       print_clusters( level.clusters )
+     end
+
+     def print_clusters(clusters)
+       clusters.each do |cluster|
+         if cluster.size > 1
+           puts "## #{cluster.name}" if cluster.name
+           print_points( cluster.datapoints )
+         end
+       end
+     end
+
+     def print(dnd)
+       dnd.levels.each_with_index do |level, i|
+         single_point_clusters = level.clusters.select {|cluster| cluster.size == 1}
+         ungrouped = single_point_clusters.map {|c| c.datapoints}.flatten
+
+         puts
+         puts "-" * 80
+         puts "#{dnd.level_label}: #{level.name}"
+         puts
+         puts "Clusters: #{level.clusters.size - single_point_clusters.size}"
+         puts "Ungrouped: #{ungrouped.size}"
+         puts "-" * 80
+         puts
+
+         level.clusters.each do |cluster|
+           if cluster.size > 1
+             print_points( cluster.datapoints )
+           end
+         end
+         puts
+
+
+         if i == dnd.levels.size - 1
+           puts "FINAL UNGROUPED"
+           print_points(ungrouped)
+         end
+       end
+     end
+
+     def highlight_common( content, common_words )
+       words = content.strip.split(/\s+/)
+       words.map {|word| common_words.include?(word.downcase) ? word : word.bold.red}.join(" ")
+     end
+
+     def formatted_datapoint_name( content, common_words )
+       if content =~ /^(\d+:)(.*)/
+         "#{$1.cyan} #{highlight_common( $2, common_words) }"
+       else
+         highlight_common( content, common_words )
+       end
+     end
+
+     def print_points(datapoints)
+       puts
+       all_terms = datapoints.map(&:id).map(&:downcase).map {|id| id.split(/\s+/)}
+       common_words = all_terms.inject(all_terms.flatten.uniq) {|m,v| m & v}
+       datapoints.sort_by {|dp| dp.id.gsub(/^\d+:/, '')}.each do |dp|
+         puts formatted_datapoint_name( dp.id, common_words )
+       end
+       puts
+     end
+   end
+
+ end
data/lib/same_same/jaquard_coefficient.rb
@@ -0,0 +1,9 @@
+ module SameSame
+   class JaquardCoefficient
+     def similarity( x, y )
+       raise(ArgumentError, "both sets cannot be empty") if x.empty? && y.empty?
+       return 0.0 if x.empty? || y.empty?
+       (x & y).size.to_f / (x | y).size.to_f
+     end
+   end
+ end
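
A quick worked example of the coefficient above: for %w[a b c] and %w[b c d] the intersection {b, c} has 2 elements and the union {a, b, c, d} has 4, so the similarity is 0.5.

    SameSame::JaquardCoefficient.new.similarity(%w[a b c], %w[b c d]) # => 0.5
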
data/lib/same_same/link_matrix.rb
@@ -0,0 +1,62 @@
+ module SameSame
+   class LinkMatrix
+     attr_reader :links, :index_lookup
+
+     def initialize( attrs = {} )
+       similarity_matrix = attrs.fetch(:similarity_matrix)
+       datapoints = attrs.fetch(:datapoints)
+       th = attrs.fetch(:th)
+       neighbours = calculate_neighbours( datapoints, similarity_matrix, th )
+
+       @links = calculate_links( neighbours, datapoints )
+       @index_lookup = calculate_index_lookup( datapoints )
+     end
+
+     def count_links_between_clusters( cluster1, cluster2 )
+       cluster1.inject(0) do |sum, p1|
+         cluster2.inject(sum) do |sum2, p2|
+           sum2 + number_of_links_between_points(p1, p2)
+         end
+       end
+     end
+
+     def number_of_links_between_points( datapoint1, datapoint2 )
+       links.lookup( index_lookup[datapoint1], index_lookup[datapoint2] )
+     end
+
+     private
+
+     def calculate_index_lookup( datapoints )
+       {}.tap do |index|
+         datapoints.each_with_index {|p, i| index[p] = i}
+       end
+     end
+
+     def calculate_links( neighbours, datapoints )
+       SymmetricalMatrix.new(
+         neighbours.size,
+         ->(x,y) {number_of_links(neighbours, datapoints, x, y)}
+       )
+     end
+
+     def calculate_neighbours( datapoints, similarity_matrix, th )
+       SymmetricalMatrix.new(
+         datapoints.size,
+         ->(x,y) {similarity_matrix.lookup(x,y) >= th ? 1 : 0}
+       )
+     end
+
+     #     0 1 2 3
+     #   ------------
+     # 0 | Y - Y -
+     # 1 | - Y - -
+     # 2 | - - - -
+     # 3 | - Y - -
+     #
+     def number_of_links(neighbors, datapoints, x, y)
+       (0..datapoints.size-1).map do |i|
+         neighbors.lookup(x,i) * neighbors.lookup(i,y)
+       end.inject(0) {|m,v| m+v}
+     end
+   end
+ end
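
In other words, number_of_links counts the common neighbours of points x and y: with A the 0/1 neighbour matrix built from the similarity threshold th, the link matrix computed above is

$$L_{xy} = \sum_{i} A_{xi} A_{iy} = (A^2)_{xy}.$$
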
data/lib/same_same/merge_goodness_measure.rb
@@ -0,0 +1,30 @@
+ module SameSame
+
+   class MergeGoodnessMeasure
+     attr_reader :th, :p
+
+     # th should be >= 0 and <= 1
+     # 0 means all datapoints are neighbours
+     # 1 means no datapoints are neighbours
+     # (proximity)
+     def initialize( th )
+       @th = th
+       @p = 1.0 + 2.0 * f( th )
+     end
+
+     def g(number_of_links, size_x, size_y)
+       a = (size_x + size_y) ** p
+       b = size_x ** p
+       c = size_y ** p
+
+       number_of_links / (a - b - c)
+     end
+
+     private
+
+     def f( th )
+       (1.0 - th) / (1.0 + th)
+     end
+   end
+
+ end
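
For reference, g follows the merge goodness measure from the ROCK clustering paper (Guha, Rastogi & Shim): with $f(\theta) = \frac{1 - \theta}{1 + \theta}$ and cluster sizes $n_x$ and $n_y$,

$$g(C_x, C_y) = \frac{\mathrm{links}(C_x, C_y)}{(n_x + n_y)^{1 + 2f(\theta)} - n_x^{1 + 2f(\theta)} - n_y^{1 + 2f(\theta)}}.$$
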
data/lib/same_same/rock_algorithm.rb
@@ -0,0 +1,51 @@
+ require 'same_same/dendrogram'
+ require 'same_same/link_matrix'
+ require 'same_same/merge_goodness_measure'
+ require 'same_same/rock_clusters'
+ require 'same_same/similarity_matrix'
+
+ module SameSame
+   class RockAlgorithm
+     attr_accessor :datapoints, :similarity_measure, :k, :th, :link_matrix, :similarity_matrix
+
+     def initialize( attrs = {} )
+       self.datapoints = attrs.fetch( :datapoints )
+       self.similarity_measure = attrs[ :similarity_measure ] || JaquardCoefficient.new
+       self.k = attrs.fetch(:k)
+       self.th = attrs.fetch(:th)
+
+       self.similarity_matrix = SimilarityMatrix.new( similarity_measure, datapoints.map {|d| d.data} )
+       self.link_matrix = LinkMatrix.new( datapoints: datapoints, similarity_matrix: similarity_matrix, th: th)
+     end
+
+     def cluster
+       Dendrogram.new( "Goodness" ).tap do |dnd|
+         initial_clusters = one_point_per_cluster
+         g = Float::INFINITY
+         dnd.add_level(g.to_s, initial_clusters)
+         goodness = MergeGoodnessMeasure.new( th )
+
+         rock_clusters = RockClusters.new(
+           link_matrix: link_matrix,
+           clusters: initial_clusters,
+           goodness_measure: goodness)
+
+         number_of_clusters = rock_clusters.size
+         while number_of_clusters > k do
+           number_of_clusters_before_merge = number_of_clusters
+           g = rock_clusters.merge_best_candidates
+           number_of_clusters = rock_clusters.size
+
+           # finish if there are no linked clusters to merge
+           break if number_of_clusters == number_of_clusters_before_merge
+
+           dnd.add_level(g.to_s, rock_clusters.clusters)
+         end
+       end
+     end
+
+     def one_point_per_cluster
+       datapoints.map {|point| Cluster.new([point])}
+     end
+   end
+ end
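
A usage sketch with made-up data (the bundled examples/rock_digg.rb and examples/rock_lines.rb are not shown in this diff). DataPoint.new(id, data) and a top-level require 'same_same' that loads the individual files are assumptions here:

    require 'same_same'

    stories = [
      SameSame::DataPoint.new("1: ruby clustering gem",    %w[ruby clustering gem]),
      SameSame::DataPoint.new("2: clustering gem in ruby", %w[clustering gem in ruby]),
      SameSame::DataPoint.new("3: cooking pasta at home",  %w[cooking pasta at home])
    ]

    rock = SameSame::RockAlgorithm.new(datapoints: stories, k: 2, th: 0.2)
    dendrogram = rock.cluster                     # one Dendrogram level per merge
    SameSame::DendrogramPrinter.new.print(dendrogram)
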
data/lib/same_same/rock_clusters.rb
@@ -0,0 +1,68 @@
+ require 'same_same/cluster_similarity'
+
+ module SameSame
+   class RockClusters
+     attr_accessor :link_matrix, :clusters, :goodness_measure, :cluster_map, :closest_clusters
+
+     def initialize( attrs = {} )
+       self.link_matrix = attrs.fetch(:link_matrix)
+       self.goodness_measure = attrs.fetch(:goodness_measure)
+       self.cluster_map = {}
+       @last_key = -1
+
+       attrs[:clusters].each {|c| add_cluster(c) }
+       calculate_closest_clusters
+     end
+
+     def merge_best_candidates
+       key1, similarity = find_most_similar_pair
+       if key1
+         merge_clusters key1, similarity.cluster_key
+         similarity.goodness
+       end
+     end
+
+     def find_most_similar_pair
+       closest_clusters.sort_by {|_, similarity| similarity.goodness}.first || []
+     end
+
+     def size
+       cluster_map.size
+     end
+
+     def clusters
+       cluster_map.values
+     end
+
+     def add_cluster( c )
+       cluster_map[next_key] = c
+     end
+
+     def next_key
+       @last_key = @last_key + 1
+     end
+
+     def calculate_closest_clusters
+       self.closest_clusters = {}
+       cluster_map.each do |cluster_key, cluster|
+         similarity = cluster_map.map do |other_key, other_cluster|
+           if cluster_key != other_key
+             number_of_links = link_matrix.count_links_between_clusters( cluster, other_cluster )
+             if number_of_links > 0
+               goodness = goodness_measure.g( number_of_links, cluster.size, other_cluster.size)
+               ClusterSimilarity.new( other_key, goodness )
+             end
+           end
+         end.compact.sort.first
+         closest_clusters[cluster_key] = similarity if similarity
+       end
+     end
+
+     def merge_clusters(key1, key2)
+       merged_key = add_cluster( cluster_map.delete(key1) + cluster_map.delete(key2) )
+       calculate_closest_clusters
+       merged_key
+     end
+
+   end
+ end