same_same 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +44 -0
- data/LICENSE.txt +22 -0
- data/README.md +39 -0
- data/Rakefile +1 -0
- data/examples/dbscan_digg.rb +25 -0
- data/examples/dbscan_lines.rb +35 -0
- data/examples/rock_digg.rb +20 -0
- data/examples/rock_lines.rb +31 -0
- data/lib/same_same.rb +15 -0
- data/lib/same_same/cluster.rb +27 -0
- data/lib/same_same/cluster_similarity.rb +10 -0
- data/lib/same_same/cosine_distance.rb +27 -0
- data/lib/same_same/cosine_similarity.rb +22 -0
- data/lib/same_same/data_point.rb +12 -0
- data/lib/same_same/dbscan_algorithm.rb +135 -0
- data/lib/same_same/dbscan_clusters.rb +88 -0
- data/lib/same_same/dbscan_neighborhood.rb +68 -0
- data/lib/same_same/dbscan_numeric_vectors.rb +7 -0
- data/lib/same_same/dbscan_term_frequency_vectors.rb +7 -0
- data/lib/same_same/dendrogram.rb +28 -0
- data/lib/same_same/dendrogram_printer.rb +74 -0
- data/lib/same_same/jaquard_coefficient.rb +9 -0
- data/lib/same_same/link_matrix.rb +62 -0
- data/lib/same_same/merge_goodness_measure.rb +30 -0
- data/lib/same_same/rock_algorithm.rb +51 -0
- data/lib/same_same/rock_clusters.rb +68 -0
- data/lib/same_same/similarity_matrix.rb +20 -0
- data/lib/same_same/symmetrical_matrix.rb +39 -0
- data/lib/same_same/term_frequency_builder.rb +20 -0
- data/lib/same_same/version.rb +3 -0
- data/same_same.gemspec +23 -0
- data/spec/fixtures/digg_stories.csv +49 -0
- data/spec/fixtures/lines.csv +899 -0
- data/spec/same_same/dbscan_algorithm_spec.rb +72 -0
- data/spec/same_same/jaquard_coefficient_spec.rb +24 -0
- data/spec/same_same/link_matrix_spec.rb +29 -0
- data/spec/same_same/merge_goodness_measure_spec.rb +34 -0
- data/spec/same_same/rock_algorithm_spec.rb +71 -0
- data/spec/same_same/similarity_matrix_spec.rb +20 -0
- data/spec/same_same/symmetrical_matrix_spec.rb +69 -0
- metadata +144 -0
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module SameSame
|
4
|
+
# Bookkeeping of point membership for DBSCAN: every point lives in a Set
# stored in +clusters+, keyed by cluster id. Two ids are reserved
# (NOISE_ID and UNCLASSIFIED_ID); other ids are handed out by
# #get_next_cluster_id.
class DbscanClusters
  # Identifies a set of Noise points.
  NOISE_ID = -1

  # Identifies a set of Unclassified points.
  UNCLASSIFIED_ID = 0

  # clusters - Hash of cluster id => Set of points (Sets are created on
  #            first write through the Hash default block).
  # last_id  - last id returned by #get_next_cluster_id.
  attr_accessor :clusters, :last_id

  # unclassified - enumerable of points; all of them start out in the
  #                UNCLASSIFIED_ID set.
  def initialize(unclassified)
    self.clusters = Hash.new { |hash, key| hash[key] = Set.new }
    self.last_id = 0
    assign_points(unclassified, UNCLASSIFIED_ID)
  end

  # Moves +p+ into the noise set (same rules as #assign_point).
  def assign_to_noise(p)
    assign_point(p, NOISE_ID)
  end

  # True when +p+ is currently in the unclassified set.
  def unclassified?(p)
    point_in_cluster?(p, UNCLASSIFIED_ID)
  end

  # True when +p+ is currently in the noise set.
  def noise?(p)
    point_in_cluster?(p, NOISE_ID)
  end

  # True when +p+ is a member of the set stored under +cluster_id+.
  # This is a pure query: it checks for the key first so that asking about
  # an id that was never written does not trigger the Hash default block
  # and leave an empty Set behind.
  def point_in_cluster?(p, cluster_id)
    clusters.key?(cluster_id) && clusters[cluster_id].include?(p)
  end

  # Assigns every point of +points+ to +cluster_id+ via #assign_point.
  def assign_points(points, cluster_id)
    points.each { |p| assign_point(p, cluster_id) }
  end

  # Adds +p+ to the set for +cluster_id+ and returns that set.
  #
  # A point found in the noise or unclassified set is removed from it first.
  # A point found in neither may only be assigned to UNCLASSIFIED_ID; any
  # other target raises ArgumentError. Note that in that last case the point
  # is not removed from a numbered cluster it may already belong to.
  def assign_point(p, cluster_id)
    if noise?(p)
      remove_point_from_cluster(p, NOISE_ID)
    elsif unclassified?(p)
      remove_point_from_cluster(p, UNCLASSIFIED_ID)
    elsif cluster_id != UNCLASSIFIED_ID
      raise ArgumentError, "Trying to move point that has already been assigned to some other cluster. Point: #{p}, cluster_id=#{cluster_id}"
    end

    clusters[cluster_id] << p
  end

  # Returns an Array with one Cluster per non-empty set, named through
  # #cluster_name.
  def to_clusters
    clusters.each_with_object([]) do |(id, points), all_clusters|
      all_clusters << Cluster.new(points, cluster_name(id)) unless points.empty?
    end
  end

  # Human readable label for a cluster id.
  def cluster_name(id)
    case id
    when NOISE_ID then "Noise"
    when UNCLASSIFIED_ID then "Unclassified"
    else "Cluster #{id}"
    end
  end

  # Removes +p+ from the set stored under +cluster_id+.
  # Returns true when the point was present (and has been removed),
  # false otherwise, including when no set exists for that id.
  def remove_point_from_cluster(p, cluster_id)
    return false unless clusters.key?(cluster_id)

    # Set#delete? returns nil when the element was not in the set.
    !clusters[cluster_id].delete?(p).nil?
  end

  # Increments +last_id+ and returns the new value.
  def get_next_cluster_id
    self.last_id = last_id + 1
  end
end
|
88
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module SameSame
|
2
|
+
# Precomputes pairwise distances between points and answers
# "which points lie within eps of p?" queries.
class DbscanNeighborhood
  # Contains distances between points.
  attr_accessor :adjacency_matrix

  # Threshold value. Determines which points will be considered as
  # neighbors. Two points are neighbors if the distance between them does
  # not exceed threshold value.
  attr_accessor :eps

  # used to cache index of points in the matrix
  attr_accessor :index_mapping

  attr_accessor :points

  # Builds the index mapping and the distance matrix.
  #
  # attrs:
  # * :points            - points to cluster (required)
  # * :eps               - distance threshold value (required)
  # * :distance          - object responding to distance(x, y) (required)
  # * :vector_calculator - object responding to vectors(p1, p2) and returning
  #                        the pair handed to the distance; defaults to
  #                        DbscanNumericVectors.new
  #
  # Raises KeyError when a required attribute is missing.
  def initialize(attrs = {})
    self.eps = attrs.fetch(:eps)
    self.points = attrs.fetch(:points)

    build_index_mapping

    vector_calculator = attrs[:vector_calculator] || DbscanNumericVectors.new
    distance = attrs.fetch(:distance)

    self.adjacency_matrix =
      calculate_adjacency_matrix(distance, points, vector_calculator)
  end

  # Returns a Set of all points whose distance to +p+ is <= eps. Since the
  # diagonal of the matrix is 0.0, +p+ itself is included whenever eps >= 0.
  #
  # Raises ArgumentError when +p+ is not one of +points+.
  def neighbors_of(p)
    i = index_mapping.fetch(p) { raise ArgumentError, "Unknown point: #{p}" }

    Set.new.tap do |neighbors|
      # Walk the points themselves rather than 0...index_mapping.size: equal
      # points collapse to a single mapping entry, so the mapping can be
      # shorter than the matrix and trailing points would be skipped.
      points.each_with_index do |candidate, j|
        neighbors.add(candidate) if adjacency_matrix.lookup(i, j) <= eps
      end
    end
  end

  private

  # Fills +index_mapping+ with point => position in +points+. For points
  # that are equal as Hash keys the last position wins.
  def build_index_mapping
    self.index_mapping = {}
    points.each_with_index do |p, i|
      index_mapping[p] = i
    end
    index_mapping
  end

  # Returns a SymmetricalMatrix holding 0.0 on the diagonal and, for every
  # pair i < j, the distance between the vectors produced for the two points.
  def calculate_adjacency_matrix(distance, points, vector_calculator)
    SymmetricalMatrix.new(points.size).tap do |m|
      points.each_index do |i|
        m.set(i, i, 0.0)
        ((i + 1)...points.size).each do |j|
          x, y = vector_calculator.vectors(points[i], points[j])
          m.set(i, j, distance.distance(x, y))
        end
      end
    end
  end
end
|
68
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module SameSame
|
2
|
+
|
3
|
+
# Ordered list of clustering snapshots ("levels"). Each level pairs a name
# with the clusters that existed at that point.
class Dendrogram
  Level = Struct.new(:name, :clusters)

  # levels      - Array of Level, in insertion order.
  # level_label - the label passed to the constructor.
  attr_accessor :levels, :level_label

  def initialize(name)
    self.levels = []
    self.level_label = name
  end

  # Appends a level. Each cluster is +dup+ed so that later changes to the
  # caller's cluster objects do not alter the recorded snapshot (the copy is
  # only as deep as the cluster's own #dup). Returns the levels array.
  def add_level(name, clusters)
    levels << Level.new(name, clusters.map(&:dup))
  end

  # Returns the level at position +i+ (nil when out of range).
  def [](i)
    levels[i]
  end

  # True when the most recent level contains at least one cluster with more
  # than one element. A dendrogram without any level has no such cluster, so
  # the answer is false instead of a NoMethodError on nil.
  def non_singelton_leaves?
    last_level = levels.last
    return false if last_level.nil?

    last_level.clusters.any? { |cluster| cluster.size > 1 }
  end

  # Correctly spelled alias; the original (misspelled) name is kept for
  # existing callers.
  alias_method :non_singleton_leaves?, :non_singelton_leaves?
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'colored'
|
2
|
+
|
3
|
+
module SameSame
|
4
|
+
|
5
|
+
# Writes dendrogram levels and clusters to standard output.
#
# Relies on String#cyan, String#bold and String#red, which are expected to
# come from the `colored` gem required by this file.
class DendrogramPrinter
  # Prints the clusters of the last level of +dnd+. Does nothing when the
  # dendrogram has no levels.
  def print_last(dnd)
    level = dnd.levels.last
    return if level.nil?

    print_clusters(level.clusters)
  end

  # Prints every cluster that has more than one element, preceded by a
  # "## name" header when the cluster has a name.
  def print_clusters(clusters)
    clusters.each do |cluster|
      next unless cluster.size > 1

      puts "## #{cluster.name}" if cluster.name
      print_points(cluster.datapoints)
    end
  end

  # Prints every level of +dnd+: a header with the level name and counts,
  # then the points of each cluster with more than one element. After the
  # last level the datapoints of its single-element clusters are listed
  # under "FINAL UNGROUPED".
  def print(dnd)
    last_index = dnd.levels.size - 1

    dnd.levels.each_with_index do |level, i|
      single_point_clusters = level.clusters.select { |cluster| cluster.size == 1 }
      ungrouped = single_point_clusters.flat_map(&:datapoints)

      puts
      puts "-" * 80
      puts "#{dnd.level_label}: #{level.name}"
      puts
      puts "Clusters: #{level.clusters.size - single_point_clusters.size}"
      puts "Ungrouped: #{ungrouped.size}"
      puts "-" * 80
      puts

      level.clusters.each do |cluster|
        print_points(cluster.datapoints) if cluster.size > 1
      end
      puts

      next unless i == last_index

      puts "FINAL UNGROUPED"
      print_points(ungrouped)
    end
  end

  # Splits +content+ on whitespace and re-joins it with single spaces.
  # Words whose downcased form is NOT in +common_words+ are rendered bold
  # red; words that are in it are left untouched.
  def highlight_common(content, common_words)
    words = content.strip.split(/\s+/)
    words.map { |word| common_words.include?(word.downcase) ? word : word.bold.red }.join(" ")
  end

  # Formats a datapoint id. A leading "<digits>:" prefix is rendered in cyan
  # and the remainder is passed through #highlight_common; ids without such
  # a prefix are highlighted as a whole.
  #
  # The pattern is anchored to the whole string (\A...\z with /m) so that a
  # multi-line id cannot match on an inner line and lose the rest of its
  # text.
  def formatted_datapoint_name(content, common_words)
    match = /\A(\d+:)(.*)\z/m.match(content)
    if match
      "#{match[1].cyan} #{highlight_common(match[2], common_words)}"
    else
      highlight_common(content, common_words)
    end
  end

  # Prints one line per datapoint, sorted by id with the numeric prefix
  # ignored, surrounded by blank lines. Words shared by all datapoints are
  # left plain; the others are highlighted.
  def print_points(datapoints)
    puts
    # Tokenize the same text that #formatted_datapoint_name highlights (the
    # id without its "<digits>:" prefix); otherwise an id like "1:green"
    # yields the term "1:green", which can never match the word "green".
    all_terms = datapoints.map { |dp| text_without_prefix(dp.id).downcase.strip.split(/\s+/) }
    common_words = all_terms.inject(all_terms.flatten.uniq) { |common, terms| common & terms }

    datapoints.sort_by { |dp| text_without_prefix(dp.id) }.each do |dp|
      puts formatted_datapoint_name(dp.id, common_words)
    end
    puts
  end

  private

  # Returns +id+ without a leading "<digits>:" prefix.
  def text_without_prefix(id)
    id.sub(/\A\d+:/, '')
  end
end
|
73
|
+
|
74
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module SameSame
|
2
|
+
# Link counts between datapoints: two datapoints are neighbours when their
# similarity is at least +th+, and the number of links between x and y is
# the number of indices i for which both (x, i) and (i, y) are neighbours.
class LinkMatrix
  # links        - SymmetricalMatrix of link counts, addressed by index.
  # index_lookup - Hash of datapoint => index into +links+.
  attr_reader :links, :index_lookup

  # attrs (all required, KeyError when missing):
  # * :similarity_matrix - object responding to lookup(x, y)
  # * :datapoints        - the datapoints, in matrix order
  # * :th                - similarity threshold for being neighbours
  def initialize(attrs = {})
    similarity_matrix = attrs.fetch(:similarity_matrix)
    datapoints = attrs.fetch(:datapoints)
    th = attrs.fetch(:th)

    neighbours = calculate_neighbours(datapoints, similarity_matrix, th)

    @links = calculate_links(neighbours, datapoints)
    @index_lookup = calculate_index_lookup(datapoints)
  end

  # Sum of the link counts over every pair (p1, p2) with p1 taken from
  # +cluster1+ and p2 from +cluster2+.
  def count_links_between_clusters(cluster1, cluster2)
    cluster1.inject(0) do |total, p1|
      links_from_p1 = cluster2.inject(0) do |subtotal, p2|
        subtotal + number_of_links_between_points(p1, p2)
      end
      total + links_from_p1
    end
  end

  # Link count stored for the two datapoints.
  def number_of_links_between_points(datapoint1, datapoint2)
    first = index_lookup[datapoint1]
    second = index_lookup[datapoint2]
    links.lookup(first, second)
  end

  private

  # Maps each datapoint to its position in +datapoints+.
  def calculate_index_lookup(datapoints)
    datapoints.each_with_index.each_with_object({}) do |(point, position), index|
      index[point] = position
    end
  end

  # Matrix whose (x, y) cell is the number of links between x and y.
  def calculate_links(neighbours, datapoints)
    link_count = lambda { |x, y| number_of_links(neighbours, datapoints, x, y) }
    SymmetricalMatrix.new(neighbours.size, link_count)
  end

  # Matrix whose (x, y) cell is 1 when similarity(x, y) >= th, else 0.
  def calculate_neighbours(datapoints, similarity_matrix, th)
    neighbour_flag = lambda { |x, y| similarity_matrix.lookup(x, y) >= th ? 1 : 0 }
    SymmetricalMatrix.new(datapoints.size, neighbour_flag)
  end

  # Counts the indices i for which both (x, i) and (i, y) are flagged as
  # neighbours, by summing the products of the 0/1 flags.
  def number_of_links(neighbors, datapoints, x, y)
    datapoints.size.times.inject(0) do |count, i|
      count + neighbors.lookup(x, i) * neighbors.lookup(i, y)
    end
  end
end
|
62
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module SameSame
|
2
|
+
|
3
|
+
# Goodness measure used to decide which two clusters to merge: the number
# of links between them, normalised by a term that depends on both cluster
# sizes and on the neighbour threshold.
class MergeGoodnessMeasure
  # th - the threshold passed to the constructor.
  # p  - exponent derived from it: 1 + 2 * f(th).
  attr_reader :th, :p

  # th should be >= 0 and <= 1
  # 0 means all datapoints are neighbours
  # 1 means no datapoints are neighbours
  # (proximity)
  def initialize(th)
    @th = th
    @p = 1.0 + 2.0 * f(th)
  end

  # Returns number_of_links / ((size_x + size_y)^p - size_x^p - size_y^p).
  #
  # The measure is symmetric in the two cluster sizes: both sizes have to be
  # subtracted (subtracting size_x twice skews the result whenever the
  # clusters differ in size).
  #
  # With th == 1 the exponent is 1.0 and the denominator is 0.0, so the
  # Float division yields Infinity (or NaN for zero links) rather than
  # raising.
  def g(number_of_links, size_x, size_y)
    combined = (size_x + size_y)**p
    only_x = size_x**p
    only_y = size_y**p

    number_of_links / (combined - only_x - only_y)
  end

  private

  # f(th) = (1 - th) / (1 + th)
  def f(th)
    (1.0 - th) / (1.0 + th)
  end
end
|
29
|
+
|
30
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'same_same/dendrogram'
|
2
|
+
require 'same_same/link_matrix'
|
3
|
+
require 'same_same/merge_goodness_measure'
|
4
|
+
require 'same_same/rock_clusters'
|
5
|
+
require 'same_same/similarity_matrix'
|
6
|
+
|
7
|
+
module SameSame
|
8
|
+
# Drives the agglomerative merge loop: starts from one cluster per datapoint
# and keeps merging until at most +k+ clusters remain or nothing can be
# merged any more, recording every step in a Dendrogram.
class RockAlgorithm
  attr_accessor :datapoints, :similarity_measure, :k, :th, :link_matrix, :similarity_matrix

  # attrs:
  # * :datapoints         - datapoints to cluster; each must respond to #data (required)
  # * :k                  - stop once no more than this many clusters remain (required)
  # * :th                 - threshold handed to LinkMatrix and MergeGoodnessMeasure (required)
  # * :similarity_measure - defaults to JaquardCoefficient.new
  #
  # The similarity matrix and the link matrix are built here, up front.
  def initialize(attrs = {})
    self.datapoints = attrs.fetch(:datapoints)
    self.similarity_measure = attrs[:similarity_measure] || JaquardCoefficient.new
    self.k = attrs.fetch(:k)
    self.th = attrs.fetch(:th)

    self.similarity_matrix = SimilarityMatrix.new(similarity_measure, datapoints.map(&:data))
    self.link_matrix = LinkMatrix.new(
      datapoints: datapoints,
      similarity_matrix: similarity_matrix,
      th: th
    )
  end

  # Runs the clustering and returns the Dendrogram. The first level is named
  # after Float::INFINITY and holds the single-point clusters; each
  # successful merge adds a level named after the value returned by
  # RockClusters#merge_best_candidates.
  def cluster
    dendrogram = Dendrogram.new("Goodness")

    singletons = one_point_per_cluster
    dendrogram.add_level(Float::INFINITY.to_s, singletons)

    rock_clusters = RockClusters.new(
      link_matrix: link_matrix,
      clusters: singletons,
      goodness_measure: MergeGoodnessMeasure.new(th)
    )

    remaining = rock_clusters.size
    while remaining > k
      before_merge = remaining
      merge_goodness = rock_clusters.merge_best_candidates
      remaining = rock_clusters.size

      # finish if there are no linked clusters to merge
      break if remaining == before_merge

      dendrogram.add_level(merge_goodness.to_s, rock_clusters.clusters)
    end

    dendrogram
  end

  # One Cluster per datapoint, each wrapping just that datapoint.
  def one_point_per_cluster
    datapoints.map { |datapoint| Cluster.new([datapoint]) }
  end
end
|
51
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'same_same/cluster_similarity'
|
2
|
+
|
3
|
+
module SameSame
|
4
|
+
# The current set of clusters during the merge loop, keyed by an
# ever-increasing integer, together with each cluster's best merge partner.
class RockClusters
  # cluster_map      - Hash of key => cluster.
  # closest_clusters - Hash of key => ClusterSimilarity describing the best
  #                    linked partner of that cluster (clusters without any
  #                    linked partner have no entry).
  attr_accessor :link_matrix, :goodness_measure, :cluster_map, :closest_clusters

  # Kept only so that the public interface is unchanged; #clusters is
  # derived from cluster_map and never reads the assigned value.
  attr_writer :clusters

  # attrs (all required, KeyError when missing):
  # * :link_matrix      - responds to count_links_between_clusters(c1, c2)
  # * :goodness_measure - responds to g(number_of_links, size_x, size_y)
  # * :clusters         - initial clusters
  def initialize(attrs = {})
    self.link_matrix = attrs.fetch(:link_matrix)
    self.goodness_measure = attrs.fetch(:goodness_measure)
    self.cluster_map = {}
    @last_key = -1

    attrs.fetch(:clusters).each { |c| add_cluster(c) }
    calculate_closest_clusters
  end

  # Merges the best pair of clusters and returns the goodness of that merge.
  # Returns nil (and changes nothing) when no two clusters are linked.
  def merge_best_candidates
    key, similarity = find_most_similar_pair
    return unless key

    merge_clusters(key, similarity.cluster_key)
    similarity.goodness
  end

  # Returns [key, ClusterSimilarity] for the pair with the HIGHEST goodness,
  # or [] when there is no candidate. A higher goodness means a better
  # merge, so the maximum is taken; sorting ascending and taking the first
  # entry would pick the weakest candidate instead.
  def find_most_similar_pair
    closest_clusters.max_by { |_, similarity| similarity.goodness } || []
  end

  # Number of clusters currently held.
  def size
    cluster_map.size
  end

  # The clusters currently held, in insertion order of their keys.
  def clusters
    cluster_map.values
  end

  # Stores +c+ under a fresh key and returns +c+.
  def add_cluster(c)
    cluster_map[next_key] = c
  end

  # Increments and returns the key counter.
  def next_key
    @last_key = @last_key + 1
  end

  # Rebuilds +closest_clusters+ from scratch: for every cluster, the linked
  # partner with the highest goodness (compared by goodness explicitly, so
  # the result does not depend on how ClusterSimilarity orders itself).
  def calculate_closest_clusters
    self.closest_clusters = {}
    cluster_map.each do |cluster_key, cluster|
      best = linked_candidates(cluster_key, cluster).max_by(&:goodness)
      closest_clusters[cluster_key] = best if best
    end
  end

  # Replaces the clusters stored under +key1+ and +key2+ by their
  # combination (cluster + cluster), stored under a new key, and refreshes
  # +closest_clusters+. Returns the merged cluster.
  def merge_clusters(key1, key2)
    merged_cluster = add_cluster(cluster_map.delete(key1) + cluster_map.delete(key2))
    calculate_closest_clusters
    merged_cluster
  end

  private

  # ClusterSimilarity entries for every other cluster that shares at least
  # one link with +cluster+.
  def linked_candidates(cluster_key, cluster)
    cluster_map.each_with_object([]) do |(other_key, other_cluster), found|
      next if other_key == cluster_key

      number_of_links = link_matrix.count_links_between_clusters(cluster, other_cluster)
      next unless number_of_links > 0

      goodness = goodness_measure.g(number_of_links, cluster.size, other_cluster.size)
      found << ClusterSimilarity.new(other_key, goodness)
    end
  end
end
|
68
|
+
end
|