same_same 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +44 -0
- data/LICENSE.txt +22 -0
- data/README.md +39 -0
- data/Rakefile +1 -0
- data/examples/dbscan_digg.rb +25 -0
- data/examples/dbscan_lines.rb +35 -0
- data/examples/rock_digg.rb +20 -0
- data/examples/rock_lines.rb +31 -0
- data/lib/same_same.rb +15 -0
- data/lib/same_same/cluster.rb +27 -0
- data/lib/same_same/cluster_similarity.rb +10 -0
- data/lib/same_same/cosine_distance.rb +27 -0
- data/lib/same_same/cosine_similarity.rb +22 -0
- data/lib/same_same/data_point.rb +12 -0
- data/lib/same_same/dbscan_algorithm.rb +135 -0
- data/lib/same_same/dbscan_clusters.rb +88 -0
- data/lib/same_same/dbscan_neighborhood.rb +68 -0
- data/lib/same_same/dbscan_numeric_vectors.rb +7 -0
- data/lib/same_same/dbscan_term_frequency_vectors.rb +7 -0
- data/lib/same_same/dendrogram.rb +28 -0
- data/lib/same_same/dendrogram_printer.rb +74 -0
- data/lib/same_same/jaquard_coefficient.rb +9 -0
- data/lib/same_same/link_matrix.rb +62 -0
- data/lib/same_same/merge_goodness_measure.rb +30 -0
- data/lib/same_same/rock_algorithm.rb +51 -0
- data/lib/same_same/rock_clusters.rb +68 -0
- data/lib/same_same/similarity_matrix.rb +20 -0
- data/lib/same_same/symmetrical_matrix.rb +39 -0
- data/lib/same_same/term_frequency_builder.rb +20 -0
- data/lib/same_same/version.rb +3 -0
- data/same_same.gemspec +23 -0
- data/spec/fixtures/digg_stories.csv +49 -0
- data/spec/fixtures/lines.csv +899 -0
- data/spec/same_same/dbscan_algorithm_spec.rb +72 -0
- data/spec/same_same/jaquard_coefficient_spec.rb +24 -0
- data/spec/same_same/link_matrix_spec.rb +29 -0
- data/spec/same_same/merge_goodness_measure_spec.rb +34 -0
- data/spec/same_same/rock_algorithm_spec.rb +71 -0
- data/spec/same_same/similarity_matrix_spec.rb +20 -0
- data/spec/same_same/symmetrical_matrix_spec.rb +69 -0
- metadata +144 -0
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module SameSame
  # Bookkeeping for DBSCAN: records which cluster id each point is currently
  # filed under. Two ids are reserved (noise and unclassified); the ids handed
  # out by #get_next_cluster_id start at 1.
  class DbscanClusters
    # Identifies a set of Noise points.
    NOISE_ID = -1

    # Identifies a set of Unclassified points.
    UNCLASSIFIED_ID = 0

    attr_accessor :clusters, :last_id

    # unclassified - enumerable of points; each one is filed under
    #                UNCLASSIFIED_ID.
    def initialize(unclassified)
      # A membership set is created the first time an id is looked up.
      self.clusters = Hash.new { |hash, key| hash[key] = Set.new }
      self.last_id = 0
      assign_points(unclassified, UNCLASSIFIED_ID)
    end

    # Files the point under the noise id.
    def assign_to_noise(p)
      assign_point(p, NOISE_ID)
    end

    # True when the point is currently in the unclassified set.
    def unclassified?(p)
      point_in_cluster?(p, UNCLASSIFIED_ID)
    end

    # True when the point is currently in the noise set.
    def noise?(p)
      point_in_cluster?(p, NOISE_ID)
    end

    # True when the set stored under cluster_id contains the point. Looking up
    # an id that has not been seen yet stores an empty set for it.
    def point_in_cluster?(p, cluster_id)
      clusters[cluster_id].include?(p)
    end

    # Assigns every point in the collection to the same cluster id.
    def assign_points(points, cluster_id)
      points.each { |point| assign_point(point, cluster_id) }
    end

    # Moves a point into the set for cluster_id.
    #
    # A point found in the noise or unclassified set is removed from there
    # first. A point found in neither may only be filed as unclassified;
    # any other target raises ArgumentError.
    def assign_point(p, cluster_id)
      # Noise is checked before unclassified, and the search stops at the first
      # hit, so the lookups happen in the same order on every call.
      current_id = [NOISE_ID, UNCLASSIFIED_ID].find { |id| point_in_cluster?(p, id) }

      if current_id
        remove_point_from_cluster(p, current_id)
      elsif cluster_id != UNCLASSIFIED_ID
        raise ArgumentError, "Trying to move point that has already been assigned to some other cluster. Point: #{p}, cluster_id=#{cluster_id}"
      end

      clusters[cluster_id] << p
    end

    # Builds one Cluster per non-empty membership set, in the order the ids
    # are stored in the hash.
    def to_clusters
      populated = clusters.reject { |_id, points| points.empty? }
      populated.map { |id, points| Cluster.new(points, cluster_name(id)) }
    end

    # Display name for a cluster id.
    def cluster_name(id)
      return "Noise" if NOISE_ID === id
      return "Unclassified" if UNCLASSIFIED_ID === id

      "Cluster #{id}"
    end

    # Deletes the point from the set stored under cluster_id.
    # Returns whether the point was a member before the deletion.
    def remove_point_from_cluster(p, cluster_id)
      members = clusters[cluster_id]
      return false if members.nil?

      was_member = members.include?(p)
      members.delete(p)
      was_member
    end

    # Increments the id counter and returns the new value.
    def get_next_cluster_id
      self.last_id += 1
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module SameSame
  # Precomputes the pairwise distances between a fixed list of points and
  # answers "which points lie within eps of this one?" queries.
  class DbscanNeighborhood
    # Contains distances between points.
    attr_accessor :adjacency_matrix

    # Threshold value. Determines which points will be considered as
    # neighbors. Two points are neighbors if the distance between them does
    # not exceed threshold value.
    attr_accessor :eps

    # used to cache index of points in the matrix
    attr_accessor :index_mapping

    attr_accessor :points

    # Initializes the neighborhood with all data that it needs.
    #
    # * :points - points to cluster (required)
    # * :eps - distance threshold value (required)
    # * :distance - object responding to distance(x, y) (required)
    # * :vector_calculator - object responding to vectors(a, b) and returning
    #   the pair handed to the distance measure; defaults to
    #   DbscanNumericVectors.new
    #
    # Raises KeyError when a required option is missing.
    def initialize(attrs = {})
      self.eps = attrs.fetch(:eps)
      self.points = attrs.fetch(:points)

      build_index_mapping

      vector_calculator = attrs[:vector_calculator] || DbscanNumericVectors.new
      distance = attrs.fetch(:distance)

      self.adjacency_matrix =
        calculate_adjacency_matrix(distance, points, vector_calculator)
    end

    # Returns the Set of points whose distance to p does not exceed eps.
    # The distance from a point to itself is stored as 0.0.
    #
    # Raises ArgumentError when p is not one of the points this neighborhood
    # was built from.
    def neighbors_of(p)
      i = index_mapping.fetch(p) do
        raise ArgumentError, "Point is not part of this neighborhood: #{p.inspect}"
      end

      Set.new.tap do |neighbors|
        # Walk every position in points rather than index_mapping.size: equal
        # points share one index_mapping entry, so the mapping can be shorter
        # than the list and trailing points would otherwise never be examined.
        points.each_index do |j|
          neighbors.add(points[j]) if adjacency_matrix.lookup(i, j) <= eps
        end
      end
    end

    private

    # Maps each point to its position in points. When equal points occur more
    # than once, the last position wins.
    def build_index_mapping
      self.index_mapping = {}
      points.each_with_index do |p, i|
        index_mapping[p] = i
      end
      index_mapping
    end

    # Fills a SymmetricalMatrix with the distance for every pair (i, j), i < j,
    # and 0.0 on the diagonal.
    def calculate_adjacency_matrix(distance, points, vector_calculator)
      SymmetricalMatrix.new(points.size).tap do |m|
        (0..points.size - 1).each do |i|
          m.set(i, i, 0.0)
          ((i + 1)..(points.size - 1)).each do |j|
            x, y = vector_calculator.vectors(points[i], points[j])
            d = distance.distance(x, y)
            m.set(i, j, d)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module SameSame
  # An ordered list of clustering snapshots ("levels"), each recorded under a
  # name together with a copy of the clusters at that point.
  class Dendrogram
    Level = Struct.new(:name, :clusters)

    attr_accessor :levels, :level_label

    # name - stored as level_label; what the level names represent.
    def initialize(name)
      self.levels = []
      self.level_label = name
    end

    # Appends a level. Each cluster is dup'ed so that later changes to the
    # caller's cluster objects do not alter the recorded snapshot.
    def add_level(name, clusters)
      levels << Level.new(name, clusters.map(&:dup))
    end

    # Returns the level at position i (nil when out of range).
    def [](i)
      levels[i]
    end

    # True when the most recently added level contains at least one cluster
    # with more than one member. Returns false when no level has been added.
    def non_singelton_leaves?
      last_level = levels.last
      return false unless last_level

      last_level.clusters.any? { |cluster| cluster.size > 1 }
    end

    # Correctly spelled name; the original spelling is kept for existing callers.
    alias non_singleton_leaves? non_singelton_leaves?
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'colored'
|
2
|
+
|
3
|
+
module SameSame
  # Writes a plain-text report of a dendrogram (or of a list of clusters) to
  # standard output. Words that are not shared by every datapoint of a cluster
  # are emphasised with the String colour helpers (bold/red/cyan).
  class DendrogramPrinter
    # Prints the clusters of the last level of the dendrogram.
    def print_last(dnd)
      print_clusters(dnd.levels.last.clusters)
    end

    # Prints every cluster that has more than one member, preceded by its name
    # when it has one.
    def print_clusters(clusters)
      clusters.each do |cluster|
        next unless cluster.size > 1

        puts "## #{cluster.name}" if cluster.name
        print_points(cluster.datapoints)
      end
    end

    # Prints every level: a header with cluster counts, then the members of
    # each cluster with more than one member. After the final level, the
    # members of the single-point clusters are listed as well.
    def print(dnd)
      dnd.levels.each_with_index do |level, i|
        single_point_clusters = level.clusters.select { |cluster| cluster.size == 1 }
        ungrouped = single_point_clusters.map(&:datapoints).flatten
        rule = "-" * 80

        puts
        puts rule
        puts "#{dnd.level_label}: #{level.name}"
        puts
        puts "Clusters: #{level.clusters.size - single_point_clusters.size}"
        puts "Ungrouped: #{ungrouped.size}"
        puts rule
        puts

        level.clusters.each do |cluster|
          print_points(cluster.datapoints) if cluster.size > 1
        end
        puts

        next unless i == dnd.levels.size - 1

        puts "FINAL UNGROUPED"
        print_points(ungrouped)
      end
    end

    # Rejoins the whitespace-separated words of content with single spaces.
    # Words whose downcased form is in common_words are left as they are; all
    # other words are rendered bold red.
    def highlight_common(content, common_words)
      rendered = content.strip.split(/\s+/).map do |word|
        if common_words.include?(word.downcase)
          word
        else
          word.bold.red
        end
      end
      rendered.join(" ")
    end

    # Formats one datapoint id. A leading "<digits>:" label is rendered in cyan
    # and the remainder of that line is passed through #highlight_common.
    def formatted_datapoint_name(content, common_words)
      return highlight_common(content, common_words) unless content =~ /^(\d+:)(.*)/

      label, remainder = Regexp.last_match.captures
      "#{label.cyan} #{highlight_common(remainder, common_words)}"
    end

    # Prints the datapoints between two blank lines, ordered by id with any
    # leading "<digits>:" label ignored for ordering.
    def print_points(datapoints)
      puts
      term_lists = datapoints.map { |dp| dp.id.downcase.split(/\s+/) }
      # Start from every term seen and intersect with each datapoint's terms;
      # what is left are the words that all datapoints share.
      vocabulary = term_lists.flatten.uniq
      common_words = term_lists.inject(vocabulary) { |shared, terms| shared & terms }

      ordered = datapoints.sort_by { |dp| dp.id.gsub(/^\d+:/, '') }
      ordered.each { |dp| puts formatted_datapoint_name(dp.id, common_words) }
      puts
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module SameSame
  # Stores, for every pair of datapoints, a link count derived from a
  # thresholded similarity matrix, and sums those counts between clusters.
  class LinkMatrix
    attr_reader :links, :index_lookup

    # Required options (KeyError when missing):
    # * :similarity_matrix - object responding to lookup(x, y)
    # * :datapoints - the datapoints, in the same order as the matrix indices
    # * :th - two datapoints are neighbours when their similarity is >= th
    def initialize(attrs = {})
      similarity_matrix = attrs.fetch(:similarity_matrix)
      datapoints = attrs.fetch(:datapoints)
      th = attrs.fetch(:th)

      neighbours = calculate_neighbours(datapoints, similarity_matrix, th)
      @links = calculate_links(neighbours, datapoints)
      @index_lookup = calculate_index_lookup(datapoints)
    end

    # Sum of the link counts over every pairing of a point from cluster1 with
    # a point from cluster2.
    def count_links_between_clusters(cluster1, cluster2)
      cluster1.inject(0) do |total, p1|
        total + cluster2.inject(0) do |row_total, p2|
          row_total + number_of_links_between_points(p1, p2)
        end
      end
    end

    # Link count stored for the two datapoints.
    def number_of_links_between_points(datapoint1, datapoint2)
      row = index_lookup[datapoint1]
      column = index_lookup[datapoint2]
      links.lookup(row, column)
    end

    private

    # Hash from datapoint to its position in datapoints.
    def calculate_index_lookup(datapoints)
      positions = {}
      datapoints.each_with_index { |point, position| positions[point] = position }
      positions
    end

    # Matrix whose (x, y) entry is the link count computed by #number_of_links.
    def calculate_links(neighbours, datapoints)
      link_count = ->(x, y) { number_of_links(neighbours, datapoints, x, y) }
      SymmetricalMatrix.new(neighbours.size, link_count)
    end

    # Matrix whose (x, y) entry is 1 when the similarity reaches th, else 0.
    def calculate_neighbours(datapoints, similarity_matrix, th)
      neighbour_flag = ->(x, y) { similarity_matrix.lookup(x, y) >= th ? 1 : 0 }
      SymmetricalMatrix.new(datapoints.size, neighbour_flag)
    end

    # Illustration kept from the original source (presumably an example
    # neighbour table - confirm):
    #
    #       0  1  2  3
    #     ------------
    # 0  |  Y  -  Y  -
    # 1  |  -  Y  -  -
    # 2  |  -  -  -  -
    # 3  |  -  Y  -  -
    #
    # For every index i, multiplies the neighbour flags of (x, i) and (i, y)
    # and adds the products up: the number of i that neighbour both x and y.
    def number_of_links(neighbors, datapoints, x, y)
      datapoints.size.times.inject(0) do |total, i|
        total + neighbors.lookup(x, i) * neighbors.lookup(i, y)
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module SameSame
  # Scores how good it would be to merge two clusters: the number of links
  # between them divided by a normaliser that depends on both cluster sizes.
  class MergeGoodnessMeasure
    attr_reader :th, :p

    # th should be >= 0 and <= 1
    # 0 means all datapoints are neighbours
    # 1 means no datapoints are neighbours
    # (proximity)
    #
    # The exponent p is derived once from th as 1 + 2 * f(th).
    def initialize(th)
      @th = th
      @p = 1.0 + 2.0 * f(th)
    end

    # Goodness of merging a cluster of size_x with a cluster of size_y that
    # share number_of_links links:
    #
    #   number_of_links / ((size_x + size_y)**p - size_x**p - size_y**p)
    #
    # The result is a Float because p is a Float. When p is 1.0 (th == 1) the
    # normaliser is zero, so the result is Infinity or NaN.
    def g(number_of_links, size_x, size_y)
      merged_term = (size_x + size_y)**p
      x_term = size_x**p
      # Each cluster contributes its own size; using size_x twice made the
      # measure depend on argument order and ignore the second cluster.
      y_term = size_y**p

      number_of_links / (merged_term - x_term - y_term)
    end

    private

    # (1 - th) / (1 + th)
    def f(th)
      (1.0 - th) / (1.0 + th)
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'same_same/dendrogram'
|
2
|
+
require 'same_same/link_matrix'
|
3
|
+
require 'same_same/merge_goodness_measure'
|
4
|
+
require 'same_same/rock_clusters'
|
5
|
+
require 'same_same/similarity_matrix'
|
6
|
+
|
7
|
+
module SameSame
  # Drives agglomerative clustering: starts with one cluster per datapoint and
  # repeatedly asks RockClusters to merge its best candidates, recording each
  # step as a level of a Dendrogram.
  class RockAlgorithm
    attr_accessor :datapoints, :similarity_measure, :k, :th, :link_matrix, :similarity_matrix

    # Options:
    # * :datapoints - required; each must respond to #data
    # * :k - required; merging stops once no more than k clusters remain
    # * :th - required; threshold passed to LinkMatrix and MergeGoodnessMeasure
    # * :similarity_measure - optional; defaults to JaquardCoefficient.new
    #
    # Raises KeyError when a required option is missing.
    def initialize(attrs = {})
      self.datapoints = attrs.fetch(:datapoints)
      self.similarity_measure = attrs[:similarity_measure] || JaquardCoefficient.new
      self.k = attrs.fetch(:k)
      self.th = attrs.fetch(:th)

      self.similarity_matrix = SimilarityMatrix.new(similarity_measure, datapoints.map(&:data))
      self.link_matrix = LinkMatrix.new(
        datapoints: datapoints,
        similarity_matrix: similarity_matrix,
        th: th
      )
    end

    # Runs the clustering and returns the Dendrogram.
    #
    # The first level is the one-point-per-cluster state, labelled with
    # Float::INFINITY. One further level is added per merge, labelled with the
    # value returned by RockClusters#merge_best_candidates.
    def cluster
      dendrogram = Dendrogram.new("Goodness")
      seeds = one_point_per_cluster
      dendrogram.add_level(Float::INFINITY.to_s, seeds)

      measure = MergeGoodnessMeasure.new(th)
      rock_clusters = RockClusters.new(
        link_matrix: link_matrix,
        clusters: seeds,
        goodness_measure: measure
      )

      remaining = rock_clusters.size
      loop do
        break unless remaining > k

        before_merge = remaining
        merge_goodness = rock_clusters.merge_best_candidates
        remaining = rock_clusters.size

        # finish if there are no linked clusters to merge
        break if remaining == before_merge

        dendrogram.add_level(merge_goodness.to_s, rock_clusters.clusters)
      end

      dendrogram
    end

    # One single-member Cluster per datapoint.
    def one_point_per_cluster
      datapoints.map { |point| Cluster.new([point]) }
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'same_same/cluster_similarity'
|
2
|
+
|
3
|
+
module SameSame
  # Holds the current set of clusters under integer keys, keeps track of each
  # cluster's closest linked neighbour, and merges pairs on request.
  class RockClusters
    attr_accessor :link_matrix, :clusters, :goodness_measure, :cluster_map, :closest_clusters

    # Options:
    # * :link_matrix - required; responds to count_links_between_clusters
    # * :goodness_measure - required; responds to g(links, size_x, size_y)
    # * :clusters - the initial clusters; each is stored under a fresh key
    def initialize(attrs = {})
      self.link_matrix = attrs.fetch(:link_matrix)
      self.goodness_measure = attrs.fetch(:goodness_measure)
      self.cluster_map = {}
      @last_key = -1

      attrs[:clusters].each { |initial| add_cluster(initial) }
      calculate_closest_clusters
    end

    # Merges the pair chosen by #find_most_similar_pair and returns that
    # pair's goodness. Returns nil when no cluster has a linked neighbour.
    def merge_best_candidates
      key, similarity = find_most_similar_pair
      return unless key

      merge_clusters(key, similarity.cluster_key)
      similarity.goodness
    end

    # Returns [cluster_key, similarity] for the closest_clusters entry that
    # comes first when ordered by ascending goodness, or [] when there is none.
    #
    # NOTE(review): this picks the smallest goodness value; whether that is
    # the intended "most similar" pair depends on how goodness is scaled -
    # confirm against ClusterSimilarity and the goodness measure.
    def find_most_similar_pair
      ranked = closest_clusters.sort_by { |_key, similarity| similarity.goodness }
      ranked.empty? ? [] : ranked.first
    end

    # Number of clusters currently held.
    def size
      cluster_map.length
    end

    # The clusters currently held, in key insertion order.
    def clusters
      cluster_map.values
    end

    # Stores the cluster under the next key. Returns the cluster.
    def add_cluster(c)
      cluster_map.store(next_key, c)
    end

    # Advances the key counter and returns the new key.
    def next_key
      @last_key += 1
    end

    # Rebuilds closest_clusters from scratch: for every cluster, collects a
    # ClusterSimilarity for each other cluster it shares at least one link
    # with, and keeps the one that sorts first.
    def calculate_closest_clusters
      self.closest_clusters = {}

      cluster_map.each do |cluster_key, cluster|
        candidates = []
        cluster_map.each do |other_key, other_cluster|
          next if cluster_key == other_key

          link_count = link_matrix.count_links_between_clusters(cluster, other_cluster)
          next unless link_count > 0

          score = goodness_measure.g(link_count, cluster.size, other_cluster.size)
          candidates << ClusterSimilarity.new(other_key, score)
        end

        closest = candidates.sort.first
        closest_clusters[cluster_key] = closest if closest
      end
    end

    # Removes both clusters, stores their combination (first + second) under a
    # new key and recomputes closest_clusters.
    #
    # Returns the value of #add_cluster, i.e. the merged cluster itself.
    # NOTE(review): the original local was named merged_key, which suggests
    # the key was meant to be returned - confirm before relying on it.
    def merge_clusters(key1, key2)
      first = cluster_map.delete(key1)
      second = cluster_map.delete(key2)
      merged = add_cluster(first + second)
      calculate_closest_clusters
      merged
    end
  end
end
|