similarity_tree 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +96 -0
- data/Rakefile +1 -0
- data/lib/similarity_tree.rb +4 -0
- data/lib/similarity_tree/node.rb +53 -0
- data/lib/similarity_tree/similarity_matrix.rb +147 -0
- data/lib/similarity_tree/similarity_tree.rb +81 -0
- data/lib/similarity_tree/version.rb +3 -0
- data/similarity_tree.gemspec +26 -0
- data/test/cc_licences/CC-BY-3.0.html +401 -0
- data/test/cc_licences/CC-BY-NC-3.0.html +423 -0
- data/test/cc_licences/CC-BY-NC-ND-3.0.html +385 -0
- data/test/cc_licences/CC-BY-NC-SA-3.0.html +459 -0
- data/test/cc_licences/CC-BY-ND-3.0.html +365 -0
- metadata +137 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MzViYTMyMjQzZGU0ZTc2ZDc2OGI2NWI3NDYyNWYwZGM2MDFjMzc5MQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MDJmOWI2N2QxOTA2YTcwZjA4NThjZDhjZjRjMjAxOTMzY2Y5MTcyMw==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
Mzk0ZTllMGRiOGJmOTI3OTAwNWQyN2U2ZjhiNWNlNGExMDczODc2YjUwYzJh
|
10
|
+
MTI3N2ZjNDM1ZTEyY2MzODEzNmUwYjgyODZkOTRlOTg0ODU5NDZlMGM5YmFi
|
11
|
+
MTU4NmNiYjE0NDM0ZmEwNGE4ZTgzZGJhNGRiY2U1ZWQ3OTJlYTA=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MzE0YjkzOGI2NDQ5ODE4YmQwMmViMjQ2NjllZmZhOWQzNTU1MDVjNWVkMGY1
|
14
|
+
NTYzZGY3NjIwY2I2NDVhMjUyODllOTZkNWM5OTYwYWViMWIwMmU1OWUyM2Rh
|
15
|
+
YzMzNzE3ZmVhZGZiNDgxM2ExZDRkNDhjZDFjYTE4NmZkMDdjYjA=
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2013 Kent Mewhort
|
2
|
+
Copyright (c) 2012 Open North Inc.
|
3
|
+
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
a copy of this software and associated documentation files (the
|
6
|
+
"Software"), to deal in the Software without restriction, including
|
7
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
# SimilarityTree
|
2
|
+
|
3
|
+
This library allows you to generate a tree representing branches/revisions to a set of text or HTML files, without any
|
4
|
+
prior knowledge of the timelines or change history necessary. You simply need to know the original source document and
|
5
|
+
this library builds a tree based on the extent of differences between each document.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'similarity_tree'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install similarity_tree
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
Build a "similarity matrix" of the diff scores between the different documents, then generate the tree from this matrix.
|
24
|
+
First, build the "similarity matrix" of the diff scores between the different documents. You must input a set of HTML or
|
25
|
+
text documents. Then, to build the tree itself, you need to specify the document id or filename of the original/root
|
26
|
+
document. Eg. for the set of different Creative Commons licences in the test dir:
|
27
|
+
|
28
|
+
documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html')
|
29
|
+
tree = SimilarityTree::SimilarityMatrix.new(documents).build_tree("CC-BY-3.0.html")
|
30
|
+
puts tree.to_s # to_h and to_json are also available as other tree output formats
|
31
|
+
|
32
|
+
Result:
|
33
|
+
|
34
|
+
CC-BY-3.0.html
|
35
|
+
-CC-BY-NC-3.0.html (0.9197574893009985)
|
36
|
+
--CC-BY-NC-SA-3.0.html (0.9503146737330241)
|
37
|
+
--CC-BY-NC-ND-3.0.html (0.9456402772710689)
|
38
|
+
-CC-BY-ND-3.0.html (0.9434472109631346)
|
39
|
+
|
40
|
+
You can operate directly on **strings** rather than files (in this case, the node ids in the tree will be the array indices of the strings):
|
41
|
+
|
42
|
+
documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html').map { |f| File.read(f) }
|
43
|
+
tree = SimilarityTree::SimilarityMatrix.new(documents).build_tree(0) # with strings, the root id is the document's array index
|
44
|
+
puts tree.to_s # to_h and to_json are also available as other tree output formats
|
45
|
+
|
46
|
+
CC-BY-3.0.html
|
47
|
+
-CC-BY-NC-3.0.html (0.9197574893009985)
|
48
|
+
--CC-BY-NC-SA-3.0.html (0.9503146737330241)
|
49
|
+
--CC-BY-NC-ND-3.0.html (0.9456402772710689)
|
50
|
+
-CC-BY-ND-3.0.html (0.9434472109631346)
|
51
|
+
|
52
|
+
Result:
|
53
|
+
0
|
54
|
+
-1 (0.9197574893009985)
|
55
|
+
--3 (0.9503146737330241)
|
56
|
+
--4 (0.9456402772710689)
|
57
|
+
-2 (0.9434472109631346)
|
58
|
+
|
59
|
+
Or, you can use any **enumerable list of objects** (eg. ActiveRecords) as the inputs. Consider:
|
60
|
+
|
61
|
+
class Document < ActiveRecord::Base
|
62
|
+
attr_accessible :title, :text_filename
|
63
|
+
...
|
64
|
+
end
|
65
|
+
|
66
|
+
You can call:
|
67
|
+
|
68
|
+
tree = SimilarityTree::SimilarityMatrix.new(Document.all,
|
69
|
+
id_func: :title, content_func: :text_filename).build_tree(Document.first.title)
|
70
|
+
|
71
|
+
## Additional Options
|
72
|
+
|
73
|
+
### Calculation method
|
74
|
+
|
75
|
+
You can use either the **term frequency–inverse document frequency** (:tf_idf, the default) or **Dice's coefficient** from a
|
76
|
+
standard unix-style diff to calculate the diff scores. Tf-idf works much better where a document has a lot of translations
|
77
|
+
(that is, "cut and pastes" of sections of text into different locations) and is also often faster. However, if your intent
|
78
|
+
is to show diffs of the text, the :diff option will correlate better to your diff rendering.
|
79
|
+
|
80
|
+
documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html')
|
81
|
+
tf_idf_tree = SimilarityTree::SimilarityMatrix.new(documents,
|
82
|
+
calculation_method: :tf_idf).build_tree("CC-BY-3.0.html")
|
83
|
+
diff_tree = SimilarityTree::SimilarityMatrix.new(documents,
|
84
|
+
calculation_method: :diff).build_tree("CC-BY-3.0.html")
|
85
|
+
|
86
|
+
### Progress output
|
87
|
+
|
88
|
+
Performing all the diffs to build a similarity matrix can take a while for large document sets. If you're using this
|
89
|
+
gem from a script or a console, you can add a progress bar:
|
90
|
+
|
91
|
+
tree = SimilarityTree::SimilarityMatrix.new(documents, show_progress: true).build_tree(id)
|
92
|
+
|
93
|
+
## Licence and Credits
|
94
|
+
|
95
|
+
(c) 2012-2013, Kent Mewhort (similarity tree) and Open North (original similarity_matrix implementation, see https://github.com/jpmckinney/clip-analysis),
|
96
|
+
licensed under MIT. See LICENSE.txt for details.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'json'
|
2
|
+
module SimilarityTree
  # One document in the similarity hierarchy. A node carries its id, the
  # similarity score against its parent, links to its parent and children, and
  # an optional reference to the source object it stands for.
  class Node
    attr_accessor :id, :diff_score, :parent, :children, :content

    # id         - identifier of the document this node represents
    # diff_score - similarity score against the parent (may be nil)
    # parent     - parent Node, or nil
    # children   - array of child Nodes
    # content    - optional source object attached to the node
    def initialize(id, diff_score, parent = nil, children = [], content = nil)
      @id = id
      @diff_score = diff_score
      @parent = parent
      @children = children
      @content = content
    end

    # Yields self and then all descendants, depth first (a node is always
    # yielded before its children). Without a block an Enumerator is returned,
    # so callers can chain e.g. `each_node.map(&:id)`.
    def each_node
      return enum_for(:each_node) unless block_given?

      depth_first_recurse { |n, _depth| yield n }
    end

    # Renders the subtree as text, one node per line. Each line is prefixed
    # with one "-" per level of depth and suffixed with the diff score in
    # parentheses when the node has one.
    def to_s
      lines = []
      depth_first_recurse do |n, depth|
        line = ("-" * depth) + n.id.to_s
        line += " (#{n.diff_score})" unless n.diff_score.nil?
        lines << line + "\n"
      end
      lines.join
    end

    # Returns the subtree as a nested Hash with the keys :id, :children (only
    # when there are any) and :diff_score (only when set).
    def to_h
      result = { id: id }
      result[:children] = children.map { |c| c.to_h } unless children.nil? || children.empty?
      result[:diff_score] = diff_score unless diff_score.nil?

      # Merge in the content's own attributes; the node's keys win on conflict.
      # NOTE(review): the is_a?(Hash) guard means only Hash contents that also
      # respond to as_json are merged -- confirm whether other objects with an
      # as_json method were meant to be included as well.
      if content.respond_to?(:as_json) && content.is_a?(Hash)
        result = content.as_json.merge(result)
      end
      result
    end

    # Serializes #to_h with JSON.generate; opts are passed straight through.
    def to_json(opts = {})
      JSON.generate to_h, opts
    end

    private

    # Helper for recursion into descendants: yields (node, depth) for the
    # given node (self by default) and then recurses into each child.
    def depth_first_recurse(node = nil, depth = 0, &block)
      node = self if node.nil?
      yield node, depth
      # to_h tolerates nil children, so do the same here
      (node.children || []).each do |child|
        depth_first_recurse(child, depth + 1, &block)
      end
    end
  end
end
|
@@ -0,0 +1,147 @@
|
|
1
|
+
require 'matrix'
|
2
|
+
require 'tf-idf-similarity'
|
3
|
+
require 'fast_html_diff'
|
4
|
+
|
5
|
+
module SimilarityTree
  # Table of the diff/similarity scores between different text documents.
  # The table is a Hash of Hashes: matrix[id_a][id_b] is the similarity score
  # between the documents identified by id_a and id_b.
  class SimilarityMatrix

    # Initialize a matrix for a set of documents.
    #
    # sources - list of documents: filenames, strings, or arbitrary objects
    # options - Hash overriding the defaults:
    #           :id_func            - method name called on a source to get its id
    #           :content_func       - method name called on a source to get its
    #                                 text (or the name of a file holding it)
    #           :calculation_method - :tf_idf (default) or :diff
    #           :show_progress      - create and advance a progress bar
    def initialize(sources, options = {})
      @sources = sources
      @config = default_options.merge(options)

      @id = -1
      @source_index = Hash.new
      @matrix = nil
    end

    # Calculate the scores with the configured method, store them and return
    # them as a Hash of Hashes. Raises a RuntimeError for an unknown
    # :calculation_method.
    def calculate
      @matrix =
        case @config[:calculation_method]
        when :tf_idf then calculate_with_tf_idf
        when :diff then calculate_with_diff
        else raise "Unknown calculation type"
        end
    end

    # Build the similarity tree rooted at root_id (calculating the matrix
    # first if needed) and return its root node. score_threshold is handed on
    # to the tree builder.
    def build_tree(root_id, score_threshold = 0)
      calculate if @matrix.nil?
      tree = SimilarityTree.new(root_id, @matrix, score_threshold).build

      # populate the nodes with the sources they were computed from
      tree.each_node { |n| n.content = @source_index[n.id] }
      tree
    end

    private

    def default_options
      {
        id_func: nil,
        content_func: nil,
        calculation_method: :tf_idf,
        show_progress: false
      }
    end

    # Returns a progress bar sized by the block's value, or nil when progress
    # output is disabled. The block is only evaluated when a bar is created.
    # NOTE(review): ProgressBar is not required anywhere in the visible file;
    # presumably it is loaded elsewhere -- confirm.
    def build_progress_bar
      return nil unless @config[:show_progress]

      ProgressBar.create format: '%a |%B| %p%% %e', length: 80, smoothing: 0.5, total: yield
    end

    # Create the similarity matrix from a tf-idf model of all the documents.
    def calculate_with_tf_idf
      progress_bar = build_progress_bar { @sources.length }

      # iterate through the input texts and build the tf_idf corpus
      corpus = []
      ids = @sources.map do |source|
        corpus << TfIdfSimilarity::Document.new(text_of(source))
        progress_bar.increment unless progress_bar.nil?
        id_of(source)
      end
      model = TfIdfSimilarity::TfIdfModel.new(corpus, function: :tf_idf)
      similarity_matrix = model.similarity_matrix

      # compile the results into an ordinary hash of hashes keyed by document id
      matrix = {}
      ids.each_with_index do |a, i|
        matrix[a] = {}
        ids.each_with_index do |b, j|
          matrix[a][b] = similarity_matrix[i, j].round(6)
        end
      end
      matrix
    end

    # Create a similarity matrix, using diff as the similarity measure, based on the difference of WORDS (not characters)
    # (only counts insertions and deletions, not substitution and transposition).
    def calculate_with_diff
      docs = @sources.to_a
      progress_bar = build_progress_bar { docs.length * (docs.length - 1) / 2 }

      matrix = {}
      docs.each_with_index do |a, i|
        a_id = id_of(a)
        a_text = text_of(a)

        # every document gets a row, even when there is nothing to compare it to
        matrix[a_id] ||= { a_id => 1 }

        # each unordered pair is diffed once and stored in both directions
        docs[i + 1..-1].each do |b|
          b_id = id_of(b)
          b_text = text_of(b)

          stats = FastHtmlDiff::DiffBuilder.new(a_text, b_text).statistics
          similarity = dice_coefficient(stats[:matches][:words],
                                        stats[:insertions][:words],
                                        stats[:deletions][:words])

          matrix[a_id][b_id] = similarity
          matrix[b_id] ||= { b_id => 1 }
          matrix[b_id][a_id] = similarity

          progress_bar.increment unless progress_bar.nil?
        end
      end
      matrix
    end

    # Dice's coefficient from word counts, see
    # http://en.wikipedia.org/wiki/Dice%27s_coefficient
    # Two documents without any words are treated as identical (1.0) rather
    # than producing NaN from 0 / 0.0.
    def dice_coefficient(matches, insertions, deletions)
      total_count = 2 * matches + insertions + deletions
      return 1.0 if total_count.zero?

      2 * matches / total_count.to_f
    end

    # Id of a source: the result of :id_func when configured, else the
    # basename for sources that are existing filenames, else the source's
    # position in the sources list.
    def id_of(source)
      id =
        if !@config[:id_func].nil?
          source.send @config[:id_func].to_s
        elsif is_a_filename?(source)
          File.basename(source)
        else
          @sources.find_index(source)
        end

      # maintain an index of id => source (first source seen for an id wins)
      @source_index[id] = source if @source_index[id].nil?
      id
    end

    # Text of a source: the result of :content_func when configured, else the
    # source itself; when that value names an existing file, the file's
    # contents are returned instead.
    def text_of(source)
      txt = @config[:content_func].nil? ? source : source.send(@config[:content_func].to_s)
      txt = File.read(txt) if is_a_filename?(txt)
      txt
    end

    # quick and dirty check on whether a string is a filename based on the string length and whether the file exists;
    # anything that is not a String is never a filename
    def is_a_filename?(filename)
      filename.is_a?(String) && (filename.length < 512) && File.exist?(filename)
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module SimilarityTree

  # Constructs a hierarchy of nodes based on a specified root and the similarity "scores" between nodes. Each node is placed next
  # to the node to which it is most similar; as between two nodes, the node most similar to the root is placed closest to the root.
  class SimilarityTree
    # initialize the tree builder from an existing similarity matrix
    #
    # root_id           - id (a key of similarity_matrix) of the root document
    # similarity_matrix - Hash of Hashes: similarity_matrix[a][b] is the score between a and b
    # score_threshold   - nodes scoring below this against their parent are pruned
    #
    # Raises ArgumentError when root_id is not a key of the matrix.
    def initialize(root_id, similarity_matrix, score_threshold = 0)
      @nodes = similarity_matrix.map { |key, _row| Node.new(key, 0) }
      @root = @nodes.find { |n| n.id == root_id }
      raise ArgumentError, "root id #{root_id.inspect} not found in similarity matrix" if @root.nil?

      @root.diff_score = nil
      @similarity_matrix = similarity_matrix
      @score_threshold = score_threshold
      @built = false
    end

    # build the tree and return the root node; repeated calls return the same
    # tree without inserting the nodes again
    def build
      unless @built
        build_tree
        @built = true
      end
      @root
    end

    private

    def build_tree
      flat = [@root]

      # for each non-root node
      @nodes.reject { |n| n == @root }.each do |n|
        # find the best match to the nodes already in the tree
        closest_diff_score = 0
        closest = nil
        flat.each do |m|
          diff_score = @similarity_matrix[n.id][m.id]
          if closest.nil? || (diff_score > closest_diff_score)
            closest_diff_score = diff_score
            closest = m
          end
        end

        # if the closest match is the root node, or if the closest match's diff score with its parent is stronger
        # than between the present node and that parent, add as a child of the match
        if (closest == @root) || (closest.diff_score >= @similarity_matrix[n.id][closest.parent.id])
          n.parent = closest
          closest.children << n
          n.diff_score = @similarity_matrix[n.id][closest.id]
        # else, if the new node is more similar to the parent, rotate so that the existing node becomes the child
        else
          # place children with the closest matching of the two
          # (iterate over a copy, since children are removed while looping)
          closest.children.dup.each do |child|
            if @similarity_matrix[child.id][n.id] > child.diff_score
              child.parent = n
              closest.children.delete_if { |child_i| child_i == child }
              n.children << child
              child.diff_score = @similarity_matrix[child.id][n.id]
            end
          end

          # connect the new node to the parent
          n.parent = closest.parent
          n.parent.children << n
          n.diff_score = @similarity_matrix[n.id][n.parent.id]

          # add the existing node as a child
          closest.parent = n
          n.parent.children.delete_if { |child_i| child_i == closest }
          n.children << closest
          closest.diff_score = @similarity_matrix[closest.id][n.id]
        end

        flat << n
      end
      prune(flat)
    end

    # prune away nodes that don't meet the configured score threshold
    # (a pruned node is detached from its parent's children, taking its own subtree with it)
    def prune(nodes)
      nodes.each do |node|
        node.parent.children.reject! { |n| n == node } if (node != @root) && (node.diff_score < @score_threshold)
      end
    end
  end
end
|