similarity_tree 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +96 -0
- data/Rakefile +1 -0
- data/lib/similarity_tree.rb +4 -0
- data/lib/similarity_tree/node.rb +53 -0
- data/lib/similarity_tree/similarity_matrix.rb +147 -0
- data/lib/similarity_tree/similarity_tree.rb +81 -0
- data/lib/similarity_tree/version.rb +3 -0
- data/similarity_tree.gemspec +26 -0
- data/test/cc_licences/CC-BY-3.0.html +401 -0
- data/test/cc_licences/CC-BY-NC-3.0.html +423 -0
- data/test/cc_licences/CC-BY-NC-ND-3.0.html +385 -0
- data/test/cc_licences/CC-BY-NC-SA-3.0.html +459 -0
- data/test/cc_licences/CC-BY-ND-3.0.html +365 -0
- metadata +137 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MzViYTMyMjQzZGU0ZTc2ZDc2OGI2NWI3NDYyNWYwZGM2MDFjMzc5MQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MDJmOWI2N2QxOTA2YTcwZjA4NThjZDhjZjRjMjAxOTMzY2Y5MTcyMw==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
Mzk0ZTllMGRiOGJmOTI3OTAwNWQyN2U2ZjhiNWNlNGExMDczODc2YjUwYzJh
|
10
|
+
MTI3N2ZjNDM1ZTEyY2MzODEzNmUwYjgyODZkOTRlOTg0ODU5NDZlMGM5YmFi
|
11
|
+
MTU4NmNiYjE0NDM0ZmEwNGE4ZTgzZGJhNGRiY2U1ZWQ3OTJlYTA=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MzE0YjkzOGI2NDQ5ODE4YmQwMmViMjQ2NjllZmZhOWQzNTU1MDVjNWVkMGY1
|
14
|
+
NTYzZGY3NjIwY2I2NDVhMjUyODllOTZkNWM5OTYwYWViMWIwMmU1OWUyM2Rh
|
15
|
+
YzMzNzE3ZmVhZGZiNDgxM2ExZDRkNDhjZDFjYTE4NmZkMDdjYjA=
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2013 Kent Mewhort
|
2
|
+
Copyright (c) 2012 Open North Inc.
|
3
|
+
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
a copy of this software and associated documentation files (the
|
6
|
+
"Software"), to deal in the Software without restriction, including
|
7
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
# SimilarityTree
|
2
|
+
|
3
|
+
This library allows you to generate a tree representing branches/revisions to a set of text or HTML files, without any
|
4
|
+
prior knowledge of the timelines or change history. You simply need to know the original source document and
|
5
|
+
this library builds a tree based on the extent of differences between each document.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'similarity_tree'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install similarity_tree
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
Build a "similarity matrix" of the diff scores between the different documents, then generate the tree from this matrix.
|
24
|
+
First, build the "similarity matrix" of the diff scores between the different documents. You must input a set of HTML or
|
25
|
+
text documents. Then, to build the tree itself, you need to specify the document id or filename of the original/root
|
26
|
+
document. Eg. for the set of different Creative Commons licences in the test dir:
|
27
|
+
|
28
|
+
documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html')
|
29
|
+
tree = SimilarityTree::SimilarityMatrix.new(documents).build_tree("CC-BY-3.0.html")
|
30
|
+
puts tree.to_s # to_h and to_json are also available as other tree output formats
|
31
|
+
|
32
|
+
Result:
|
33
|
+
|
34
|
+
CC-BY-3.0.html
|
35
|
+
-CC-BY-NC-3.0.html (0.9197574893009985)
|
36
|
+
--CC-BY-NC-SA-3.0.html (0.9503146737330241)
|
37
|
+
--CC-BY-NC-ND-3.0.html (0.9456402772710689)
|
38
|
+
-CC-BY-ND-3.0.html (0.9434472109631346)
|
39
|
+
|
40
|
+
You can operate directly on **strings** rather than files (in this case, the node ids in the tree will be the array indices of the input strings):
|
41
|
+
|
42
|
+
documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html').map { |f| File.read(f) }
|
43
|
+
tree = SimilarityTree::SimilarityMatrix.new(documents).build_tree(0) # the root is identified by its array index
|
44
|
+
puts tree.to_s # to_h and to_json are also available as other tree output formats
|
45
|
+
|
46
|
+
CC-BY-3.0.html
|
47
|
+
-CC-BY-NC-3.0.html (0.9197574893009985)
|
48
|
+
--CC-BY-NC-SA-3.0.html (0.9503146737330241)
|
49
|
+
--CC-BY-NC-ND-3.0.html (0.9456402772710689)
|
50
|
+
-CC-BY-ND-3.0.html (0.9434472109631346)
|
51
|
+
|
52
|
+
Result:
|
53
|
+
0
|
54
|
+
-1 (0.9197574893009985)
|
55
|
+
--3 (0.9503146737330241)
|
56
|
+
--4 (0.9456402772710689)
|
57
|
+
-2 (0.9434472109631346)
|
58
|
+
|
59
|
+
Or, you can use any **enumerable list of objects** (eg. ActiveRecords) as the inputs. Consider:
|
60
|
+
|
61
|
+
class Document < ActiveRecord::Base
|
62
|
+
attr_accessible :title, :text_filename
|
63
|
+
...
|
64
|
+
end
|
65
|
+
|
66
|
+
You can call:
|
67
|
+
|
68
|
+
tree = SimilarityTree::SimilarityMatrix.new(Document.all,
|
69
|
+
id_func: :title, content_func: :text_filename).build_tree(Document.first.title)
|
70
|
+
|
71
|
+
## Additional Options
|
72
|
+
|
73
|
+
### Calculation method
|
74
|
+
|
75
|
+
You can use either the **term frequency–inverse document frequency** (:tf_idf, the default) or **Dice's coefficient** from a
|
76
|
+
standard unix-style diff to calculate the diff scores. Tf-idf works much better where a document has a lot of translations
|
77
|
+
(that is, "cut and pastes" of sections of text into different locations) and is also often faster. However, if your intent
|
78
|
+
is to show diffs of the text, the :diff option will correlate better to your diff rendering.
|
79
|
+
|
80
|
+
documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html')
|
81
|
+
tf_idf_tree = SimilarityTree::SimilarityMatrix.new(documents,
|
82
|
+
calculation_method: :tf_idf).build_tree("CC-BY-3.0.html")
|
83
|
+
diff_tree = SimilarityTree::SimilarityMatrix.new(documents,
|
84
|
+
calculation_method: :diff).build_tree("CC-BY-3.0.html")
|
85
|
+
|
86
|
+
### Progress output
|
87
|
+
|
88
|
+
Performing all the diffs to build a similarity matrix can take a while for large document sets. If you're using this
|
89
|
+
gem from a script or a console, you can add a progress bar:
|
90
|
+
|
91
|
+
tree = SimilarityTree::SimilarityMatrix.new(documents, show_progress: true).build_tree(id)
|
92
|
+
|
93
|
+
## Licence and Credits
|
94
|
+
|
95
|
+
(c) 2012-2013, Kent Mewhort (similarity tree) and Open North (original similarity_matrix implementation, see https://github.com/jpmckinney/clip-analysis),
|
96
|
+
licensed under MIT. See LICENSE.txt for details.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'json'
|
2
|
+
module SimilarityTree
  # One node of a similarity tree.
  #
  # A node carries an identifier, the score that links it to its parent
  # (nil for a root), references to its parent and children, and an optional
  # content payload.
  class Node
    attr_accessor :id, :diff_score, :parent, :children, :content

    # id         - identifier of this node
    # diff_score - score linking this node to its parent (nil for a root)
    # parent     - parent Node, or nil
    # children   - Array of child Nodes
    # content    - optional payload attached to the node
    def initialize(id, diff_score, parent = nil, children = [], content = nil)
      @id = id
      @diff_score = diff_score
      @parent = parent
      @children = children
      @content = content
    end

    # Yields self and then all descendants, depth first.
    # Returns an Enumerator when no block is given.
    def each_node
      return enum_for(:each_node) unless block_given?

      depth_first_recurse { |node, _depth| yield node }
    end

    # Renders the tree as text: one line per node, prefixed with one "-" per
    # level of depth and followed by the diff score in parentheses when set.
    def to_s
      lines = []
      depth_first_recurse do |node, depth|
        line = ('-' * depth) + node.id.to_s
        line += ' (' + node.diff_score.to_s + ')' unless node.diff_score.nil?
        lines << line + "\n"
      end
      lines.join
    end

    # Converts this node and its descendants into nested hashes.
    # :children and :diff_score are omitted when empty/nil.
    def to_h
      result = { id: id }
      result[:children] = children.map(&:to_h) unless children.nil? || children.empty?
      result[:diff_score] = diff_score unless diff_score.nil?

      # If the content exposes as_json and that yields a Hash, merge those
      # attributes in. The node's own keys win on conflict. Non-Hash results
      # (e.g. a plain string) are ignored because they cannot be merged.
      if content.respond_to?(:as_json)
        attributes = content.as_json
        result = attributes.merge(result) if attributes.is_a?(Hash)
      end
      result
    end

    # Serializes the result of to_h as JSON; opts are passed to JSON.generate.
    def to_json(opts = {})
      JSON.generate(to_h, opts)
    end

    private

    # Helper for recursion into descendants: yields (node, depth) for the
    # starting node (self by default) and then for each descendant.
    def depth_first_recurse(node = nil, depth = 0, &block)
      node = self if node.nil?
      yield node, depth
      # tolerate nil children, as to_h does
      (node.children || []).each do |child|
        depth_first_recurse(child, depth + 1, &block)
      end
    end
  end
end
|
@@ -0,0 +1,147 @@
|
|
1
|
+
require 'matrix'
|
2
|
+
require 'tf-idf-similarity'
|
3
|
+
require 'fast_html_diff'
|
4
|
+
|
5
|
+
module SimilarityTree
  # Table of the diff/similarity scores between different text documents
  class SimilarityMatrix
    # Initialize a matrix for a set of documents.
    #
    # sources - the documents: raw strings, filenames, or arbitrary objects
    #           (combined with the :id_func / :content_func options)
    # options - overrides for the values returned by default_options
    def initialize(sources, options = {})
      @sources = sources
      @config = default_options.merge(options)

      @id = -1 # not read anywhere in this class; kept for compatibility
      @source_index = {}
      @matrix = nil
    end

    # Calculate the scores with the configured :calculation_method and return
    # them as a hash of hashes: matrix[id_a][id_b] => score.
    # Raises a RuntimeError for an unknown calculation method.
    def calculate
      @matrix =
        case @config[:calculation_method]
        when :tf_idf then calculate_with_tf_idf
        when :diff then calculate_with_diff
        else raise "Unknown calculation type"
        end
    end

    # Build the similarity tree rooted at root_id and return its root node.
    # The matrix is calculated first if that has not happened yet.
    # score_threshold is passed through to SimilarityTree.
    def build_tree(root_id, score_threshold = 0)
      calculate if @matrix.nil?
      tree = SimilarityTree.new(root_id, @matrix, score_threshold).build

      # populate the nodes with the sources they were computed from
      tree.each_node { |node| node.content = @source_index[node.id] }
      tree
    end

    private

    def default_options
      {
        id_func: nil,
        content_func: nil,
        calculation_method: :tf_idf,
        show_progress: false
      }
    end

    # Returns a progress bar when :show_progress is set, nil otherwise.
    # The block supplies the total and is only evaluated when a bar is built.
    # NOTE(review): ProgressBar is not required in this file; presumably the
    # gem's entry file loads it -- confirm.
    def build_progress_bar
      return nil unless @config[:show_progress]

      ProgressBar.create(format: '%a |%B| %p%% %e', length: 80, smoothing: 0.5, total: yield)
    end

    # Create a similarity matrix using the tf-idf model's similarity matrix,
    # with every score rounded to 6 decimals.
    def calculate_with_tf_idf
      progress_bar = build_progress_bar { @sources.length }

      # iterate through the input texts and build the tf_idf corpus
      corpus = []
      ids = @sources.map do |source|
        corpus << TfIdfSimilarity::Document.new(text_of(source))
        progress_bar.increment unless progress_bar.nil?
        id_of(source)
      end
      model = TfIdfSimilarity::TfIdfModel.new(corpus, function: :tf_idf)
      similarity_matrix = model.similarity_matrix

      # compile the results into a hash of hashes keyed by document id
      matrix = {}
      ids.each_with_index do |row_id, i|
        matrix[row_id] = {}
        ids.each_with_index do |col_id, j|
          matrix[row_id][col_id] = similarity_matrix[i, j].round(6)
        end
      end
      matrix
    end

    # Create a similarity matrix, using diff as the similarity measure, based on the difference of WORDS (not characters)
    # (only counts insertions and deletions, not substitution and transposition).
    def calculate_with_diff
      sources = @sources.to_a
      progress_bar = build_progress_bar { sources.length * (sources.length - 1) / 2 }

      # Resolve ids and texts once per document rather than once per pair, so
      # files are not re-read for every comparison.
      ids = sources.map { |source| id_of(source) }
      texts = sources.map { |source| text_of(source) }

      # Seed every row with its self-similarity so that even a single document
      # yields a usable matrix.
      matrix = {}
      ids.each { |id| matrix[id] ||= { id => 1 } }

      ids.each_with_index do |a_id, i|
        ((i + 1)...ids.length).each do |j|
          b_id = ids[j]
          stats = FastHtmlDiff::DiffBuilder.new(texts[i], texts[j]).statistics
          similarity = dice_coefficient(stats)

          matrix[a_id][b_id] = similarity
          matrix[b_id][a_id] = similarity

          progress_bar.increment unless progress_bar.nil?
        end
      end
      matrix
    end

    # Dice's coefficient (http://en.wikipedia.org/wiki/Dice%27s_coefficient)
    # from the word counts under :matches, :insertions and :deletions.
    # Two documents with no words at all count as identical (1.0) instead of
    # producing NaN from 0 / 0.0.
    def dice_coefficient(stats)
      matched = 2 * stats[:matches][:words]
      total_count = matched + stats[:insertions][:words] + stats[:deletions][:words]
      return 1.0 if total_count.zero?

      matched / total_count.to_f
    end

    # Determine the id of a source: the result of :id_func when configured,
    # else the file basename for filenames, else the index within the sources.
    def id_of(source)
      id =
        if !@config[:id_func].nil?
          source.send(@config[:id_func].to_s)
        elsif is_a_filename?(source)
          File.basename(source)
        else
          @sources.find_index(source)
        end

      # maintain an index of id => source (the first source seen for an id wins)
      @source_index[id] = source if @source_index[id].nil?
      id
    end

    # Determine the text of a source: the result of :content_func when
    # configured, else the source itself. If that value names an existing
    # file, the file's contents are returned.
    def text_of(source)
      txt = @config[:content_func].nil? ? source : source.send(@config[:content_func].to_s)
      txt = File.read(txt) if is_a_filename?(txt)
      txt
    end

    # quick and dirty check on whether a value is a filename: it must be a
    # String shorter than 512 characters that names an existing file
    def is_a_filename?(filename)
      filename.is_a?(String) && filename.length < 512 && File.exist?(filename)
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module SimilarityTree

  # Constructs a hierarchy of nodes based on a specified root and the similarity "scores" between nodes. Each node is placed next
  # to the node to which it is most similar; as between two nodes, the node most similar to the root is placed closest to the root.
  class SimilarityTree
    # Initialize the tree from an existing similarity matrix.
    #
    # root_id           - id (a key of the matrix) of the root document
    # similarity_matrix - hash of hashes: matrix[id_a][id_b] => score
    # score_threshold   - nodes scoring below this against their parent are pruned
    #
    # Raises ArgumentError when root_id is not a key of the matrix.
    def initialize(root_id, similarity_matrix, score_threshold = 0)
      @nodes = similarity_matrix.map { |key, _row| Node.new(key, 0) }
      @root = @nodes.find { |node| node.id == root_id }
      raise ArgumentError, "root id #{root_id.inspect} not found in similarity matrix" if @root.nil?

      @root.diff_score = nil
      @similarity_matrix = similarity_matrix
      @score_threshold = score_threshold
      @built = false
    end

    # Build the tree and return the root node. The tree is only built once;
    # later calls return the same root without attaching nodes again.
    def build
      unless @built
        build_tree
        @built = true
      end
      @root
    end

    private

    # Places every non-root node into the tree, in matrix order, then prunes.
    def build_tree
      placed = [@root]

      @nodes.each do |node|
        next if node == @root

        closest = closest_match(node, placed)

        # if the closest match is the root node, or if the closest match's diff score with its parent is stronger
        # than between the present node and that parent, add as a child of the match
        if closest == @root || closest.diff_score >= score(node, closest.parent)
          attach(node, closest)
        # else, if the new node is more similar to the parent, rotate so that the existing node becomes the child
        else
          insert_above(node, closest)
        end

        placed << node
      end
      prune(placed)
    end

    # Looks up the score stored for the pair (a, b) as matrix[a.id][b.id].
    def score(a, b)
      @similarity_matrix[a.id][b.id]
    end

    # Returns the candidate with the highest score against node; on ties the
    # earliest candidate wins.
    def closest_match(node, candidates)
      best = nil
      best_score = 0
      candidates.each do |candidate|
        candidate_score = score(node, candidate)
        if best.nil? || candidate_score > best_score
          best = candidate
          best_score = candidate_score
        end
      end
      best
    end

    # Adds node as a child of parent and records their score on node.
    def attach(node, parent)
      node.parent = parent
      parent.children << node
      node.diff_score = score(node, parent)
    end

    # Inserts node between existing and existing's parent, so that existing
    # becomes a child of node.
    def insert_above(node, existing)
      # place children with the closest matching of the two
      existing.children.dup.each do |child|
        next unless score(child, node) > child.diff_score

        child.parent = node
        existing.children.delete(child)
        node.children << child
        child.diff_score = score(child, node)
      end

      # connect the new node to the parent
      parent = existing.parent
      node.parent = parent
      parent.children << node
      node.diff_score = score(node, parent)

      # add the existing node as a child
      existing.parent = node
      parent.children.delete(existing)
      node.children << existing
      existing.diff_score = score(existing, node)
    end

    # prune away nodes that don't meet the configured score threshold
    # (a pruned node is detached from its parent together with its subtree)
    def prune(nodes)
      nodes.each do |node|
        next if node == @root

        node.parent.children.delete(node) if node.diff_score < @score_threshold
      end
    end
  end
end
|