similarity_tree 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MzViYTMyMjQzZGU0ZTc2ZDc2OGI2NWI3NDYyNWYwZGM2MDFjMzc5MQ==
5
+ data.tar.gz: !binary |-
6
+ MDJmOWI2N2QxOTA2YTcwZjA4NThjZDhjZjRjMjAxOTMzY2Y5MTcyMw==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ Mzk0ZTllMGRiOGJmOTI3OTAwNWQyN2U2ZjhiNWNlNGExMDczODc2YjUwYzJh
10
+ MTI3N2ZjNDM1ZTEyY2MzODEzNmUwYjgyODZkOTRlOTg0ODU5NDZlMGM5YmFi
11
+ MTU4NmNiYjE0NDM0ZmEwNGE4ZTgzZGJhNGRiY2U1ZWQ3OTJlYTA=
12
+ data.tar.gz: !binary |-
13
+ MzE0YjkzOGI2NDQ5ODE4YmQwMmViMjQ2NjllZmZhOWQzNTU1MDVjNWVkMGY1
14
+ NTYzZGY3NjIwY2I2NDVhMjUyODllOTZkNWM5OTYwYWViMWIwMmU1OWUyM2Rh
15
+ YzMzNzE3ZmVhZGZiNDgxM2ExZDRkNDhjZDFjYTE4NmZkMDdjYjA=
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in similarity_tree.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2013 Kent Mewhort
2
+ Copyright (c) 2012 Open North Inc.
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,96 @@
1
+ # SimilarityTree
2
+
3
+ This library allows you to generate a tree representing branches/revisions to a set of text or HTML files, without any
4
+ prior knowledge of the timelines or change history necessary. You simply need to know the original source document and
5
+ this library builds a tree based on the extent of differences between each document.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'similarity_tree'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install similarity_tree
20
+
21
+ ## Usage
22
+
23
+ Build a "similarity matrix" of the diff scores between the different documents, then generate the tree from this matrix.
24
+ First, build the "similarity matrix" of the diff scores between the different documents. You must input a set of HTML or
25
+ text documents. Then, to build the tree itself, you need to specify the document id or filename of the original/root
26
+ document. Eg. for the set of different Creative Commons licences in the test dir:
27
+
28
+ documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html')
29
+ tree = SimilarityTree::SimilarityMatrix.new(documents).build_tree("CC-BY-3.0.html")
30
+ puts tree.to_s # to_h and to_json are also available as other tree output formats
31
+
32
+ Result:
33
+
34
+ CC-BY-3.0.html
35
+ -CC-BY-NC-3.0.html (0.9197574893009985)
36
+ --CC-BY-NC-SA-3.0.html (0.9503146737330241)
37
+ --CC-BY-NC-ND-3.0.html (0.9456402772710689)
38
+ -CC-BY-ND-3.0.html (0.9434472109631346)
39
+
40
+ You can operate directly on **strings** rather than files (in this case, the node id's in the tree will be the file array indices):
41
+
42
+ documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html').map { |f| File.read(f) }
43
+ tree = SimilarityTree::SimilarityMatrix.new(documents).build_tree(0) # the root is identified by its array index
44
+ puts tree.to_s # to_h and to_json are also available as other tree output formats
45
+
46
+ CC-BY-3.0.html
47
+ -CC-BY-NC-3.0.html (0.9197574893009985)
48
+ --CC-BY-NC-SA-3.0.html (0.9503146737330241)
49
+ --CC-BY-NC-ND-3.0.html (0.9456402772710689)
50
+ -CC-BY-ND-3.0.html (0.9434472109631346)
51
+
52
+ Result:
53
+ 0
54
+ -1 (0.9197574893009985)
55
+ --3 (0.9503146737330241)
56
+ --4 (0.9456402772710689)
57
+ -2 (0.9434472109631346)
58
+
59
+ Or, you can use any **enumerable list of objects** (eg. ActiveRecords) as the inputs. Consider:
60
+
61
+ class Document < ActiveRecord::Base
62
+ attr_accessible :title, :text_filename
63
+ ...
64
+ end
65
+
66
+ You can call:
67
+
68
+ tree = SimilarityTree::SimilarityMatrix.new(Document.all,
69
+ id_func: :title, content_func: :text_filename).build_tree(Document.first.title)
70
+
71
+ ## Additional Options
72
+
73
+ ### Calculation method
74
+
75
+ You can use either the **term frequency–inverse document frequency** (:tf_idf, the default) or **Dice's coefficient** from a
76
+ standard unix-style diff to calculate the diff scores. Tf-idf works much better where a document has a lot of translations
77
+ (that is, "cut and pastes" of sections of text into different locations) and is also often faster. However, if your intent
78
+ is to show diffs of the text, the :diff option will correlate better to your diff rendering.
79
+
80
+ documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html')
81
+ tf_idf_tree = SimilarityTree::SimilarityMatrix.new(documents,
82
+ calculation_method: :tf_idf).build_tree("CC-BY-3.0.html")
83
+ diff_tree = SimilarityTree::SimilarityMatrix.new(documents,
84
+ calculation_method: :diff).build_tree("CC-BY-3.0.html")
85
+
86
+ ### Progress output
87
+
88
+ Performing all the diffs to build a similarity matrix can take a while for large document sets. If you're using this
89
+ gem from a script or a console, you can add a progress bar:
90
+
91
+ tree = SimilarityTree::SimilarityMatrix.new(documents, show_progress: true).build_tree(id)
92
+
93
+ ## Licence and Credits
94
+
95
+ (c) 2012-2013, Kent Mewhort (similarity tree) and Open North (original similarity_matrix implementation, see https://github.com/jpmckinney/clip-analysis),
96
+ licensed under MIT. See LICENSE.txt for details.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,4 @@
1
+ require "similarity_tree/version"
2
+ require "similarity_tree/node"
3
+ require "similarity_tree/similarity_matrix"
4
+ require "similarity_tree/similarity_tree"
@@ -0,0 +1,53 @@
1
+ require 'json'
2
module SimilarityTree
  # A single node of a similarity tree.
  #
  # Each node carries an id, the similarity score between the node and its
  # parent (+diff_score+, nil when no score has been assigned), links to its
  # parent and children, and an optional +content+ payload.
  class Node
    attr_accessor :id, :diff_score, :parent, :children, :content

    # @param id          identifier of the node
    # @param diff_score  similarity score with the parent, or nil
    # @param parent      parent Node, or nil
    # @param children    array of child Nodes
    # @param content     arbitrary payload attached to the node
    def initialize(id, diff_score, parent = nil, children = [], content = nil)
      @id = id
      @diff_score = diff_score
      @parent = parent
      @children = children
      @content = content
    end

    # Yields self and all descendants, depth first.
    # Returns an Enumerator when called without a block.
    def each_node
      return enum_for(:each_node) unless block_given?

      depth_first_recurse { |node, _depth| yield node }
    end

    # Renders the subtree as text: one line per node, prefixed with one "-"
    # per level of depth and followed by the score in parentheses when set.
    def to_s
      lines = []
      depth_first_recurse do |node, depth|
        line = ('-' * depth) + node.id.to_s
        line += " (#{node.diff_score})" unless node.diff_score.nil?
        lines << "#{line}\n"
      end
      lines.join
    end

    # Returns the subtree as a nested Hash with the keys :id, :children (only
    # when there are children) and :diff_score (only when set).
    #
    # When the content responds to +as_json+ and that call returns a Hash, its
    # entries are merged in as well; the node's own keys win on conflict.
    def to_h
      result = { id: id }
      result[:children] = children.map(&:to_h) unless children.nil? || children.empty?
      result[:diff_score] = diff_score unless diff_score.nil?

      # The check is made on the *result* of as_json, not on the content itself,
      # so that ordinary objects exposing as_json get merged too.
      if content.respond_to?(:as_json)
        extra = content.as_json
        result = extra.merge(result) if extra.is_a?(Hash)
      end
      result
    end

    # Serializes the result of #to_h with JSON.generate.
    def to_json(opts = {})
      JSON.generate(to_h, opts)
    end

    private

    # Helper for recursion into descendants: calls the block with each node and
    # its depth relative to the starting node (self by default).
    def depth_first_recurse(node = nil, depth = 0, &block)
      node = self if node.nil?
      block.call(node, depth)
      # tolerate nil children, as #to_h does
      (node.children || []).each do |child|
        depth_first_recurse(child, depth + 1, &block)
      end
    end
  end
end
@@ -0,0 +1,147 @@
1
+ require 'matrix'
2
+ require 'tf-idf-similarity'
3
+ require 'fast_html_diff'
4
+
5
module SimilarityTree
  # Table of the diff/similarity scores between different text documents.
  #
  # The matrix is a Hash of Hashes: matrix[id_a][id_b] is the similarity
  # score between the documents identified by id_a and id_b.
  class SimilarityMatrix
    # Strings at least this long are treated as document text, never as filenames.
    MAX_FILENAME_LENGTH = 512

    # Initialize a matrix for a set of documents.
    #
    # @param sources  list of filenames, document strings, or arbitrary objects
    # @param options  overrides for #default_options:
    #   :id_func            method name sent to each source to obtain its id
    #   :content_func       method name sent to each source to obtain its text
    #                       (or the name of the file holding its text)
    #   :calculation_method :tf_idf (default) or :diff
    #   :show_progress      when true, a ProgressBar is created and advanced
    def initialize(sources, options = {})
      @sources = sources
      @config = default_options.merge(options)

      @source_index = {}
      @matrix = nil
    end

    # Calculate the matrix with the configured method, cache it and return it.
    #
    # @raise [RuntimeError] if :calculation_method is neither :tf_idf nor :diff
    def calculate
      @matrix =
        case @config[:calculation_method]
        when :tf_idf then calculate_with_tf_idf
        when :diff then calculate_with_diff
        else raise "Unknown calculation type"
        end
    end

    # Build the similarity tree rooted at +root_id+ and return its root node.
    # The matrix is calculated first if that has not happened yet.
    def build_tree(root_id, score_threshold = 0)
      calculate if @matrix.nil?
      tree = SimilarityTree.new(root_id, @matrix, score_threshold).build

      # populate the nodes with the sources recorded while building the matrix
      tree.each_node { |node| node.content = @source_index[node.id] }
      tree
    end

    private

    def default_options
      {
        id_func: nil,
        content_func: nil,
        calculation_method: :tf_idf,
        show_progress: false
      }
    end

    # Returns a progress bar sized for +total+ steps, or nil when progress
    # output is disabled.
    def build_progress_bar(total)
      return nil unless @config[:show_progress]

      ProgressBar.create format: '%a |%B| %p%% %e', length: 80, smoothing: 0.5, total: total
    end

    def calculate_with_tf_idf
      progress_bar = build_progress_bar(@sources.length)

      # iterate through the input texts and build the tf_idf corpus
      corpus = []
      ids = @sources.each_with_index.map do |source, index|
        corpus << TfIdfSimilarity::Document.new(text_of(source))
        progress_bar.increment unless progress_bar.nil?
        id_of(source, index)
      end
      model = TfIdfSimilarity::TfIdfModel.new(corpus, function: :tf_idf)
      similarity_matrix = model.similarity_matrix

      # compile the results into a hash of hashes keyed by document id
      matrix = {}
      ids.each_with_index do |a, i|
        matrix[a] = {}
        ids.each_with_index do |b, j|
          matrix[a][b] = similarity_matrix[i, j].round(6)
        end
      end
      matrix
    end

    # Create a similarity matrix, using diff as the similarity measure, based on the difference of WORDS (not characters)
    # (only counts insertions and deletions, not substitution and transposition).
    def calculate_with_diff
      count = @sources.length
      progress_bar = build_progress_bar(count * (count - 1) / 2)

      # resolve every id and text exactly once instead of once per pair
      documents = @sources.each_with_index.map do |source, index|
        [id_of(source, index), text_of(source)]
      end

      # every document gets a row, even when there is nothing to compare it with
      matrix = {}
      documents.each { |id, _text| matrix[id] = { id => 1 } }

      documents.each_with_index do |(a_id, a_text), i|
        documents[(i + 1)..-1].each do |b_id, b_text|
          stats = FastHtmlDiff::DiffBuilder.new(a_text, b_text).statistics
          similarity = dice_coefficient(stats)

          matrix[a_id][b_id] = similarity
          matrix[b_id][a_id] = similarity

          progress_bar.increment unless progress_bar.nil?
        end
      end
      matrix
    end

    # Dice's coefficient over the word counts of a diff.
    # http://en.wikipedia.org/wiki/Dice%27s_coefficient
    def dice_coefficient(stats)
      matches = stats[:matches][:words]
      total_count = 2 * matches + stats[:insertions][:words] + stats[:deletions][:words]
      # two empty documents are identical; avoid 0 / 0.0 => NaN
      return 1.0 if total_count.zero?

      2 * matches / total_count.to_f
    end

    # Id of a source: the result of :id_func when configured, otherwise the
    # file basename for filenames, otherwise the position in the source list.
    # +index+ is the known position of the source; when omitted it is looked up.
    def id_of(source, index = nil)
      id =
        if @config[:id_func]
          source.send(@config[:id_func].to_s)
        elsif is_a_filename?(source)
          File.basename(source)
        else
          index || @sources.find_index(source)
        end

      # maintain an index of id => source (first source wins)
      @source_index[id] = source if @source_index[id].nil?
      id
    end

    # Text of a source: the result of :content_func when configured, otherwise
    # the source itself; if that value names an existing file, the file's content.
    def text_of(source)
      txt = @config[:content_func] ? source.send(@config[:content_func].to_s) : source
      is_a_filename?(txt) ? File.read(txt) : txt
    end

    # quick and dirty check on whether a value is a filename, based on the string length and whether the file exists
    def is_a_filename?(filename)
      filename.is_a?(String) && filename.length < MAX_FILENAME_LENGTH && File.exist?(filename)
    end
  end
end
@@ -0,0 +1,81 @@
1
module SimilarityTree

  # Constructs a hierarchy of nodes based on a specified root and the similarity "scores" between nodes. Each node is placed next
  # to the node to which it is most similar; as between two nodes, the node most similar to the root is placed closest to the root.
  class SimilarityTree
    # Initialize the tree from an existing similarity matrix.
    #
    # @param root_id            id (matrix key) of the root document
    # @param similarity_matrix  hash of hashes: matrix[a][b] is the score between a and b
    # @param score_threshold    nodes scoring below this against their parent are pruned
    # @raise [ArgumentError] if root_id is not a key of the matrix
    def initialize(root_id, similarity_matrix, score_threshold = 0)
      @nodes = similarity_matrix.map { |key, _row| Node.new(key, 0) }
      @root = @nodes.find { |n| n.id == root_id }
      raise ArgumentError, "root id #{root_id.inspect} not found in similarity matrix" if @root.nil?

      @root.diff_score = nil
      @similarity_matrix = similarity_matrix
      @score_threshold = score_threshold
      @built = false
    end

    # Build the tree and return the root node. The tree is only built once;
    # later calls return the same root without re-attaching any node.
    def build
      unless @built
        build_tree
        @built = true
      end
      @root
    end

    private

    def build_tree
      placed = [@root]

      # for each non-root node
      @nodes.each do |node|
        next if node == @root

        # find the best match to the nodes already in the tree
        closest = closest_match(node, placed)

        # if the closest match is the root node, or if the closest match's diff score with its parent is stronger
        # than between the present node and that parent, add as a child of the match
        if closest == @root || closest.diff_score >= score(node, closest.parent)
          attach(node, closest)
        # else, if the new node is more similar to the parent, rotate so that the existing node becomes the child
        else
          insert_above(node, closest)
        end

        placed << node
      end
      prune(placed)
    end

    # Similarity score between two nodes, read as matrix[a.id][b.id].
    def score(a, b)
      @similarity_matrix[a.id][b.id]
    end

    # The candidate with the highest score against +node+; the first one wins on ties.
    def closest_match(node, candidates)
      best = nil
      best_score = 0
      candidates.each do |candidate|
        candidate_score = score(node, candidate)
        if best.nil? || candidate_score > best_score
          best = candidate
          best_score = candidate_score
        end
      end
      best
    end

    # Make +child+ a child of +parent+ and record the score between the two.
    def attach(child, parent)
      child.parent = parent
      parent.children << child
      child.diff_score = score(child, parent)
    end

    # Insert +node+ between +existing+ and its parent, so that +existing+ becomes a child of +node+.
    def insert_above(node, existing)
      # place children with the closest matching of the two
      # (iterate over a copy, since children are removed along the way)
      existing.children.dup.each do |child|
        next unless score(child, node) > child.diff_score

        existing.children.delete(child)
        attach(child, node)
      end

      # connect the new node to the parent
      grandparent = existing.parent
      attach(node, grandparent)

      # add the existing node as a child
      grandparent.children.delete(existing)
      attach(existing, node)
    end

    # prune away nodes that don't meet the configured score threshold
    # (a pruned node takes its whole subtree with it)
    def prune(nodes)
      nodes.each do |node|
        node.parent.children.delete(node) if node != @root && node.diff_score < @score_threshold
      end
    end
  end
end