similarity_tree 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MzViYTMyMjQzZGU0ZTc2ZDc2OGI2NWI3NDYyNWYwZGM2MDFjMzc5MQ==
5
+ data.tar.gz: !binary |-
6
+ MDJmOWI2N2QxOTA2YTcwZjA4NThjZDhjZjRjMjAxOTMzY2Y5MTcyMw==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ Mzk0ZTllMGRiOGJmOTI3OTAwNWQyN2U2ZjhiNWNlNGExMDczODc2YjUwYzJh
10
+ MTI3N2ZjNDM1ZTEyY2MzODEzNmUwYjgyODZkOTRlOTg0ODU5NDZlMGM5YmFi
11
+ MTU4NmNiYjE0NDM0ZmEwNGE4ZTgzZGJhNGRiY2U1ZWQ3OTJlYTA=
12
+ data.tar.gz: !binary |-
13
+ MzE0YjkzOGI2NDQ5ODE4YmQwMmViMjQ2NjllZmZhOWQzNTU1MDVjNWVkMGY1
14
+ NTYzZGY3NjIwY2I2NDVhMjUyODllOTZkNWM5OTYwYWViMWIwMmU1OWUyM2Rh
15
+ YzMzNzE3ZmVhZGZiNDgxM2ExZDRkNDhjZDFjYTE4NmZkMDdjYjA=
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in similarity_tree.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2013 Kent Mewhort
2
+ Copyright (c) 2012 Open North Inc.
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,96 @@
1
+ # SimilarityTree
2
+
3
+ This library allows you to generate a tree representing branches/revisions to a set of text or HTML files, without any
4
+ prior knowledge of the timelines or change history necessary. You simply need to know the original source document and
5
+ this library builds a tree based on the extent of differences between each document.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'similarity_tree'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install similarity_tree
20
+
21
+ ## Usage
22
+
23
+ Generating a tree takes two steps: build a "similarity matrix" of the diff scores between the documents, then generate the tree from this matrix.
24
+ First, build the "similarity matrix" of the diff scores between the different documents. You must input a set of HTML or
25
+ text documents. Then, to build the tree itself, you need to specify the document id or filename of the original/root
26
+ document. Eg. for the set of different Creative Commons licences in the test dir:
27
+
28
+ documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html')
29
+ tree = SimilarityTree::SimilarityMatrix.new(documents).build_tree("CC-BY-3.0.html")
30
+ puts tree.to_s # to_h and to_json are also available as other tree output formats
31
+
32
+ Result:
33
+
34
+ CC-BY-3.0.html
35
+ -CC-BY-NC-3.0.html (0.9197574893009985)
36
+ --CC-BY-NC-SA-3.0.html (0.9503146737330241)
37
+ --CC-BY-NC-ND-3.0.html (0.9456402772710689)
38
+ -CC-BY-ND-3.0.html (0.9434472109631346)
39
+
40
+ You can operate directly on **strings** rather than files (in this case, the node ids in the tree will be the array indices of the documents):
41
+
42
+ documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html').map { |f| File.read(f) }
43
+ tree = SimilarityTree::SimilarityMatrix.new(documents).build_tree(0) # pass the array index of the root document
44
+ puts tree.to_s # to_h and to_json are also available as other tree output formats
45
+
46
+ CC-BY-3.0.html
47
+ -CC-BY-NC-3.0.html (0.9197574893009985)
48
+ --CC-BY-NC-SA-3.0.html (0.9503146737330241)
49
+ --CC-BY-NC-ND-3.0.html (0.9456402772710689)
50
+ -CC-BY-ND-3.0.html (0.9434472109631346)
51
+
52
+ Result:
53
+ 0
54
+ -1 (0.9197574893009985)
55
+ --3 (0.9503146737330241)
56
+ --4 (0.9456402772710689)
57
+ -2 (0.9434472109631346)
58
+
59
+ Or, you can use any **enumerable list of objects** (eg. ActiveRecords) as the inputs. Consider:
60
+
61
+ class Document < ActiveRecord::Base
62
+ attr_accessible :title, :text_filename
63
+ ...
64
+ end
65
+
66
+ You can call:
67
+
68
+ tree = SimilarityTree::SimilarityMatrix.new(Document.all,
69
+ id_func: :title, content_func: :text_filename).build_tree(Document.first.title)
70
+
71
+ ## Additional Options
72
+
73
+ ### Calculation method
74
+
75
+ You can use either the **term frequency–inverse document frequency** (:tf_idf, the default) or **Dice's coefficient** from a
76
+ standard unix-style diff to calculate the diff scores. Tf-idf works much better where a document has a lot of transpositions
77
+ (that is, "cut and pastes" of sections of text into different locations) and is also often faster. However, if your intent
78
+ is to show diffs of the text, the :diff option will correlate better to your diff rendering.
79
+
80
+ documents = Dir.glob('../../similarity_tree/test/cc_licences/*.html')
81
+ tf_idf_tree = SimilarityTree::SimilarityMatrix.new(documents,
82
+ calculation_method: :tf_idf).build_tree("CC-BY-3.0.html")
83
+ diff_tree = SimilarityTree::SimilarityMatrix.new(documents,
84
+ calculation_method: :diff).build_tree("CC-BY-3.0.html")
85
+
86
+ ### Progress output
87
+
88
+ Performing all the diffs to build a similarity matrix can take a while for large document sets. If you're using this
89
+ gem from a script or a console, you can add a progress bar:
90
+
91
+ tree = SimilarityTree::SimilarityMatrix.new(documents, show_progress: true).build_tree(id)
92
+
93
+ ## Licence and Credits
94
+
95
+ (c) 2012-2013, Kent Mewhort (similarity tree) and Open North (original similarity_matrix implementation, see https://github.com/jpmckinney/clip-analysis),
96
+ licensed under MIT. See LICENSE.txt for details.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,4 @@
1
+ require "similarity_tree/version"
2
+ require "similarity_tree/node"
3
+ require "similarity_tree/similarity_matrix"
4
+ require "similarity_tree/similarity_tree"
@@ -0,0 +1,53 @@
1
+ require 'json'
2
module SimilarityTree
  # One node of a similarity tree: an id, the score stored for the link to its
  # parent, references to its parent and children, and an optional content
  # payload that can contribute extra attributes to the hash/JSON output.
  class Node
    attr_accessor :id, :diff_score, :parent, :children, :content

    # @param id [Object] identifier of the node (printed with #to_s)
    # @param diff_score [Numeric, nil] score stored on the node; nil is left out of all output formats
    # @param parent [Node, nil] parent node, if any
    # @param children [Array<Node>] child nodes
    # @param content [Object, nil] arbitrary payload attached to the node
    def initialize(id, diff_score, parent = nil, children = [], content = nil)
      @id = id
      @diff_score = diff_score
      @parent = parent
      @children = children
      @content = content
    end

    # Yields self and all descendants, depth first (a node before its children).
    # Returns an Enumerator when called without a block.
    def each_node
      return enum_for(:each_node) unless block_given?

      depth_first_recurse { |n, _depth| yield n }
    end

    # One line per node: the id prefixed with one "-" per level of depth,
    # followed by the score in parentheses when the node has one.
    def to_s
      lines = []
      depth_first_recurse do |n, depth|
        line = ("-" * depth) + n.id.to_s
        line += ' (' + n.diff_score.to_s + ')' unless n.diff_score.nil?
        lines << line + "\n"
      end
      lines.join
    end

    # Nested hash of the node and its descendants. :children and :diff_score
    # are only present when there is something to report.
    def to_h
      result = {
        id: id
      }
      result[:children] = children.map(&:to_h) unless children.nil? || children.empty?
      result[:diff_score] = diff_score unless diff_score.nil?

      # If the content object has an as_json method that produces a hash, merge
      # in those attributes. The check is made on the as_json *result* so that
      # non-Hash content objects are merged too; the node's own keys win on
      # conflict.
      if content.respond_to?(:as_json)
        attributes = content.as_json
        result = attributes.merge(result) if attributes.is_a?(Hash)
      end
      result
    end

    # JSON text for #to_h; opts is handed to JSON.generate unchanged.
    def to_json(opts = {})
      JSON.generate(to_h, opts)
    end

    private

    # Helper for recursion into descendants: yields (node, depth) for the
    # starting node (self by default) and then for each of its descendants.
    def depth_first_recurse(node = nil, depth = 0, &block)
      node = self if node.nil?
      yield node, depth
      # tolerate nil children, as #to_h does
      (node.children || []).each do |child|
        depth_first_recurse(child, depth + 1, &block)
      end
    end
  end
end
@@ -0,0 +1,147 @@
1
+ require 'matrix'
2
+ require 'tf-idf-similarity'
3
+ require 'fast_html_diff'
4
+
5
module SimilarityTree
  # Table of the diff/similarity scores between different text documents.
  #
  # The matrix is a hash of hashes, matrix[id_a][id_b] => score, keyed by the
  # ids of the source documents.
  class SimilarityMatrix
    # Initialize a matrix for a set of documents.
    #
    # @param sources [Enumerable] file names, document strings, or arbitrary
    #   objects (in which case :id_func / :content_func name the methods to call)
    # @param options [Hash] overrides for the defaults in #default_options:
    #   :id_func, :content_func, :calculation_method (:tf_idf or :diff),
    #   :show_progress
    def initialize(sources, options = {})
      @sources = sources
      @config = default_options.merge(options)

      @source_index = {}
      @matrix = nil
    end

    # Calculate the scores with the configured method, remember them, and
    # return them as a hash of hashes.
    #
    # @raise [RuntimeError] when :calculation_method is neither :tf_idf nor :diff
    def calculate
      @matrix =
        case @config[:calculation_method]
        when :tf_idf then calculate_with_tf_idf
        when :diff then calculate_with_diff
        else raise "Unknown calculation type"
        end
    end

    # Build the similarity tree rooted at root_id (calculating the matrix first
    # if that has not happened yet) and return its root node.
    def build_tree(root_id, score_threshold = 0)
      calculate if @matrix.nil?
      tree = SimilarityTree.new(root_id, @matrix, score_threshold).build

      # populate the nodes with the sources they were built from
      tree.each_node { |n| n.content = @source_index[n.id] }
      tree
    end

    private

    def default_options
      {
        id_func: nil,
        content_func: nil,
        calculation_method: :tf_idf,
        show_progress: false
      }
    end

    # NOTE(review): ProgressBar is not required anywhere in this file;
    # presumably the caller must load it before using show_progress: true.
    def create_progress_bar(total)
      ProgressBar.create format: '%a |%B| %p%% %e', length: 80, smoothing: 0.5,
                         total: total
    end

    def calculate_with_tf_idf
      progress_bar = create_progress_bar(@sources.length) if @config[:show_progress]

      # iterate through the input texts and build the tf_idf corpus
      corpus = []
      ids = @sources.each_with_index.map do |source, position|
        corpus << TfIdfSimilarity::Document.new(text_of(source))
        progress_bar.increment unless progress_bar.nil?
        id_of(source, position)
      end
      model = TfIdfSimilarity::TfIdfModel.new(corpus, function: :tf_idf)
      similarity_matrix = model.similarity_matrix

      # compile the results into an ordinary hash of hashes keyed by document id
      matrix = {}
      ids.each_with_index do |a, i|
        matrix[a] = {}
        ids.each_with_index do |b, j|
          matrix[a][b] = similarity_matrix[i, j].round(6)
        end
      end
      matrix
    end

    # Create a similarity matrix, using diff as the similarity measure, based on the difference of WORDS (not characters)
    # (only counts insertions and deletions, not substitution and transposition).
    def calculate_with_diff
      sources = @sources.to_a
      progress_bar = create_progress_bar(sources.length * (sources.length - 1) / 2) if @config[:show_progress]

      # Resolve every id and text once, instead of once per compared pair.
      documents = sources.each_with_index.map do |source, position|
        [id_of(source, position), text_of(source)]
      end

      # Seed the self-similarities so even a single document yields a usable matrix.
      matrix = {}
      documents.each { |id, _text| matrix[id] ||= { id => 1 } }

      documents.each_with_index do |(a_id, a_text), i|
        documents.drop(i + 1).each do |b_id, b_text|
          stats = FastHtmlDiff::DiffBuilder.new(a_text, b_text).statistics

          # http://en.wikipedia.org/wiki/Dice%27s_coefficient
          matches = stats[:matches][:words]
          total_count = 2 * matches + stats[:insertions][:words] + stats[:deletions][:words]
          # two documents without any words are treated as identical (avoids 0/0 => NaN)
          similarity = total_count.zero? ? 1.0 : 2 * matches / total_count.to_f

          matrix[a_id][b_id] = similarity
          matrix[b_id][a_id] = similarity

          progress_bar.increment unless progress_bar.nil?
        end
      end
      matrix
    end

    # Id of a source: the result of :id_func when configured, else the file's
    # basename for file names, else the position of the source in the input.
    # Pass position when known so that identical documents keep distinct ids;
    # without it the first matching index is used.
    def id_of(source, position = nil)
      id =
        if !@config[:id_func].nil?
          source.send @config[:id_func].to_s
        elsif is_a_filename?(source)
          File.basename(source)
        else
          position.nil? ? @sources.find_index(source) : position
        end

      # maintain an index of id => source
      @source_index[id] = source if @source_index[id].nil?
      id
    end

    # Text of a source: the result of :content_func when configured, else the
    # source itself; a value that names an existing file is replaced by the
    # file's contents.
    def text_of(source)
      txt = @config[:content_func].nil? ? source : source.send(@config[:content_func].to_s)
      txt = File.read(txt) if is_a_filename?(txt)
      txt
    end

    # quick and dirty check on whether a value is a filename: a short string
    # that names an existing regular file (directories cannot be read as text)
    def is_a_filename?(filename)
      filename.is_a?(String) && (filename.length < 512) && !filename.include?("\0") && File.file?(filename)
    end
  end
end
@@ -0,0 +1,81 @@
1
module SimilarityTree
  # Constructs a hierarchy of nodes based on a specified root and the similarity "scores" between nodes. Each node is placed next
  # to the node to which it is most similar; as between two nodes, the node most similar to the root is placed closest to the root.
  class SimilarityTree
    # Prepare one node per key of an existing similarity matrix.
    #
    # @param root_id [Object] key of the matrix entry that becomes the root
    # @param similarity_matrix [Hash] hash of hashes, matrix[id_a][id_b] => score
    # @param score_threshold [Numeric] nodes whose score is below this are pruned
    # @raise [ArgumentError] when root_id is not a key of the matrix
    def initialize(root_id, similarity_matrix, score_threshold = 0)
      @nodes = similarity_matrix.map { |key, _row| Node.new(key, 0) }
      @root = @nodes.find { |n| n.id == root_id }
      if @root.nil?
        raise ArgumentError, "root id #{root_id.inspect} is not present in the similarity matrix"
      end
      @root.diff_score = nil
      @similarity_matrix = similarity_matrix
      @score_threshold = score_threshold
    end

    # Build the tree and return the root node. The hierarchy is only assembled
    # on the first call, so repeated calls do not attach nodes twice.
    def build
      unless @built
        build_tree
        @built = true
      end
      @root
    end

    private

    def build_tree
      placed = [@root]

      # for each non-root node
      @nodes.reject { |n| n == @root }.each do |n|
        # find the best match to the nodes already in the tree (first one wins on ties)
        closest_diff_score = 0
        closest = nil
        placed.each do |m|
          diff_score = @similarity_matrix[n.id][m.id]
          if closest.nil? || (diff_score > closest_diff_score)
            closest_diff_score = diff_score
            closest = m
          end
        end

        # if the closest match is the root node, or if the closest match's diff score with its parent is stronger
        # than between the present node and that parent, add as a child of the match
        if (closest == @root) || (closest.diff_score >= @similarity_matrix[n.id][closest.parent.id])
          n.parent = closest
          closest.children << n
          n.diff_score = @similarity_matrix[n.id][closest.id]
        # else, if the new node is more similar to the parent, rotate so that the existing node becomes the child
        else
          # place children with the closest matching of the two
          # (iterate over a copy, since children are removed while looping)
          closest.children.dup.each do |child|
            next unless @similarity_matrix[child.id][n.id] > child.diff_score

            child.parent = n
            closest.children.delete_if { |child_i| child_i == child }
            n.children << child
            child.diff_score = @similarity_matrix[child.id][n.id]
          end

          # connect the new node to the parent
          n.parent = closest.parent
          n.parent.children << n
          n.diff_score = @similarity_matrix[n.id][n.parent.id]

          # add the existing node as a child
          closest.parent = n
          n.parent.children.delete_if { |child_i| child_i == closest }
          n.children << closest
          closest.diff_score = @similarity_matrix[closest.id][n.id]
        end

        placed << n
      end
      prune(placed)
    end

    # prune away nodes that don't meet the configured score threshold;
    # detaching a node from its parent drops its whole subtree from the tree
    def prune(nodes)
      nodes.each do |node|
        if (node != @root) && (node.diff_score < @score_threshold)
          node.parent.children.reject! { |n| n == node }
        end
      end
    end
  end
end