html-dom-diff 0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: c64fe7c87ae9884bf398b8c8699ee352deaa766ef92c078a1a8789badf0ad8ac
4
+ data.tar.gz: cebe07112a9f6ec7b2e573d59e95037bbec4af6fc8937afac112124730fa9bbe
5
+ SHA512:
6
+ metadata.gz: 76f6659021fbdf482dcf640c9211fbd814a6c21cb92aba508ab24a20e6be79af5b9fd1dbe349a46dec00a198e8aeacb8ed74a5ec1e8362df1e8a16c4dc027c8a
7
+ data.tar.gz: 97222b9c82eff526515ce3e2040478e0890202c3fb91d45678461ae58ebd331825109276b8dff33eec971032a00a5d8e6b21ea3ca7606becbff151890d2879b5
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2018 Frederik Fix
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # html-diff
2
+
3
+ A library for providing useful diffs between related HTML documents. Algorithm based on:
4
+
5
+ > G. Cobena, S. Abiteboul, and A. Marian, “Detecting changes in XML
6
+ > documents,” in International Conference on Data Engineering, 2002,
7
+ > pp. 41–52.
8
+ > http://gregory.cobena.free.fr/www/Publications/%5BICDE2002%5D%20XyDiff%20-%20published%20version.pdf
@@ -0,0 +1,8 @@
1
+ require 'html-dom-diff/version'
2
+ require 'html-dom-diff/delta_tree_builder'
3
+ require 'html-dom-diff/differ'
4
+ require 'html-dom-diff/node'
5
+
6
# Top-level namespace of the library. It is intentionally empty here: the
# files required above reopen it to define the version constant, the delta
# tree builder, the differ and the node class.
module HTMLDOMDiff; end
@@ -0,0 +1,41 @@
1
module HTMLDOMDiff
  # Turns the node matchings computed by the differ into a tree of
  # HTMLDOMDiff::Node objects.
  #
  # The resulting tree follows the shape of the right-hand document. Every
  # right node is wrapped together with its matched left node (if any), and
  # left nodes that have no match are appended as "removed" children of the
  # wrapper of their (matched) left parent.
  class DeltaTreeBuilder
    attr_reader :ldoc, :rdoc

    # ldoc     - root node of the left (original) tree
    # rdoc     - root node of the right (new) tree
    # weights  - node => weight hash produced by the differ (stored, but not
    #            read by this class)
    # forward  - hash mapping left nodes to their matched right nodes
    # backward - hash mapping right nodes to their matched left nodes
    def initialize(ldoc, rdoc, weights, forward, backward)
      @ldoc = ldoc
      @rdoc = rdoc
      @weights = weights
      @forward = forward
      @backward = backward
    end

    # Builds the delta tree and returns its root Node (the wrapper of rdoc).
    def build
      wrap @rdoc
    end

    private

    # Wraps +rnode+ and, recursively, all of its children. The wrapper is
    # attached to +parent+ when one is given. Unmatched children of the
    # matched left node are appended after the right-hand children.
    def wrap(rnode, parent = nil)
      lnode = @backward[rnode]
      result = Node.new(rnode, lnode, parent)

      rnode.children.each { |child| wrap(child, result) }
      parent.add_child(result) if parent

      if lnode
        lnode.children.each { |child| reverse_wrap(child, result) }
      end

      result
    end

    # Wraps a left node that has no counterpart on the right (a removal)
    # together with its unmatched descendants. Left nodes that are matched
    # are skipped, because they are reached through their right counterpart.
    def reverse_wrap(lnode, parent)
      return if @forward[lnode]

      # Pass the parent along so that removed nodes report the same parent
      # they are registered under (it used to be left nil).
      result = Node.new(nil, lnode, parent)
      lnode.children.each { |child| reverse_wrap(child, result) }
      parent.add_child(result)
    end
  end
end
@@ -0,0 +1,196 @@
1
+ require 'nokogiri'
2
+ require 'digest'
3
+ require 'pqueue'
4
+
5
module HTMLDOMDiff
  # Matches the nodes of two HTML DOM trees against each other and returns
  # the result as a delta tree (see DeltaTreeBuilder).
  #
  # Matching happens in several passes, each of which only adds matchings:
  #   1. elements carrying the same id attribute,
  #   2. top-down matching of elements whose tag name is unique among their
  #      siblings on both sides,
  #   3. matching of identical subtrees by signature, driven by a priority
  #      queue keyed on subtree weight,
  #   4. a bottom-up and a top-down propagation pass.
  class Differ
    # Parses both strings as full HTML documents and diffs their roots.
    def diff_strings(left, right)
      diff parse(left).root, parse(right).root
    end

    # Parses both strings as HTML fragments and diffs the first child node
    # of each fragment.
    def diff_fragments(left, right)
      diff parse_fragments(left).child, parse_fragments(right).child
    end

    # Diffs two already parsed trees.
    #
    # ldoc - root node of the left (original) tree
    # rdoc - root node of the right (new) tree
    #
    # Returns the root HTMLDOMDiff::Node of the delta tree.
    def diff(ldoc, rdoc)
      reset

      match_by_ids ldoc, rdoc
      prep_with @lsignatures, ldoc
      prep_with @rsignatures, rdoc

      perform_initial_top_down_matching [ldoc], [rdoc]

      @matchqueue.push(rdoc)
      perform_initial_matching

      match_bottom_up ldoc
      match_top_down ldoc

      DeltaTreeBuilder.new(ldoc, rdoc, @weights, @forward, @backward).build
    end

    private

    # Parses a complete HTML document.
    def parse(string)
      # The option flags are bit masks and have to be combined with OR.
      # ANDing two disjoint flags yields 0, which disables every option
      # (including the defaults) instead of adding NOBLANKS to them.
      options = Nokogiri::XML::ParseOptions::DEFAULT_HTML | Nokogiri::XML::ParseOptions::NOBLANKS
      Nokogiri::HTML(string, nil, nil, options)
    end

    # Parses an HTML fragment.
    def parse_fragments(string)
      Nokogiri::HTML::DocumentFragment.parse(string)
    end

    # Clears all per-diff state so that one Differ can be reused.
    def reset
      @forward = {}      # left node  => matched right node
      @backward = {}     # right node => matched left node
      @weights = {}      # node => subtree weight (both trees)
      @depths = {}       # node => depth below the diffed root
      @lsignatures = {}  # left node  => subtree signature
      @rsignatures = {}  # right node => subtree signature
      # NOTE(review): presumably this makes the heaviest subtree pop first;
      # confirm against the ordering contract of the pqueue gem.
      @matchqueue = PQueue.new() { |a, b| @weights[a] > @weights[b] }
    end

    # Matches elements that carry the same id attribute. Each right element
    # is used for at most one matching.
    def match_by_ids(ldoc, rdoc)
      rightside = rdoc.css("[id]").to_a
      ldoc.css("[id]").each do |element|
        rindex = rightside.find_index { |e| e[:id] == element[:id] }
        next unless rindex

        record_matching element, rightside.delete_at(rindex)
      end
    end

    # Recursively records weight, signature and depth of every node below
    # (and including) +element+. Signatures go into +sig_hash+.
    #
    # Returns [weight, signature] of +element+.
    def prep_with(sig_hash, element, level=0)
      weights = weight_for(element)
      signatures = [signature_part_for(element)]
      element.children.each do |child|
        weight, signature = prep_with(sig_hash, child, level+1)
        weights += weight
        signatures << signature
      end

      @weights[element] = weights
      sig_hash[element] = hash_for(signatures)
      @depths[element] = level

      [ @weights[element], sig_hash[element] ]
    end

    # Own weight of a single node: 1 for non-text nodes, 1 + ln(length) for
    # text and CDATA nodes.
    def weight_for(element)
      return 1 unless element.text? || element.cdata?

      length = element.text.size
      # Math.log(0) is -Infinity, which would make the weight of every
      # ancestor -Infinity as well; empty text simply counts as 1.
      length.zero? ? 1 : 1 + Math.log(length)
    end

    # The part of a signature contributed by the node itself: the content
    # for text/CDATA nodes, the tag name otherwise.
    def signature_part_for(element)
      if element.text? || element.cdata?
        # Prefixed so that text whose content equals a tag name (e.g. the
        # text "br") does not get the signature of that (empty) element.
        "#text:#{element.text}"
      else
        element.name
      end
    end

    # Hashes a list of signature parts into one signature.
    def hash_for(array)
      Digest::SHA256.digest array.join(";")
    end

    # Records +left+ and +right+ as matched in both directions.
    def record_matching(left, right)
      @forward[left] = right
      @backward[right] = left
    end

    # Matches non-text nodes whose name occurs exactly once in +lnodes+ and
    # exactly once in +rnodes+, then recurses into the children of each
    # matched pair.
    def perform_initial_top_down_matching(lnodes, rnodes)
      _lnodes = lnodes.reject(&:text?)
      _rnodes = rnodes.reject(&:text?)

      _lnodes.each do |lnode|
        lcounts = _lnodes.count { |c| c.name == lnode.name }
        candidates = _rnodes.select { |c| c.name == lnode.name }
        if lcounts == 1 && candidates.size == 1
          record_matching lnode, candidates.first
          perform_initial_top_down_matching lnode.children, candidates.first.children
        end
      end
    end

    # Drains the match queue. An unmatched right node with an unambiguous
    # identical left subtree is matched wholesale; otherwise its children
    # are queued instead.
    def perform_initial_matching
      while @matchqueue.size > 0
        element = @matchqueue.pop
        if @backward[element].nil? && (match = find_best_match(element))
          match_all_children match, element
          match_parents match, element
        else
          element.children.each { |c| @matchqueue.push c }
        end
      end
    end

    # Finds the unmatched left node whose signature equals that of the
    # right node +element+. Several candidates are disambiguated through
    # the parent: exactly one of them must sit under the left node matched
    # to element's parent.
    #
    # Returns the left node, or nil when there is no unambiguous candidate.
    def find_best_match(element)
      target = @rsignatures[element]
      candidates = []
      @lsignatures.each do |left, sig|
        candidates << left if @forward[left].nil? && sig == target
      end

      return candidates.first if candidates.size <= 1

      matching_parents = candidates.select do |left|
        @forward[left.parent] == element.parent
      end
      matching_parents.first if matching_parents.size == 1
    end

    # Matches two subtrees node by node, pairing children by position.
    def match_all_children(left, right)
      record_matching left, right
      left.children.zip(right.children).each do |a, b|
        match_all_children a, b
      end
    end

    # Propagates a matching to the direct parents when neither parent is
    # matched yet and both have the same name.
    def match_parents(left, right)
      # TODO implement multi-ancestor matching
      lparent = left.parent
      rparent = right.parent
      return if lparent.nil? || rparent.nil?
      return if @forward[lparent] || @backward[rparent]

      # Record the parents. Recording left/right again, as was done
      # before, is a no-op because they have just been matched.
      record_matching lparent, rparent if lparent.name == rparent.name
    end

    # Post-order pass: an unmatched left node whose parent is matched gets
    # paired with the first unmatched same-named child of the parent's
    # counterpart.
    def match_bottom_up(element)
      element.children.each do |child|
        match_bottom_up child
      end

      # Never re-match a node that already has a partner: that would leave
      # the old partner pointing back at this node in @backward.
      return unless @forward[element].nil?
      return unless element.respond_to?(:parent)

      rparent = @forward[element.parent]
      return unless rparent

      match = rparent.children.find { |c| @backward[c].nil? && c.name == element.name }
      record_matching(element, match) if match
    end

    # Pre-order pass: an unmatched left node is paired with the single
    # unmatched, same-named right node that is parent to the counterparts
    # of its children.
    def match_top_down(element)
      if @forward[element].nil?
        childmatches = element.children.map { |c| @forward[c] && @forward[c].parent }.compact.uniq
        childmatches.reject! { |e| @backward[e] }
        if childmatches.size == 1 && childmatches.first.name == element.name
          record_matching(element, childmatches.first)
        end
      end

      element.children.each do |child|
        match_top_down child
      end
    end
  end
end
@@ -0,0 +1,122 @@
1
module HTMLDOMDiff
  # One node of the delta tree. It pairs a node of the right (new) tree
  # with the matched node of the left (original) tree. Either side may be
  # nil: no left node means the node was inserted, no right node means it
  # was removed.
  class Node
    attr_reader :parent, :children

    attr_reader :rnode

    # rnode  - node of the right tree, or nil for a removed node
    # lnode  - node of the left tree, or nil for an inserted node
    # parent - enclosing delta tree Node, or nil for the root
    def initialize(rnode, lnode, parent=nil)
      @rnode = rnode
      @lnode = lnode
      @parent = parent
      @children = []
    end

    def add_child(child)
      @children << child
    end

    # This node followed by all of its descendants, depth first.
    def self_and_all_children
      [self] + @children.flat_map(&:self_and_all_children)
    end

    # attributes

    # Attributes of the right node; nil for a removed node.
    def attributes
      @rnode.attributes if @rnode
    end

    # Attributes of the left node; nil for an inserted node.
    def original_attributes
      @lnode.attributes if @lnode
    end

    # Names of the attributes that exist on one side only or whose values
    # differ. A missing side counts as having no attributes, so for an
    # inserted or removed node every attribute name is reported.
    def changed_attribute_names
      current = attributes || {}
      original = original_attributes || {}

      (current.keys | original.keys).select do |key|
        now = current[key]
        before = original[key]
        now.nil? || before.nil? || now.value != before.value
      end
    end

    # Text of the right node; nil for a removed node.
    def text
      @rnode&.text
    end

    # Text of the left node; nil for an inserted node.
    def original_text
      @lnode&.text
    end

    # Name of the right node, falling back to the left node.
    def name
      (@rnode || @lnode).name
    end

    def text?
      (@rnode||@lnode).text?
    end

    # HTML of the right node. A removed node has no right side, so the
    # HTML of the left node is returned instead of raising.
    def to_html
      (@rnode || @lnode).to_html
    end

    # Indented, one-line-per-node dump of the subtree ("name - :state").
    def as_tree_string(level=0)
      lines = ["#{' ' * level}#{name} - :#{state}"]
      lines.concat(children.map { |c| c.as_tree_string(level + 1) })
      lines.join("\n")
    end

    # states

    # True when the content differs between both sides: the text for text
    # nodes, the attributes otherwise. Inserted and removed nodes have only
    # one side and always count as changed.
    def changed?
      return true if inserted? || removed?

      if @rnode.text?
        text != original_text
      else
        attributes_changed?
      end
    end

    # One of :moved, :inserted, :removed or :matched (checked in that order).
    def state
      return :moved if moved?
      return :inserted if inserted?
      return :removed if removed?

      :matched
    end

    def matched?
      ! (inserted? || removed? || moved?)
    end

    def inserted?
      @lnode.nil?
    end

    def removed?
      @rnode.nil?
    end

    # True when both sides exist but the parent delta node does not hold
    # both of them as direct children. The root is never moved.
    def moved?
      return false if inserted? || removed? || @parent.nil?
      !@parent.parent_of? @lnode, @rnode
    end

    protected

    # True when any attribute was added, dropped or changed its value.
    def attributes_changed?
      !changed_attribute_names.empty?
    end

    # True when lchild is a child of this node's left side and rchild is a
    # child of its right side.
    def parent_of?(lchild, rchild)
      return false if @lnode.nil? || @rnode.nil?
      return false unless @lnode.children.include?(lchild)
      @rnode.children.include?(rchild)
    end
  end
end
@@ -0,0 +1,3 @@
1
module HTMLDOMDiff
  # Version string of the library.
  VERSION = '0.1'
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html-dom-diff
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Frederik Fix
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-04-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pqueue
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: 'Given two closely related HTML documents find a useful diff
56
+
57
+ '
58
+ email:
59
+ - ich@derfred.com
60
+ executables: []
61
+ extensions: []
62
+ extra_rdoc_files: []
63
+ files:
64
+ - LICENSE
65
+ - README.md
66
+ - lib/html-dom-diff.rb
67
+ - lib/html-dom-diff/delta_tree_builder.rb
68
+ - lib/html-dom-diff/differ.rb
69
+ - lib/html-dom-diff/node.rb
70
+ - lib/html-dom-diff/version.rb
71
+ homepage: https://github.com/derfred/html-dom-diff
72
+ licenses:
73
+ - MIT
74
+ metadata: {}
75
+ post_install_message:
76
+ rdoc_options: []
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ requirements: []
90
+ rubyforge_project:
91
+ rubygems_version: 2.7.3
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Diff between HTML documents
95
+ test_files: []