html-dom-diff 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: c64fe7c87ae9884bf398b8c8699ee352deaa766ef92c078a1a8789badf0ad8ac
4
+ data.tar.gz: cebe07112a9f6ec7b2e573d59e95037bbec4af6fc8937afac112124730fa9bbe
5
+ SHA512:
6
+ metadata.gz: 76f6659021fbdf482dcf640c9211fbd814a6c21cb92aba508ab24a20e6be79af5b9fd1dbe349a46dec00a198e8aeacb8ed74a5ec1e8362df1e8a16c4dc027c8a
7
+ data.tar.gz: 97222b9c82eff526515ce3e2040478e0890202c3fb91d45678461ae58ebd331825109276b8dff33eec971032a00a5d8e6b21ea3ca7606becbff151890d2879b5
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2018 Frederik Fix
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,8 @@
1
+ # html-dom-diff
2
+
3
+ A library for providing useful diffs between related HTML documents. Algorithm based on:
4
+
5
+ > G. Cobena, S. Abiteboul, and A. Marian, “Detecting changes in XML
6
+ > documents,” in International Conference on Data Engineering, 2002,
7
+ > pp. 41–52.
8
+ > http://gregory.cobena.free.fr/www/Publications/%5BICDE2002%5D%20XyDiff%20-%20published%20version.pdf
@@ -0,0 +1,8 @@
1
+ require 'html-dom-diff/version'
2
+ require 'html-dom-diff/delta_tree_builder'
3
+ require 'html-dom-diff/differ'
4
+ require 'html-dom-diff/node'
5
+
6
# Top-level namespace of the library. It is intentionally empty in this file;
# presumably the files required above reopen it to add their definitions.
+ module HTMLDOMDiff
7
+
8
+ end
@@ -0,0 +1,41 @@
1
module HTMLDOMDiff
  # Turns a pair of matched DOM trees into a single "delta tree" of Node
  # wrappers. The tree follows the shape of the right-hand document; left-hand
  # nodes that have no match are appended (as removed nodes) under the wrapper
  # of the right-hand node their parent was matched to.
  class DeltaTreeBuilder
    attr_reader :ldoc, :rdoc

    # ldoc     - root of the left (original) tree
    # rdoc     - root of the right (new) tree
    # weights  - node => weight table (stored only; not read while building)
    # forward  - Hash mapping a left node to its matched right node
    # backward - Hash mapping a right node to its matched left node
    def initialize(ldoc, rdoc, weights, forward, backward)
      @ldoc = ldoc
      @rdoc = rdoc
      @weights = weights
      @forward = forward
      @backward = backward
    end

    # Builds the delta tree and returns its root Node (the wrapper of rdoc).
    def build
      wrap @rdoc
    end

    private

    # Wraps +rnode+ (and, recursively, its children) in a Node, attaches it to
    # +parent+ when one is given, and returns the new Node.
    def wrap(rnode, parent = nil)
      lnode = @backward[rnode]
      result = Node.new rnode, lnode, parent

      # Each recursive call attaches the child wrapper to +result+ itself.
      rnode.children.each { |child| wrap child, result }
      parent.add_child result if parent

      # Children of the matched left node that have no right-hand counterpart
      # are appended after the right-hand children.
      lnode.children.each { |child| reverse_wrap child, result } if lnode

      result
    end

    # Wraps an unmatched left node (and its unmatched descendants) as a
    # right-less Node under +parent+. Matched left nodes are skipped because
    # they are already represented by the wrapper of their right-hand match.
    def reverse_wrap(lnode, parent)
      return if @forward[lnode]

      # Pass the parent along so removed nodes know where they hang in the
      # delta tree, exactly like the nodes created by #wrap.
      result = Node.new nil, lnode, parent
      lnode.children.each { |c| reverse_wrap c, result }
      parent.add_child result
    end
  end
end
@@ -0,0 +1,196 @@
1
+ require 'nokogiri'
2
+ require 'digest'
3
+ require 'pqueue'
4
+
5
module HTMLDOMDiff
  # Computes a matching between the nodes of two HTML trees and hands it to
  # DeltaTreeBuilder to produce the resulting delta tree.
  #
  # State kept between the matching phases (all rebuilt by #reset):
  #   @forward      left node  => matched right node
  #   @backward     right node => matched left node
  #   @weights      node => weight of the subtree rooted at that node
  #   @depths       node => depth below the tree root passed to #diff
  #   @lsignatures  left node  => digest of its subtree
  #   @rsignatures  right node => digest of its subtree
  #   @matchqueue   queue of right nodes still to be examined
  class Differ
    # Parses two complete HTML documents and diffs their root elements.
    def diff_strings(left, right)
      diff parse(left).root, parse(right).root
    end

    # Parses two HTML fragments and diffs their first nodes.
    def diff_fragments(left, right)
      diff parse_fragments(left).child, parse_fragments(right).child
    end

    # Diffs two already parsed trees and returns the root of the delta tree
    # built by DeltaTreeBuilder.
    def diff(ldoc, rdoc)
      reset

      match_by_ids ldoc, rdoc
      prep_with @lsignatures, ldoc
      prep_with @rsignatures, rdoc

      perform_initial_top_down_matching [ldoc], [rdoc]

      @matchqueue.push(rdoc)
      perform_initial_matching

      match_bottom_up ldoc
      match_top_down ldoc

      DeltaTreeBuilder.new(ldoc, rdoc, @weights, @forward, @backward).build
    end

    private

    def parse(string)
      # Option flags are bit masks and must be combined with `|`. The previous
      # `&` evaluated to 0, which silently dropped both the default HTML
      # options and NOBLANKS.
      options = Nokogiri::XML::ParseOptions::DEFAULT_HTML | Nokogiri::XML::ParseOptions::NOBLANKS
      Nokogiri::HTML(string, nil, nil, options)
    end

    def parse_fragments(string)
      Nokogiri::HTML::DocumentFragment.parse(string)
    end

    # Clears all matching state so one Differ can be reused for several diffs.
    def reset
      @forward = {}
      @backward = {}
      @weights = {}
      @depths = {}
      @lsignatures = {}
      @rsignatures = {}
      # NOTE(review): presumably this makes the heaviest subtree pop first;
      # confirm against the pqueue gem's comparison-block contract.
      @matchqueue = PQueue.new() { |a, b| @weights[a] > @weights[b] }
    end

    # Pairs up elements that carry the same id attribute on both sides. Each
    # right-hand element is used for at most one pairing.
    def match_by_ids(ldoc, rdoc)
      rightside = rdoc.css("[id]").to_a
      ldoc.css("[id]").each do |element|
        rindex = rightside.find_index { |e| e[:id] == element[:id] }
        next unless rindex

        record_matching element, rightside[rindex]
        rightside.delete_at(rindex)
      end
    end

    # Recursively fills @weights, @depths and +sig_hash+ for the subtree rooted
    # at +element+. Returns [subtree_weight, subtree_signature].
    def prep_with(sig_hash, element, level=0)
      total = weight_for(element)
      parts = [signature_part_for(element)]
      element.children.each do |child|
        child_weight, child_signature = prep_with(sig_hash, child, level + 1)
        total += child_weight
        parts << child_signature
      end

      @weights[element] = total
      sig_hash[element] = hash_for(parts)
      @depths[element] = level

      [total, sig_hash[element]]
    end

    # Weight of a single node: 1 for elements, 1 + ln(text length) for text and
    # CDATA nodes.
    def weight_for(element)
      return 1 unless element.text? || element.cdata?

      length = element.text.size
      # Math.log(0) is -Infinity, which would poison the weight of every
      # ancestor; an empty text node simply counts as a plain node.
      length.zero? ? 1 : 1 + Math.log(length)
    end

    # The node's own contribution to its signature: the text for text/CDATA
    # nodes, the node name otherwise.
    def signature_part_for(element)
      if element.text? || element.cdata?
        element.text
      else
        element.name
      end
    end

    def hash_for(array)
      Digest::SHA256.digest array.join(";")
    end

    # Records left <-> right as a match. If either side was matched to a
    # different node before, that old pairing is removed from the opposite
    # table so @forward and @backward always stay inverse to each other.
    def record_matching(left, right)
      previous_right = @forward[left]
      @backward.delete(previous_right) if previous_right && !previous_right.equal?(right)

      previous_left = @backward[right]
      @forward.delete(previous_left) if previous_left && !previous_left.equal?(left)

      @forward[left] = right
      @backward[right] = left
    end

    # Walks both trees in parallel and matches non-text nodes whose name is
    # unique among their siblings on both sides, then recurses into each
    # matched pair.
    def perform_initial_top_down_matching(lnodes, rnodes)
      lefts_by_name = lnodes.reject(&:text?).group_by(&:name)
      rights_by_name = rnodes.reject(&:text?).group_by(&:name)

      lefts_by_name.each do |name, lefts|
        rights = rights_by_name.fetch(name, [])
        next unless lefts.size == 1 && rights.size == 1

        record_matching lefts.first, rights.first
        perform_initial_top_down_matching lefts.first.children, rights.first.children
      end
    end

    # Drains @matchqueue: a right node that is still unmatched and has an
    # identical left subtree is matched as a whole; otherwise its children are
    # queued for examination.
    def perform_initial_matching
      while @matchqueue.size > 0
        element = @matchqueue.pop
        if @backward[element].nil? && (match = find_best_match(element))
          match_all_children match, element
          match_parents match, element
        else
          element.children.each { |c| @matchqueue.push c }
        end
      end
    end

    # Returns the unmatched left node whose subtree signature equals that of
    # the right node +element+. With several candidates, the one whose parent
    # is matched to element's parent wins; nil if that is not unique either.
    def find_best_match(element)
      signature = @rsignatures[element]
      candidates = @lsignatures.each_with_object([]) do |(left, sig), found|
        found << left if @forward[left].nil? && sig == signature
      end

      # Zero candidates yield nil, exactly one yields that candidate.
      return candidates.first if candidates.size <= 1

      same_parent = candidates.select { |left| @forward[left.parent] == element.parent }
      same_parent.first if same_parent.size == 1
    end

    # Matches two subtrees node by node, pairing children by position.
    def match_all_children(left, right)
      record_matching left, right
      left.children.zip(right.children).each do |a, b|
        # zip pads with nil when the right side has fewer children.
        next if a.nil? || b.nil?

        match_all_children a, b
      end
    end

    # Propagates a match to the direct parents when both are still unmatched
    # and share the same name.
    def match_parents(left, right)
      # TODO implement multi-ancestor matching
      lparent = left.parent
      rparent = right.parent
      return if lparent.nil? || rparent.nil?
      return if @forward[lparent] || @backward[rparent]

      # The parents are the nodes to pair up here; left/right themselves were
      # already recorded by #match_all_children.
      record_matching lparent, rparent if lparent.name == rparent.name
    end

    # Post-order pass over the left tree: a still unmatched node whose parent
    # is matched gets paired with the first unmatched, same-named child of the
    # parent's match.
    def match_bottom_up(element)
      element.children.each { |child| match_bottom_up child }

      # Re-matching an already matched node would leave its previous partner
      # pointing at it, so matched nodes are left alone.
      return unless @forward[element].nil?
      return unless element.respond_to?(:parent)

      rparent = @forward[element.parent]
      return unless rparent

      match = rparent.children.find { |c| @backward[c].nil? && c.name == element.name }
      record_matching(element, match) if match
    end

    # Pre-order pass over the left tree: an unmatched node is paired with the
    # single unmatched right node that is the parent of its children's matches,
    # provided the names agree.
    def match_top_down(element)
      if @forward[element].nil?
        parents = element.children.map { |c| @forward[c]&.parent }.compact.uniq
        parents.reject! { |candidate| @backward[candidate] }
        if parents.size == 1 && parents.first.name == element.name
          record_matching(element, parents.first)
        end
      end

      element.children.each { |child| match_top_down child }
    end
  end
end
@@ -0,0 +1,122 @@
1
module HTMLDOMDiff
  # One node of the delta tree. It pairs a node of the right-hand (new)
  # document with its match in the left-hand (original) document. Either side
  # may be nil: no left node means the node was inserted, no right node means
  # it was removed.
  class Node
    attr_reader :parent, :children

    attr_reader :rnode, :lnode

    # rnode  - node of the right-hand document, or nil for a removed node
    # lnode  - node of the left-hand document, or nil for an inserted node
    # parent - enclosing delta-tree Node, nil for the root
    def initialize(rnode, lnode, parent=nil)
      @rnode = rnode
      @lnode = lnode
      @parent = parent
      @children = []
    end

    def add_child(child)
      @children << child
    end

    # This node followed by all of its descendants, depth first.
    def self_and_all_children
      [self] + @children.flat_map(&:self_and_all_children)
    end

    # attributes

    # Attributes of the right-hand node; nil when there is no right-hand node.
    def attributes
      @rnode.attributes if @rnode
    end

    # Attributes of the left-hand node; nil when there is no left-hand node.
    def original_attributes
      @lnode.attributes if @lnode
    end

    # Names of all attributes that were added, removed or whose value differs
    # between the two sides. A missing side counts as having no attributes.
    def changed_attribute_names
      current = attributes || {}
      original = original_attributes || {}

      (current.keys | original.keys).select do |key|
        current[key].nil? || original[key].nil? || current[key].value != original[key].value
      end
    end

    # Text of the right-hand node; nil for a removed node.
    def text
      @rnode&.text
    end

    # Text of the left-hand node; nil for an inserted node.
    def original_text
      @lnode&.text
    end

    # Node name, taken from the right-hand node when there is one.
    def name
      (@rnode || @lnode).name
    end

    def text?
      (@rnode || @lnode).text?
    end

    # HTML of the right-hand node; a removed node renders its left-hand node.
    def to_html
      (@rnode || @lnode).to_html
    end

    # Indented, one-line-per-node dump of the subtree, for debugging.
    def as_tree_string(level=0)
      lines = [(" " * level) + "#{name} - :#{state}"]
      lines.concat(children.map { |c| c.as_tree_string(level + 1) })
      lines.join("\n")
    end

    # states

    # True when the content differs between the two sides: the text for text
    # nodes, the attributes for everything else.
    def changed?
      if text?
        text != original_text
      else
        attributes_changed?
      end
    end

    # One of :moved, :inserted, :removed or :matched.
    def state
      return :moved if moved?
      return :inserted if inserted?
      return :removed if removed?

      :matched
    end

    def matched?
      !(inserted? || removed? || moved?)
    end

    def inserted?
      @lnode.nil?
    end

    def removed?
      @rnode.nil?
    end

    # True when both sides exist but the enclosing delta-tree node is not the
    # parent of both of them.
    def moved?
      return false if inserted? || removed? || @parent.nil?

      !@parent.parent_of?(@lnode, @rnode)
    end

    protected

    def attributes_changed?
      current = attributes || {}
      original = original_attributes || {}
      return true if current.size != original.size

      current.any? do |key, attr|
        original[key].nil? || attr.value != original[key].value
      end
    end

    # True when +lchild+ is a child of this node's left side and +rchild+ is a
    # child of its right side.
    def parent_of?(lchild, rchild)
      return false if @lnode.nil? || @rnode.nil?
      return false unless @lnode.children.include?(lchild)

      @rnode.children.include?(rchild)
    end
  end
end
@@ -0,0 +1,3 @@
1
module HTMLDOMDiff
  # Version of this library; frozen so the shared constant cannot be mutated
  # in place by callers.
  VERSION = "0.1".freeze
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html-dom-diff
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Frederik Fix
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-04-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pqueue
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: 'Given two closely related HTML documents find a useful diff
56
+
57
+ '
58
+ email:
59
+ - ich@derfred.com
60
+ executables: []
61
+ extensions: []
62
+ extra_rdoc_files: []
63
+ files:
64
+ - LICENSE
65
+ - README.md
66
+ - lib/html-dom-diff.rb
67
+ - lib/html-dom-diff/delta_tree_builder.rb
68
+ - lib/html-dom-diff/differ.rb
69
+ - lib/html-dom-diff/node.rb
70
+ - lib/html-dom-diff/version.rb
71
+ homepage: https://github.com/derfred/html-dom-diff
72
+ licenses:
73
+ - MIT
74
+ metadata: {}
75
+ post_install_message:
76
+ rdoc_options: []
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ requirements: []
90
+ rubyforge_project:
91
+ rubygems_version: 2.7.3
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Diff between HTML documents
95
+ test_files: []