html-dom-diff 0.1 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/html-dom-diff/delta_tree_builder.rb +40 -7
- data/lib/html-dom-diff/differ.rb +25 -21
- data/lib/html-dom-diff/node.rb +4 -2
- data/lib/html-dom-diff/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0d731a32d076bd3fc64436c74dff88fa653349a1dc35fd7dfe1598bb7210b001
|
4
|
+
data.tar.gz: 79e7466f13f022fd357dd4997d5f168aa6a68b56e9e4f78f494b645658d1fbb5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ca5d8dcc3952fff8d1b4f03cad348b178f23b1b7c27de085117fce57eba4596fd8e05875f7fc84d9608cbb0c4d4c0ad7665d7d62bfdf1c513d336b0bfa7cdb2d
|
7
|
+
data.tar.gz: 1590f4c2fc8325396d07c2483d2f59f82348c4485132653ca966662a3e62456e9da93ebd687b65858365479b34313d26341dcb4e5edb2de325302e0a910afc3b
|
@@ -1,22 +1,55 @@
|
|
1
1
|
module HTMLDOMDiff
|
2
2
|
class DeltaTreeBuilder
|
3
3
|
attr_reader :ldoc, :rdoc
|
4
|
-
def initialize(ldoc, rdoc
|
4
|
+
def initialize(ldoc, rdoc)
|
5
5
|
@ldoc = ldoc
|
6
6
|
@rdoc = rdoc
|
7
|
-
@weights =
|
8
|
-
@forward =
|
9
|
-
@backward =
|
7
|
+
@weights = {}
|
8
|
+
@forward = {}
|
9
|
+
@backward = {}
|
10
10
|
end
|
11
11
|
|
12
|
-
def
|
12
|
+
def root
|
13
13
|
wrap @rdoc
|
14
14
|
end
|
15
15
|
|
16
|
+
def total_weight
|
17
|
+
@weights[ldoc].to_f + @weights[rdoc].to_f
|
18
|
+
end
|
19
|
+
|
20
|
+
def add_weight(element, weight)
|
21
|
+
@weights[element] = weight
|
22
|
+
end
|
23
|
+
|
24
|
+
def weight(element)
|
25
|
+
@weights[element]
|
26
|
+
end
|
27
|
+
|
28
|
+
def match(left, right)
|
29
|
+
@forward[left] = right
|
30
|
+
@backward[right] = left
|
31
|
+
end
|
32
|
+
|
33
|
+
def left_matches?(lnode, rnode)
|
34
|
+
@forward[lnode] == rnode
|
35
|
+
end
|
36
|
+
|
37
|
+
def left_match(lnode)
|
38
|
+
@forward[lnode]
|
39
|
+
end
|
40
|
+
|
41
|
+
def left_matched?(lnode)
|
42
|
+
@forward.has_key?(lnode)
|
43
|
+
end
|
44
|
+
|
45
|
+
def right_matched?(rnode)
|
46
|
+
@backward.has_key?(rnode)
|
47
|
+
end
|
48
|
+
|
16
49
|
private
|
17
50
|
|
18
51
|
def wrap(rnode, parent=nil)
|
19
|
-
result = Node.new rnode, @backward[rnode], parent
|
52
|
+
result = Node.new rnode, @backward[rnode], @weights[rnode], parent
|
20
53
|
rnode.children.each do |child|
|
21
54
|
wrap child, result
|
22
55
|
end
|
@@ -33,7 +66,7 @@ module HTMLDOMDiff
|
|
33
66
|
|
34
67
|
def reverse_wrap(lnode, parent)
|
35
68
|
return if @forward[lnode]
|
36
|
-
result = Node.new nil, lnode
|
69
|
+
result = Node.new nil, lnode, @weights[lnode]
|
37
70
|
lnode.children.each { |c| reverse_wrap c, result }
|
38
71
|
parent.add_child result
|
39
72
|
end
|
data/lib/html-dom-diff/differ.rb
CHANGED
@@ -13,7 +13,7 @@ module HTMLDOMDiff
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def diff(ldoc, rdoc)
|
16
|
-
reset
|
16
|
+
reset ldoc, rdoc
|
17
17
|
|
18
18
|
match_by_ids ldoc, rdoc
|
19
19
|
prep_with @lsignatures, ldoc
|
@@ -27,11 +27,17 @@ module HTMLDOMDiff
|
|
27
27
|
match_bottom_up ldoc
|
28
28
|
match_top_down ldoc
|
29
29
|
|
30
|
-
|
30
|
+
@builder
|
31
31
|
end
|
32
32
|
|
33
33
|
private
|
34
34
|
|
35
|
+
[:left_matches?, :left_match, :left_matched?, :right_matched?].each do |m|
|
36
|
+
define_method m do |*args|
|
37
|
+
@builder.send m, *args
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
35
41
|
def parse(string)
|
36
42
|
Nokogiri::HTML(string, nil, nil, (Nokogiri::XML::ParseOptions::DEFAULT_HTML & Nokogiri::XML::ParseOptions::NOBLANKS))
|
37
43
|
end
|
@@ -40,14 +46,12 @@ module HTMLDOMDiff
|
|
40
46
|
Nokogiri::HTML::DocumentFragment.parse(string)
|
41
47
|
end
|
42
48
|
|
43
|
-
def reset
|
44
|
-
@
|
45
|
-
@backward = {}
|
46
|
-
@weights = {}
|
49
|
+
def reset(ldoc, rdoc)
|
50
|
+
@builder = DeltaTreeBuilder.new(ldoc, rdoc)
|
47
51
|
@depths = {}
|
48
52
|
@lsignatures = {}
|
49
53
|
@rsignatures = {}
|
50
|
-
@matchqueue = PQueue.new() { |a, b| @
|
54
|
+
@matchqueue = PQueue.new() { |a, b| @builder.weight(a) > @builder.weight(b) }
|
51
55
|
end
|
52
56
|
|
53
57
|
def match_by_ids(ldoc, rdoc)
|
@@ -70,11 +74,11 @@ module HTMLDOMDiff
|
|
70
74
|
signatures << signature
|
71
75
|
end
|
72
76
|
|
73
|
-
@
|
77
|
+
@builder.add_weight(element, weights)
|
74
78
|
sig_hash[element] = hash_for(signatures)
|
75
79
|
@depths[element] = level
|
76
80
|
|
77
|
-
[
|
81
|
+
[ weights, sig_hash[element] ]
|
78
82
|
end
|
79
83
|
|
80
84
|
def weight_for(element)
|
@@ -98,8 +102,7 @@ module HTMLDOMDiff
|
|
98
102
|
end
|
99
103
|
|
100
104
|
def record_matching(left, right)
|
101
|
-
@
|
102
|
-
@backward[right] = left
|
105
|
+
@builder.match(left, right)
|
103
106
|
end
|
104
107
|
|
105
108
|
def perform_initial_top_down_matching(lnodes, rnodes)
|
@@ -119,7 +122,7 @@ module HTMLDOMDiff
|
|
119
122
|
def perform_initial_matching
|
120
123
|
while @matchqueue.size > 0
|
121
124
|
element = @matchqueue.pop
|
122
|
-
if
|
125
|
+
if !right_matched?(element) && (match = find_best_match(element))
|
123
126
|
match_all_children match, element
|
124
127
|
match_parents match, element
|
125
128
|
else
|
@@ -131,7 +134,7 @@ module HTMLDOMDiff
|
|
131
134
|
def find_best_match(element)
|
132
135
|
candidates = []
|
133
136
|
@lsignatures.each do |left, sig|
|
134
|
-
if
|
137
|
+
if !left_matched?(left) && sig == @rsignatures[element]
|
135
138
|
candidates << left
|
136
139
|
end
|
137
140
|
end
|
@@ -142,7 +145,7 @@ module HTMLDOMDiff
|
|
142
145
|
return candidates.first
|
143
146
|
else
|
144
147
|
matching_parents = candidates.select do |left|
|
145
|
-
|
148
|
+
left_matches?(left.parent, element.parent)
|
146
149
|
end
|
147
150
|
|
148
151
|
if matching_parents.size == 1
|
@@ -162,9 +165,9 @@ module HTMLDOMDiff
|
|
162
165
|
|
163
166
|
def match_parents(left, right)
|
164
167
|
# TODO implement multi-ancestor matching
|
165
|
-
return if
|
168
|
+
return if left_matched?(left.parent) || right_matched?(right.parent)
|
166
169
|
if left.parent.name == right.parent.name
|
167
|
-
record_matching left, right
|
170
|
+
record_matching left.parent, right.parent
|
168
171
|
end
|
169
172
|
end
|
170
173
|
|
@@ -173,16 +176,17 @@ module HTMLDOMDiff
|
|
173
176
|
match_bottom_up child
|
174
177
|
end
|
175
178
|
|
176
|
-
if element.respond_to?(:parent) &&
|
177
|
-
|
179
|
+
if !left_matched?(element) && element.respond_to?(:parent) && left_matched?(element.parent)
|
180
|
+
children = left_match(element.parent).children.reject { |c| right_matched?(c) }
|
181
|
+
match = children.find { |c| c.name == element.name }
|
178
182
|
record_matching(element, match) if match
|
179
183
|
end
|
180
184
|
end
|
181
185
|
|
182
186
|
def match_top_down(element)
|
183
|
-
|
184
|
-
childmatches = element.children.
|
185
|
-
childmatches.reject! { |e|
|
187
|
+
unless left_matched?(element)
|
188
|
+
childmatches = element.children.select { |c| left_matched?(c) }.map { |c| left_match(c).parent }.uniq
|
189
|
+
childmatches.reject! { |e| right_matched?(e) }
|
186
190
|
if childmatches.size == 1 && childmatches.first.name == element.name
|
187
191
|
record_matching(element, childmatches.first)
|
188
192
|
end
|
data/lib/html-dom-diff/node.rb
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
module HTMLDOMDiff
|
2
2
|
class Node
|
3
|
-
attr_reader :parent, :children
|
3
|
+
attr_reader :parent, :children, :weight
|
4
4
|
|
5
5
|
attr_reader :rnode
|
6
6
|
|
7
|
-
def initialize(rnode, lnode, parent=nil)
|
7
|
+
def initialize(rnode, lnode, weight, parent=nil)
|
8
8
|
@rnode = rnode
|
9
9
|
@lnode = lnode
|
10
|
+
@weight = weight
|
10
11
|
@parent = parent
|
11
12
|
@children = []
|
12
13
|
end
|
@@ -71,6 +72,7 @@ module HTMLDOMDiff
|
|
71
72
|
|
72
73
|
# states
|
73
74
|
def changed?
|
75
|
+
return false unless @rnode && @lnode
|
74
76
|
if @rnode.text?
|
75
77
|
text != original_text
|
76
78
|
else
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html-dom-diff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Frederik Fix
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-04-
|
11
|
+
date: 2018-04-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|