lorax 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/CHANGELOG.rdoc +6 -0
- data/LICENSE +20 -0
- data/Manifest.txt +37 -0
- data/README.rdoc +70 -0
- data/Rakefile +50 -0
- data/TODO +40 -0
- data/bin/lorax +15 -0
- data/lib/lorax.rb +35 -0
- data/lib/lorax/delta.rb +28 -0
- data/lib/lorax/delta/delete_delta.rb +19 -0
- data/lib/lorax/delta/insert_delta.rb +22 -0
- data/lib/lorax/delta/modify_delta.rb +51 -0
- data/lib/lorax/delta_set.rb +24 -0
- data/lib/lorax/delta_set_generator.rb +36 -0
- data/lib/lorax/fast_matcher.rb +108 -0
- data/lib/lorax/match.rb +22 -0
- data/lib/lorax/match_set.rb +30 -0
- data/lib/lorax/signature.rb +101 -0
- data/spec/fast_matcher_spec.rb +400 -0
- data/spec/files/Michael-Dalessio-200909.html +147 -0
- data/spec/files/Michael-Dalessio-201001.html +153 -0
- data/spec/files/slashdot-1.html +3236 -0
- data/spec/files/slashdot-2.html +3216 -0
- data/spec/files/slashdot-3.html +3228 -0
- data/spec/files/slashdot-4.html +3278 -0
- data/spec/integration/lorax_spec.rb +130 -0
- data/spec/match_spec.rb +54 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +42 -0
- data/spec/unit/delta/delete_delta_spec.rb +50 -0
- data/spec/unit/delta/insert_delta_spec.rb +109 -0
- data/spec/unit/delta/modify_delta_spec.rb +94 -0
- data/spec/unit/delta_set_generator_spec.rb +157 -0
- data/spec/unit/delta_set_spec.rb +40 -0
- data/spec/unit/lorax_spec.rb +9 -0
- data/spec/unit/match_set_spec.rb +93 -0
- data/spec/unit/signature_spec.rb +473 -0
- metadata +216 -0
- metadata.gz.sig +3 -0
@@ -0,0 +1,108 @@
|
|
1
|
+
module Lorax
  # Pairs up the nodes of two documents and records the pairs in a match set.
  #
  # Matching walks document 1 from its root. A node whose signature also
  # occurs (still unmatched) in document 2 is matched "perfectly"; matches are
  # then pushed upwards to same-named ancestors and downwards to same-named
  # children as "forced" (non-perfect) matches.
  #
  # The match set is expected to respond to +match(node)+, +add(match)+,
  # +signature1+ and +signature2+ (see how it is used below).
  class FastMatcher
    attr_accessor :match_set

    # doc1, doc2           - the two documents to compare; each must respond to +root+.
    # dependency_injection - optional Hash. The +:matcher_match_set+ entry, when
    #                        present, is used as the match set instead of
    #                        building a new MatchSet; the whole Hash is passed
    #                        on to MatchSet.new otherwise.
    def initialize(doc1, doc2, dependency_injection={})
      @document1 = doc1
      @document2 = doc2
      @match_set = dependency_injection[:matcher_match_set] || MatchSet.new(doc1, doc2, dependency_injection)
    end

    # Runs the matcher starting at the root of document 1.
    #
    # Returns the match set, or nil when the root was already matched before
    # the call (match_node returns early in that case).
    def match
      match_node @document1.root
    end

    private

    # Tries to match +node1+ (a document 1 node) and, failing that, its
    # descendants. Returns the match set, or nil if +node1+ is already matched.
    def match_node(node1)
      return if match_set.match(node1)

      signature1 = match_set.signature1.signature(node1) # assumes node1 is in signature1

      # Use a non-destructive #reject: the array handed back by
      # signature2.nodes belongs to the signature object, and filtering it in
      # place would delete entries from that object's index.
      same_signature = match_set.signature2.nodes(signature1) || []
      candidates     = same_signature.reject { |node| match_set.match(node) }

      if candidates.empty?
        node1.children.each { |child| match_node(child) }
        # A descendant's ancestral match may have matched node1 in the meantime.
        match = match_set.match(node1)
        propagate_to_children(match.pair.first, match.pair.last) if match
      else
        match_candidate(node1, candidates)
      end

      propagate_to_parent(node1) unless match_set.match(node1)
      match_set
    end

    # Among +candidates+ (document 2 nodes with node1's signature), picks the
    # one whose chain of same-named ancestors is longest and records every
    # match on that chain. +candidates+ must not be empty.
    def match_candidate(node1, candidates)
      trails = candidates.collect do |node2|
        ancestral_match(node1, node2, depth(node2, match_set.signature2))
      end
      trails.max_by { |trail| trail.length }.each do |trail_match|
        match_set.add trail_match
      end
    end

    # Builds the list of matches obtained by pairing node1/node2 (perfect) and
    # then up to +max_depth+ of their ancestors, level by level, for as long
    # as the ancestors share a name and document 1's ancestor is not the
    # Document itself.
    def ancestral_match(node1, node2, max_depth)
      matches = [Match.new(node1, node2, :perfect => true)]
      curr1, curr2 = node1.parent, node2.parent
      max_depth.times do
        break unless curr1.name == curr2.name && !curr1.is_a?(Nokogiri::XML::Document)
        matches << Match.new(curr1, curr2)
        curr1, curr2 = curr1.parent, curr2.parent
      end
      matches
    end

    # Matches +node1+ to the parent of one of its children's partners.
    # Children are tried heaviest first (by signature1 weight); the first
    # matched child whose partner's parent has node1's name wins.
    def propagate_to_parent(node1)
      by_weight = node1.children.sort_by { |child| match_set.signature1.weight(child) }
      by_weight.reverse_each do |child|
        match = match_set.match(child)
        next unless match

        match_parent = match.pair.last.parent
        next unless match_parent.name == node1.name

        match_set.add Match.new(node1, match_parent)
        return
      end
    end

    # Force-matches the still-unmatched children of an already matched pair
    # (+node1+ from document 1, +node2+ from document 2), recursively:
    #
    # * a name carried by exactly one unmatched child on each side pairs
    #   those two children;
    # * otherwise same-named children are paired when they sit at the same
    #   position among all children of their respective parents.
    def propagate_to_children(node1, node2)
      children_set1 = collect_children_by_name(node1.children)
      children_set2 = collect_children_by_name(node2.children)

      children_set1.each do |name, children1|
        # Names are unique keys, so a direct lookup replaces scanning set2.
        children2 = children_set2[name]
        next unless children2

        if children1.length == 1 && children2.length == 1
          match_set.add Match.new(children1.first, children2.first)
          propagate_to_children children1.first, children2.first
        else
          # Index document 2's candidates by their position under node2 once,
          # instead of recomputing it for every child1/child2 combination.
          siblings2   = node2.children
          by_position = {}
          children2.each { |child2| by_position[siblings2.index(child2)] = child2 }

          siblings1 = node1.children
          children1.each do |child1|
            child2 = by_position[siblings1.index(child1)]
            next unless child2

            match_set.add Match.new(child1, child2)
            propagate_to_children child1, child2
          end
        end
      end
    end

    # Number of ancestor levels ancestral_match may climb for +node+:
    # 1 + ln(sig.size) * weight(node) / weight(root), truncated to an Integer.
    def depth(node, sig)
      depth = 1 + Math.log(sig.size) * sig.weight(node) / sig.weight
      # puts "lorax: debug: #{__FILE__}:#{__LINE__}: depth #{depth} = 1 + #{Math.log(sig.size)} * #{sig.weight(node)} / #{sig.weight}"
      depth.to_i
    end

    # Groups the not-yet-matched nodes of +node_set+ by name.
    # Returns a Hash of name => [node, ...] in document order.
    def collect_children_by_name(node_set)
      collection = {}
      node_set.each do |child|
        next if match_set.match(child)
        (collection[child.name] ||= []) << child
      end
      collection
    end
  end
end
|
data/lib/lorax/match.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module Lorax
  # A pairing of two nodes, stored as the two-element Array +pair+
  # ([node1, node2]), plus a flag saying whether the pairing is "perfect".
  class Match
    attr_accessor :pair

    # node1, node2 - the two nodes being paired, kept in this order in +pair+.
    # options      - optional Hash; a truthy +:perfect+ entry marks the match
    #                as perfect. Any other key is ignored.
    def initialize(node1, node2, options={})
      @pair    = [node1, node2]
      @perfect = options[:perfect] ? true : false # normalise to a real boolean
    end

    # true if the match was created with a truthy +:perfect+ option, else false.
    def perfect?
      @perfect
    end

    # Given one member of the pair, returns the other one.
    # Returns nil when +node+ is neither member (compared with +===+).
    def other(node)
      case node
      when pair.first then pair.last
      when pair.last  then pair.first
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Lorax
  # Holds the matches found between two documents, indexed by node, together
  # with the signature object of each document.
  class MatchSet
    attr_accessor :signature1, :signature2

    # doc1, doc2           - the two documents; +root+ is only called on them
    #                        when the corresponding signature is not injected.
    # dependency_injection - optional Hash; +:match_set_signature1+ and
    #                        +:match_set_signature2+ replace the signatures
    #                        that would otherwise be built from the roots.
    def initialize(doc1, doc2, dependency_injection={})
      @document1  = doc1
      @document2  = doc2
      @signature1 = dependency_injection[:match_set_signature1] || Lorax::Signature.new(@document1.root)
      @signature2 = dependency_injection[:match_set_signature2] || Lorax::Signature.new(@document2.root)
      @matches    = {} # node => match, registered under both nodes of a pair
    end

    # Returns the match registered for +node+, or nil if there is none.
    def match(node)
      @matches[node]
    end

    # Returns a sorted Array with one [first_path, last_path, perfect?] triple
    # per distinct match. Has no side effects (a leftover debugging +puts+
    # used to write to stdout on every call).
    def matches
      @matches.values.uniq.collect { |m| [m.pair.first.path, m.pair.last.path, m.perfect?] }.sort
    end

    # Registers +match+ under each node of its pair, replacing whatever match
    # was previously registered for those nodes.
    def add(match)
      match.pair.each { |node| @matches[node] = match }
    end

    # Converts this match set into a delta set via DeltaSetGenerator.
    def to_delta_set
      DeltaSetGenerator.generate_delta_set(self)
    end
  end
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
|
3
|
+
module Lorax
  # Computes and caches, for every node of a tree:
  #
  # * a signature - SHA1 digest covering the node and all of its descendants,
  # * a monogram  - SHA1 digest of the node alone (name and attributes),
  # * a weight    - a number that grows with the amount of content below it.
  #
  # It also keeps a reverse index from signature to the nodes carrying it.
  class Signature
    SEP = "\0" # separator placed between the parts of a digest input

    # node - optional root node; when given, the whole subtree is processed
    #        immediately.
    def initialize(node=nil)
      @signatures = {} # node => signature
      @monograms  = {} # node => monogram (signature not including children)
      @nodes      = {} # signature => [node, ...]
      @weights    = {} # node => weight
      @size       = 0
      @node       = node
      signature(node) if node
    end

    # The node this object was created with (may be nil).
    def root
      @node
    end

    # With a signature: the Array of nodes registered under it, or nil.
    # Without an argument: the root node.
    def nodes(sig=nil)
      sig ? @nodes[sig] : @node
    end

    # Number of nodes whose signature has been computed by #signature.
    def size
      @size
    end

    # Returns the signature of +node+ (the root by default), computing and
    # caching it - along with its monogram, weight and those of all its
    # descendants - on first use.
    #
    # Raises ArgumentError if +node+ is not a Nokogiri::XML::Node, or is a
    # node type other than element, text, CDATA, comment or entity reference.
    def signature(node=@node)
      return @signatures[node] if @signatures.key?(node)
      raise ArgumentError, "signature expects a Node, but received #{node.inspect}" unless node.is_a?(Nokogiri::XML::Node)

      if node.text? || node.cdata? || node.comment?
        mono = digest = hashify(node.content)
      elsif node.type == Nokogiri::XML::Node::ENTITY_REF_NODE
        mono = digest = hashify(node.to_html)
      elsif node.element?
        children_sig = hashify(node.children.collect { |child| signature(child) })
        # Attributes are sorted by name so their order does not affect the digest.
        attr_sig     = hashify(node.attributes.sort.collect { |k, v| [k, v.value] }.flatten)
        mono         = hashify(node.name, attr_sig)
        digest       = hashify(node.name, attr_sig, children_sig)
      else
        raise ArgumentError, "signature expects an element, text, cdata or comment node, but received #{node.class}"
      end

      @size += 1
      weight(node)

      (@nodes[digest] ||= []) << node
      @monograms[node]  = mono
      @signatures[node] = digest
    end

    # Returns the weight of +node+ (the root by default), computing and
    # caching it on first use:
    #
    # * text, CDATA, comment: 1 + ln(content length); empty content counts as
    #   length 1, so the weight is 1 rather than -Infinity,
    # * entity reference:     1,
    # * element:              1 + the sum of its children's weights.
    #
    # Raises ArgumentError under the same conditions as #signature.
    def weight(node=@node)
      return @weights[node] if @weights.key?(node)
      raise ArgumentError, "weight expects a Node, but received #{node.inspect}" unless node.is_a?(Nokogiri::XML::Node)

      calculated_weight =
        if node.text? || node.cdata? || node.comment?
          # Math.log(0) is -Infinity, which would propagate to every ancestor;
          # clamp the length so empty content behaves like a single character.
          length = node.content.length
          1 + Math.log(length < 1 ? 1 : length)
        elsif node.type == Nokogiri::XML::Node::ENTITY_REF_NODE
          1
        elsif node.element?
          node.children.inject(1) { |sum, child| sum + weight(child) }
        else
          raise ArgumentError, "weight expects an element, text, cdata or comment node, but received #{node.class}"
        end

      @weights[node] = calculated_weight
    end

    # Returns the monogram of +node+ (the root by default), computing the
    # node's signature first if that has not happened yet.
    def monogram(node=@node)
      return @monograms[node] if @monograms.key?(node)
      signature(node)
      @monograms[node]
    end

    # Overrides the signature of +node+ and adds the node to the reverse
    # index under +value+. The entry under its previous signature is left
    # in place.
    def set_signature(node, value) # :nodoc: for testing
      (@nodes[value] ||= []) << node
      @signatures[node] = value
    end

    # Overrides the cached weight of +node+.
    def set_weight(node, value) # :nodoc: for testing
      @weights[node] = value
    end

    private

    # SHA1 hex digest of the arguments. A single non-Array argument is
    # digested as is; an Array, or several arguments, are joined with SEP.
    def hashify(*args)
      single  = args.length == 1 && !args.first.is_a?(Array)
      payload = single ? args.first : args.join(SEP)
      Digest::SHA1.hexdigest payload
    end
  end
end
|
@@ -0,0 +1,400 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Specs for Lorax::FastMatcher. Documents are built with the +xml+ helper and
# checked with the assert_*_match_exists helpers, both expected to come from
# spec_helper. Where a spec needs full control over which nodes look
# identical, it injects Signature objects whose signatures/weights have been
# overridden with set_signature / set_weight.
describe Lorax::FastMatcher do
  describe ".new" do
    context "normal usage" do
      it "takes two arguments" do
        proc { Lorax::FastMatcher.new(xml{root}) }.should raise_error(ArgumentError)
        proc { Lorax::FastMatcher.new(xml{root}, xml{root}) }.should_not raise_error(ArgumentError)
      end

      it "builds a MatchSet for the documents" do
        doc1 = xml { root1 }
        doc2 = xml { root2 }
        mock.proxy(Lorax::MatchSet).new(doc1, doc2, anything)
        Lorax::FastMatcher.new(doc1, doc2)
      end
    end

    context "dependency injection" do
      it "takes an optional third argument for dependency injection" do
        proc { Lorax::FastMatcher.new(xml{root}, xml{root}, {:foo => :bar}) }.should_not raise_error(ArgumentError)
      end

      it "will use the value of ':matcher_match_set' for @match_set" do
        matcher = Lorax::FastMatcher.new(xml{root}, xml{root}, {:matcher_match_set => :foo})
        matcher.match_set.should == :foo
      end
    end
  end

  describe "basic node matching" do
    context "simple matches" do
      before do
        @doc1 = xml { root1 {
            a1
            b1
          } }
        @doc2 = xml { root2 {
            a1
            b2
          } }
        @signature1 = Lorax::Signature.new(@doc1.root)
        @signature1.set_signature(@doc1.at_css("root1"), "root1")
        @signature1.set_signature(@doc1.at_css("a1"), "a1")
        @signature1.set_signature(@doc1.at_css("b1"), "b1")
        @signature2 = Lorax::Signature.new(@doc2.root)
        @signature2.set_signature(@doc2.at_css("root2"), "root2")
        @signature2.set_signature(@doc2.at_css("a1"), "a1")
        @signature2.set_signature(@doc2.at_css("b2"), "b2")
      end

      it "matches identical nodes" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1,
          :match_set_signature2 => @signature2).match
        assert_perfect_match_exists match_set, @doc1.at_css("a1"), @doc2.at_css("a1")
      end

      it "does not match different nodes" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1,
          :match_set_signature2 => @signature2).match
        assert_no_match_exists match_set, @doc1.at_css("b1"), @doc2.at_css("b2")
      end
    end

    context "sibling matches" do
      it "matches all identical siblings" do
        doc1 = xml { root {
            a1_1 ; a1_3 ; a1_5
          } }
        doc2 = xml { root {
            a2_1 ; a2_2 ; a2_3 ; a2_4 ; a2_5
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature1.set_signature(doc1.at_css("a1_1"), "a1")
        signature1.set_signature(doc1.at_css("a1_3"), "a3")
        signature1.set_signature(doc1.at_css("a1_5"), "a5")

        signature2 = Lorax::Signature.new(doc2.root)
        signature2.set_signature(doc2.at_css("a2_1"), "a1")
        signature2.set_signature(doc2.at_css("a2_3"), "a3")
        signature2.set_signature(doc2.at_css("a2_5"), "a5")

        match_set = Lorax::FastMatcher.new(doc1, doc2,
          :match_set_signature1 => signature1, :match_set_signature2 => signature2).match
        assert_perfect_match_exists match_set, doc1.at_css("a1_1"), doc2.at_css("a2_1")
        assert_perfect_match_exists match_set, doc1.at_css("a1_3"), doc2.at_css("a2_3")
        assert_perfect_match_exists match_set, doc1.at_css("a1_5"), doc2.at_css("a2_5")
      end
    end

    context "matching children of an unmatched node" do
      it "matches those children" do
        doc1 = xml { root {
            a1 {
              b1 ; b2
            }
          } }
        doc2 = xml { root {
            a2 {
              b1 ; b2
            }
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature1.set_signature(doc1.at_css("a1"), "a1")
        signature1.set_signature(doc1.at_css("b1"), "b1")
        signature1.set_signature(doc1.at_css("b2"), "b2")

        signature2 = Lorax::Signature.new(doc2.root)
        # doc2's node belongs in signature2 (it used to be registered on signature1)
        signature2.set_signature(doc2.at_css("a2"), "a2")
        signature2.set_signature(doc2.at_css("b1"), "b1")
        signature2.set_signature(doc2.at_css("b2"), "b2")

        match_set = Lorax::FastMatcher.new(doc1, doc2,
          :match_set_signature1 => signature1, :match_set_signature2 => signature2).match
        assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
        assert_perfect_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
      end
    end

    context "nested matches" do
      before do
        @doc1 = xml { root1 { a1 { b1 } } }
        @doc2 = xml { root2 { a1 { b1 } } }
        @signature1 = Lorax::Signature.new(@doc1.root)
        @signature1.set_signature(@doc1.at_css("a1"), "a1")
        @signature1.set_signature(@doc1.at_css("b1"), "b1")
        @signature2 = Lorax::Signature.new(@doc2.root)
        @signature2.set_signature(@doc2.at_css("a1"), "a1")
        @signature2.set_signature(@doc2.at_css("b1"), "b2")
      end

      it "matches the root nodes of the largest identical subtree" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1, :match_set_signature2 => @signature2).match
        assert_perfect_match_exists match_set, @doc1.at_css("a1"), @doc2.at_css("a1")
      end

      it "does not match children of identical match nodes" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1, :match_set_signature2 => @signature2).match
        assert_no_match_exists match_set, @doc1.at_css("b1"), @doc2.at_css("b1")
      end
    end
  end

  describe "forced parent matching" do
    before do
      stub.instance_of(Lorax::FastMatcher).propagate_to_parent # we're not testing propagation to parent
    end

    it "forces a match when parent names are the same but attributes are different" do
      doc1 = xml { root { a1(:foo => "bar") { b1 } } }
      doc2 = xml { root { a1(:bazz => "quux") { b1 } } }
      match_set = Lorax::FastMatcher.new(doc1, doc2).match
      assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
      assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
    end

    it "forces a match when parent names and attributes are the same but siblings are different" do
      doc1 = xml { root { a1(:foo => "bar") { b1 ; b2 } } }
      doc2 = xml { root { a1(:foo => "bar") { b1 ; b3 } } }
      match_set = Lorax::FastMatcher.new(doc1, doc2).match
      assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
      assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
    end

    describe "subsequent forced child matching" do
      it "force matches a uniquely-named sibling" do
        doc1 = xml { root { a1 {
              b2 "goodbye"
              b1 "hello"
              b3
              b4
            } } }
        doc2 = xml { root { a1 {
              b2 "good boy"
              b1 "hello"
              b3 "something"
              b4 { c1 }
            } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
        assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
        assert_forced_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
        assert_forced_match_exists match_set, doc1.at_css("b3"), doc2.at_css("b3")
        assert_forced_match_exists match_set, doc1.at_css("b4"), doc2.at_css("b4")
      end

      it "force matches recursively" do
        doc1 = xml { root { a1 ; a2 { b2 "hello" } } }
        doc2 = xml { root { a1 ; a2 { b2 "goodbye" } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
        assert_forced_match_exists match_set, doc1.at_css("a2"), doc2.at_css("a2")
        assert_forced_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
        assert_forced_match_exists match_set, doc1.at_xpath("//b2/text()"), doc2.at_xpath("//b2/text()")
      end

      it "should match uniquely-named unmatched children" do
        doc1 = xml { root {
            a1 "hello"
            a2 "goodbye"
            a3 "natch"
          } }
        doc2 = xml { root {
            a1 "hello"
            a3 "not"
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
        assert_forced_match_exists match_set, doc1.at_css("a3"), doc2.at_css("a3")
      end

      it "should match same-named children in the same position, even if they are not uniquely named" do
        doc1 = xml { root {
            a1 {
              text "hello"
              b1 "foo"
              text "goodbye"
            }
          } }
        doc2 = xml { root {
            a1 {
              text "bonjour"
              b1 "foo"
              text "au revoir"
            }
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_forced_match_exists match_set, doc1.at_xpath("/root/a1/text()[1]"), doc2.at_xpath("/root/a1/text()[1]")
        assert_forced_match_exists match_set, doc1.at_xpath("/root/a1/text()[2]"), doc2.at_xpath("/root/a1/text()[2]")
      end

      it "large subtree matches force more parent matches than smaller subtree matches" do
        small_doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f2
                    } } } } } } }
        small_doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f3
                    } } } } } } }
        large_doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f2
                    } } } } } } }
        large_doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f3
                    } } } } } } }

        # Same shape, different weight for f1: a heavier f1 should drag more
        # ancestors into the match.
        small_signature1 = Lorax::Signature.new(small_doc1.root)
        small_signature1.set_weight(small_doc1.at_css("f1"), 1)
        small_signature2 = Lorax::Signature.new(small_doc2.root)
        small_signature2.set_weight(small_doc2.at_css("f1"), 1)
        large_signature1 = Lorax::Signature.new(large_doc1.root)
        large_signature1.set_weight(large_doc1.at_css("f1"), 10)
        large_signature2 = Lorax::Signature.new(large_doc2.root)
        large_signature2.set_weight(large_doc2.at_css("f1"), 10)

        small_match_set = Lorax::FastMatcher.new(small_doc1, small_doc2,
          :match_set_signature1 => small_signature1, :match_set_signature2 => small_signature2).match
        large_match_set = Lorax::FastMatcher.new(large_doc1, large_doc2,
          :match_set_signature1 => large_signature1, :match_set_signature2 => large_signature2).match

        assert_forced_match_exists small_match_set, small_doc1.at_css("e1"), small_doc2.at_css("e1")
        assert_no_match_exists small_match_set, small_doc1.at_css("d1"), small_doc2.at_css("d1")

        assert_forced_match_exists large_match_set, large_doc1.at_css("e1"), large_doc2.at_css("e1")
        assert_forced_match_exists large_match_set, large_doc1.at_css("d1"), large_doc2.at_css("d1")
        assert_forced_match_exists large_match_set, large_doc1.at_css("c1"), large_doc2.at_css("c1")
        assert_no_match_exists large_match_set, large_doc1.at_css("b1"), large_doc2.at_css("b1")
      end
    end
  end

  describe "propagating matches to unmatched parents based on children's matches' parents" do
    context "when there is only one child" do
      it "should match parents all the way up the tree" do
        doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1 "hello"
                      f2
                    } } } } } } }
        doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1 "hello"
                      f3
                    } } } } } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("f1"), doc2.at_css("f1")
        %w[e1 d1 c1 b1 a1 root].each do |node_name|
          assert_forced_match_exists match_set, doc1.at_css(node_name), doc2.at_css(node_name)
        end
      end
    end

    context "there are many possible children" do
      it "should match via children with largest weight" do
        doc1 = xml { root {
            a1 { b1 ; b2 }
          } }
        doc2 = xml { root {
            a1 { b1 ; b3 }
            a1 { b2 ; b4 }
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature2 = Lorax::Signature.new(doc2.root)
        signature1.set_weight(doc1.at_css("b1"), 10)
        signature1.set_weight(doc1.at_css("b2"), 100)
        signature2.set_weight(doc2.at_css("b1"), 10)
        signature2.set_weight(doc2.at_css("b2"), 100)

        # Pre-seed the child matches so only parent propagation is exercised.
        match_set = Lorax::MatchSet.new(doc1, doc2, :match_set_signature1 => signature1, :match_set_signature2 => signature2)
        match_set.add Lorax::Match.new(doc1.at_css("b1"), doc2.at_css("b1"))
        match_set.add Lorax::Match.new(doc1.at_css("b2"), doc2.at_css("b2"))

        match_set = Lorax::FastMatcher.new(doc1, doc2, :matcher_match_set => match_set).match
        assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_xpath("//a1[2]")
      end
    end
  end

  describe "choosing the best among multiple possible matches" do
    context "no match's parent is same-named" do
      it "we don't care which node we match, just pick one" do
        doc1 = xml { root {
            a1 { b1 }
          } }
        doc2 = xml { root {
            a2 { b1 }
            a3 { b1 }
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature2 = Lorax::Signature.new(doc2.root)
        signature1.set_signature(doc1.at_xpath("//b1"), "b1")
        signature2.set_signature(doc2.at_xpath("//a2/b1"), "b1")
        signature2.set_signature(doc2.at_xpath("//a3/b1"), "b1")
        match_set = Lorax::FastMatcher.new(doc1, doc2,
          :match_set_signature1 => signature1, :match_set_signature2 => signature2).match
        match_set.match(doc1.at_css("b1")).other(doc1.at_css("b1")).name.should == "b1"
      end
    end

    context "one match's parent is same-named" do
      it "matches the node with the same-named parent" do
        doc1 = xml { root {
            a2 { b1 ; b2 }
          } }
        doc2 = xml { root {
            a1 { b1 }
            a2 { b1 }
            a3 { b1 }
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_forced_match_exists match_set, doc1.at_css("a2"), doc2.at_css("a2")
      end
    end

    context "multiple identical nodes exist in both documents" do
      it "should create one-to-one match relationships" do
        doc1 = xml { root1 {
            a1 ; a1 ; a1
          } }
        doc2 = xml { root2 {
            a1 ; a1
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        [doc1, doc2].each do |doc|
          # Collect each node's partner in the *other* document; pair.last
          # would be the node itself when iterating doc2, proving nothing.
          others = doc.css("a1").collect do |node|
            m = match_set.match(node)
            m ? m.other(node) : nil
          end
          others.uniq.length.should == others.length
        end
      end
    end

    context "multiple matches' parents are same-named" do
      it "matches the node with the same-named grandparent" do
        doc1 = xml { root {
            wrap2 {
              a1 { b1 { 10.times { c1 "hello there" } } ; b2 }
            } } }
        doc2 = xml { root {
            wrap1 {
              a1 { b1 { 10.times { c1 "hello there" } } }
            }
            wrap2 {
              a1 { b1 { 10.times { c1 "hello there" } } }
            }
            wrap3 {
              a1 { b1 { 10.times { c1 "hello there" } } }
            } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_forced_match_exists match_set, doc1.at_css("wrap2"), doc2.at_css("wrap2")
      end
    end
  end
end
|