lorax 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
module Lorax
  # Pairs up nodes of two documents, recording the pairs in a MatchSet.
  #
  # Nodes whose signatures are identical are recorded as "perfect" matches;
  # matches are then propagated upwards (to same-named ancestors) and
  # downwards (to same-named or same-positioned children) as non-perfect
  # matches.
  class FastMatcher
    attr_accessor :match_set

    # doc1, doc2           - the two documents to match (must respond to #root).
    # dependency_injection - optional hash; :matcher_match_set replaces the
    #                        MatchSet that would otherwise be built, and the
    #                        whole hash is forwarded to MatchSet.new.
    def initialize(doc1, doc2, dependency_injection={})
      @document1 = doc1
      @document2 = doc2
      @match_set = dependency_injection[:matcher_match_set] || MatchSet.new(doc1, doc2, dependency_injection)
    end

    # Matches the documents starting at the root of the first document.
    # Returns the match set (or nil when the root is already matched).
    def match
      match_node @document1.root
    end

    private

    # Tries to match +node1+ (a node of document 1) and, failing that, its
    # descendants. Returns the match set, or nil if +node1+ was already matched.
    def match_node(node1)
      return if match_set.match(node1)

      signature1 = match_set.signature1.signature(node1) # assumes node1 is in signature1
      # Signature#nodes hands back its internal array, so filter with a
      # non-destructive #reject instead of editing the signature's index.
      candidates = (match_set.signature2.nodes(signature1) || []).reject { |node| match_set.match(node) }

      if candidates.empty?
        node1.children.each { |child| match_node(child) }
        # Matching a descendant may have matched node1 through its ancestors.
        match = match_set.match(node1)
        propagate_to_children(match.pair.first, match.pair.last) if match
      else
        match_candidate(node1, candidates)
      end
      propagate_to_parent(node1) unless match_set.match(node1)
      match_set
    end

    # Records the candidate whose chain of same-named ancestors is longest
    # (the first such candidate wins a tie), together with that chain.
    def match_candidate(node1, candidates)
      trails = candidates.collect do |node2|
        ancestral_match(node1, node2, depth(node2, match_set.signature2))
      end
      trails.max_by(&:length).each { |trail_match| match_set.add trail_match }
    end

    # Builds the list of matches for +node1+/+node2+ (perfect) followed by up
    # to +max_depth+ pairs of same-named ancestors (non-perfect).
    def ancestral_match(node1, node2, max_depth)
      matches = [Match.new(node1, node2, :perfect => true)]
      curr1, curr2 = node1.parent, node2.parent
      1.upto(max_depth) do
        # Stop at the top of either tree: never pair a Document (or a missing
        # parent) with anything, whichever side reaches it first.
        break if [curr1, curr2].any? { |node| node.nil? || node.is_a?(Nokogiri::XML::Document) }
        break unless curr1.name == curr2.name
        matches << Match.new(curr1, curr2)
        curr1, curr2 = curr1.parent, curr2.parent
      end
      matches
    end

    # Matches +node1+ with the parent of one of its children's matches,
    # preferring the heaviest child, provided that parent has the same name.
    def propagate_to_parent(node1)
      node1.children.sort_by { |child| match_set.signature1.weight(child) }.reverse.each do |child|
        next unless match = match_set.match(child)
        match_parent = match.pair.last.parent
        if match_parent.name == node1.name
          match_set.add Match.new(node1, match_parent)
          return
        end
      end
    end

    # Force-matches the still-unmatched children of the matched pair
    # +node1+/+node2+: same-named children are paired when each side has
    # exactly one of that name, otherwise when they sit at the same position
    # among their siblings. Recurses into every pair it creates.
    def propagate_to_children(node1, node2)
      children_set1 = collect_children_by_name(node1.children)
      children_set2 = collect_children_by_name(node2.children)
      siblings1 = node1.children.to_a
      siblings2 = node2.children.to_a

      children_set1.each do |name, children1|
        children2 = children_set2[name]
        next unless children2

        if children1.length == 1 && children2.length == 1
          match_set.add Match.new(children1.first, children2.first)
          propagate_to_children children1.first, children2.first
        else
          children1.each do |child1|
            # The only possible partner is the node at child1's own position.
            child2 = siblings2[siblings1.index(child1)]
            next unless child2 && children2.include?(child2)
            match_set.add Match.new(child1, child2)
            propagate_to_children child1, child2
          end
        end
      end
    end

    # Number of ancestor levels to try for +node+: grows with the node's
    # share of the total weight and with the log of the signature's size.
    def depth(node, sig)
      depth = 1 + Math.log(sig.size) * sig.weight(node) / sig.weight
      depth.to_i
    end

    # Groups the not-yet-matched nodes of +node_set+ by node name.
    def collect_children_by_name(node_set)
      collection = {}
      node_set.each do |child|
        next if match_set.match(child)
        (collection[child.name] ||= []) << child
      end
      collection
    end
  end
end
@@ -0,0 +1,22 @@
module Lorax
  # A pairing of two nodes, optionally flagged as a "perfect" match.
  class Match
    attr_accessor :pair

    # node1, node2 - the two paired nodes, stored in that order in #pair.
    # options      - hash; a truthy :perfect marks the match as perfect.
    def initialize(node1, node2, options={})
      @perfect = !!options[:perfect]
      @pair = [node1, node2]
    end

    # True only when the match was created with a truthy :perfect option.
    def perfect?
      @perfect
    end

    # Returns the partner of +node+ within the pair, or nil when +node+ is
    # neither member of the pair.
    def other(node)
      first, last = pair.first, pair.last
      case node
      when first then last
      when last  then first
      end
    end
  end
end
@@ -0,0 +1,30 @@
module Lorax
  # Holds the matches found between two documents, indexed by node, along
  # with the Signature of each document.
  class MatchSet
    attr_accessor :signature1, :signature2

    # doc1, doc2           - the two documents (must respond to #root unless
    #                        both signatures are injected).
    # dependency_injection - optional hash; :match_set_signature1 and
    #                        :match_set_signature2 replace the Signature that
    #                        would otherwise be computed from each root.
    def initialize(doc1, doc2, dependency_injection={})
      @document1 = doc1
      @document2 = doc2
      @signature1 = dependency_injection[:match_set_signature1] || Lorax::Signature.new(@document1.root)
      @signature2 = dependency_injection[:match_set_signature2] || Lorax::Signature.new(@document2.root)
      @matches = {}
    end

    # Returns the match recorded for +node+, or nil.
    def match(node)
      @matches[node]
    end

    # Debugging helper: a sorted list of [path1, path2, perfect?] triples,
    # one per distinct match.
    def matches
      # The notice goes to stderr (Kernel#warn) so that calling this helper
      # never pollutes the stdout of the host program.
      warn "MIKE: #{__FILE__}:#{__LINE__} REMOVE ME THIS IS FOR DEBUGGING ONLY"
      @matches.values.uniq.collect { |m| [m.pair.first.path, m.pair.last.path, m.perfect?] }.sort
    end

    # Records +match+ under both of its nodes, replacing any earlier match
    # recorded for either node.
    def add(match)
      match.pair.each { |node| @matches[node] = match }
    end

    # Delegates to DeltaSetGenerator.generate_delta_set with this match set.
    def to_delta_set
      DeltaSetGenerator.generate_delta_set(self)
    end
  end
end
@@ -0,0 +1,101 @@
require 'digest/sha1'

module Lorax
  # Computes and caches, for a node and all of its descendants, a SHA1
  # signature (covering the whole subtree), a monogram (the node without its
  # children) and a weight.
  class Signature
    SEP = "\0"

    # node - optional root node; when given, the signature of its whole
    #        subtree is computed immediately.
    def initialize(node=nil)
      @signatures = {} # node => signature
      @monograms  = {} # node => monogram (signature not including children)
      @nodes      = {} # signature => [node, ...]
      @weights    = {} # node => weight
      @size       = 0
      @node       = node
      signature(node) if node
    end

    # The node this Signature was built from (nil if none was given).
    def root
      @node
    end

    # With a signature string, returns the array of nodes having that
    # signature (nil if there are none); without, returns the root node.
    # The array is the internal index itself, not a copy.
    def nodes(sig=nil)
      sig ? @nodes[sig] : @node
    end

    # Number of nodes whose signature has been computed.
    def size
      @size
    end

    # Returns the signature of +node+, computing and caching it (together
    # with the monogram, weight and signature index entry) on first use.
    # Raises ArgumentError for anything that is not a text, cdata, comment,
    # entity reference or element Nokogiri node.
    def signature(node=@node)
      return @signatures[node] if @signatures.key?(node)
      raise ArgumentError, "signature expects a Node, but received #{node.inspect}" unless node.is_a?(Nokogiri::XML::Node)

      if node.text? || node.cdata? || node.comment?
        node_monogram = node_signature = hashify(node.content)
      elsif node.type == Nokogiri::XML::Node::ENTITY_REF_NODE
        node_monogram = node_signature = hashify(node.to_html)
      elsif node.element?
        children_sig   = hashify(node.children.collect { |child| signature(child) })
        attr_sig       = hashify(node.attributes.sort.collect { |k, v| [k, v.value] }.flatten)
        node_monogram  = hashify(node.name, attr_sig)
        node_signature = hashify(node.name, attr_sig, children_sig)
      else
        raise ArgumentError, "signature expects an element, text, cdata or comment node, but received #{node.class}"
      end

      @size += 1
      weight(node)

      (@nodes[node_signature] ||= []) << node
      @monograms[node]  = node_monogram
      @signatures[node] = node_signature
    end

    # Returns the weight of +node+, computing and caching it on first use:
    # 1 + ln(content length) for text, cdata and comment nodes, 1 for entity
    # references, and 1 + the sum of the children's weights for elements.
    # Raises ArgumentError for any other kind of object.
    def weight(node=@node)
      return @weights[node] if @weights.key?(node)
      raise ArgumentError, "weight expects a Node, but received #{node.inspect}" unless node.is_a?(Nokogiri::XML::Node)

      calculated_weight =
        if node.text? || node.cdata? || node.comment?
          # Clamp the length to 1: Math.log(0) is -Infinity, which would make
          # every ancestor's weight infinite as well.
          1 + Math.log([node.content.length, 1].max)
        elsif node.type == Nokogiri::XML::Node::ENTITY_REF_NODE
          1
        elsif node.element?
          node.children.inject(1) { |sum, child| sum + weight(child) }
        else
          raise ArgumentError, "weight expects an element, text, cdata or comment node, but received #{node.class}"
        end

      @weights[node] = calculated_weight
    end

    # Returns the monogram of +node+, computing the node's signature first
    # if that has not been done yet.
    def monogram(node=@node)
      return @monograms[node] if @monograms.key?(node)
      signature(node)
      @monograms[node]
    end

    # Overrides the signature of +node+. The node is appended to the index
    # entry for +value+; any earlier index entry is left in place.
    def set_signature(node, value) # :nodoc: for testing
      (@nodes[value] ||= []) << node
      @signatures[node] = value
    end

    # Overrides the cached weight of +node+.
    def set_weight(node, value) # :nodoc: for testing
      @weights[node] = value
    end

    private

    # SHA1 hex digest of the arguments: a single array argument or several
    # arguments are joined with SEP first; a single non-array argument is
    # digested as is.
    def hashify(*args)
      if args.length == 1
        if args.first.is_a?(Array)
          Digest::SHA1.hexdigest args.first.join(SEP)
        else
          Digest::SHA1.hexdigest args.first
        end
      else
        Digest::SHA1.hexdigest args.join(SEP)
      end
    end
  end
end
@@ -0,0 +1,400 @@
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

# Specs for Lorax::FastMatcher. The `xml { ... }` document builder and the
# assert_*_match_exists helpers are not defined here; presumably they come
# from spec_helper.
describe Lorax::FastMatcher do
  describe ".new" do
    context "normal usage" do
      it "takes two arguments" do
        proc { Lorax::FastMatcher.new(xml{root}) }.should raise_error(ArgumentError)
        proc { Lorax::FastMatcher.new(xml{root}, xml{root}) }.should_not raise_error(ArgumentError)
      end

      it "builds a MatchSet for the documents" do
        doc1 = xml { root1 }
        doc2 = xml { root2 }
        mock.proxy(Lorax::MatchSet).new(doc1, doc2, anything)
        Lorax::FastMatcher.new(doc1, doc2)
      end
    end

    context "dependency injection" do
      it "takes an optional third argument for dependency injection" do
        proc { Lorax::FastMatcher.new(xml{root}, xml{root}, {:foo => :bar}) }.should_not raise_error(ArgumentError)
      end

      it "will use the value of ':matcher_match_set' for @match_set" do
        matcher = Lorax::FastMatcher.new(xml{root}, xml{root}, {:matcher_match_set => :foo})
        matcher.match_set.should == :foo
      end
    end
  end

  describe "basic node matching" do
    context "simple matches" do
      before do
        @doc1 = xml { root1 {
            a1
            b1
          } }
        @doc2 = xml { root2 {
            a1
            b2
          } }
        @signature1 = Lorax::Signature.new(@doc1.root)
        @signature1.set_signature(@doc1.at_css("root1"), "root1")
        @signature1.set_signature(@doc1.at_css("a1"), "a1")
        @signature1.set_signature(@doc1.at_css("b1"), "b1")
        @signature2 = Lorax::Signature.new(@doc2.root)
        @signature2.set_signature(@doc2.at_css("root2"), "root2")
        @signature2.set_signature(@doc2.at_css("a1"), "a1")
        @signature2.set_signature(@doc2.at_css("b2"), "b2")
      end

      it "matches identical nodes" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1,
          :match_set_signature2 => @signature2).match
        assert_perfect_match_exists match_set, @doc1.at_css("a1"), @doc2.at_css("a1")
      end

      it "does not match different nodes" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1,
          :match_set_signature2 => @signature2).match
        assert_no_match_exists match_set, @doc1.at_css("b1"), @doc2.at_css("b2")
      end
    end

    context "sibling matches" do
      it "matches all identical siblings" do
        doc1 = xml { root {
            a1_1 ; a1_3 ; a1_5
          } }
        doc2 = xml { root {
            a2_1 ; a2_2 ; a2_3 ; a2_4 ; a2_5
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature1.set_signature(doc1.at_css("a1_1"), "a1")
        signature1.set_signature(doc1.at_css("a1_3"), "a3")
        signature1.set_signature(doc1.at_css("a1_5"), "a5")

        signature2 = Lorax::Signature.new(doc2.root)
        signature2.set_signature(doc2.at_css("a2_1"), "a1")
        signature2.set_signature(doc2.at_css("a2_3"), "a3")
        signature2.set_signature(doc2.at_css("a2_5"), "a5")

        match_set = Lorax::FastMatcher.new(doc1, doc2,
          :match_set_signature1 => signature1, :match_set_signature2 => signature2).match
        assert_perfect_match_exists match_set, doc1.at_css("a1_1"), doc2.at_css("a2_1")
        assert_perfect_match_exists match_set, doc1.at_css("a1_3"), doc2.at_css("a2_3")
        assert_perfect_match_exists match_set, doc1.at_css("a1_5"), doc2.at_css("a2_5")
      end
    end

    context "matching children of an unmatched node" do
      it "matches those children" do
        doc1 = xml { root {
            a1 {
              b1 ; b2
            }
          } }
        doc2 = xml { root {
            a2 {
              b1 ; b2
            }
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature1.set_signature(doc1.at_css("a1"), "a1")
        signature1.set_signature(doc1.at_css("b1"), "b1")
        signature1.set_signature(doc1.at_css("b2"), "b2")

        signature2 = Lorax::Signature.new(doc2.root)
        # a2 belongs to doc2, so its signature is registered on signature2.
        signature2.set_signature(doc2.at_css("a2"), "a2")
        signature2.set_signature(doc2.at_css("b1"), "b1")
        signature2.set_signature(doc2.at_css("b2"), "b2")

        match_set = Lorax::FastMatcher.new(doc1, doc2,
          :match_set_signature1 => signature1, :match_set_signature2 => signature2).match
        assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
        assert_perfect_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
      end
    end

    context "nested matches" do
      before do
        @doc1 = xml { root1 { a1 { b1 } } }
        @doc2 = xml { root2 { a1 { b1 } } }
        @signature1 = Lorax::Signature.new(@doc1.root)
        @signature1.set_signature(@doc1.at_css("a1"), "a1")
        @signature1.set_signature(@doc1.at_css("b1"), "b1")
        @signature2 = Lorax::Signature.new(@doc2.root)
        @signature2.set_signature(@doc2.at_css("a1"), "a1")
        # NOTE(review): doc2's b1 is given the signature "b2" here - confirm
        # this is intentional and not a typo for "b1".
        @signature2.set_signature(@doc2.at_css("b1"), "b2")
      end

      it "matches the root nodes of the largest identical subtree" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1, :match_set_signature2 => @signature2).match
        assert_perfect_match_exists match_set, @doc1.at_css("a1"), @doc2.at_css("a1")
      end

      it "does not match children of identical match nodes" do
        match_set = Lorax::FastMatcher.new(@doc1, @doc2,
          :match_set_signature1 => @signature1, :match_set_signature2 => @signature2).match
        assert_no_match_exists match_set, @doc1.at_css("b1"), @doc2.at_css("b1")
      end
    end
  end

  describe "forced parent matching" do
    before do
      stub.instance_of(Lorax::FastMatcher).propagate_to_parent # we're not testing propagation to parent
    end

    it "forces a match when parent names are the same but attributes are different" do
      doc1 = xml { root { a1(:foo => "bar") { b1 } } }
      doc2 = xml { root { a1(:bazz => "quux") { b1 } } }
      match_set = Lorax::FastMatcher.new(doc1, doc2).match
      assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
      assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
    end

    it "forces a match when parent names and attributes are the same but siblings are different" do
      doc1 = xml { root { a1(:foo => "bar") { b1 ; b2 } } }
      doc2 = xml { root { a1(:foo => "bar") { b1 ; b3 } } }
      match_set = Lorax::FastMatcher.new(doc1, doc2).match
      assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
      assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
    end

    describe "subsequent forced child matching" do
      it "force matches a uniquely-named sibling" do
        doc1 = xml { root { a1 {
              b2 "goodbye"
              b1 "hello"
              b3
              b4
            } } }
        doc2 = xml { root { a1 {
              b2 "good boy"
              b1 "hello"
              b3 "something"
              b4 { c1 }
            } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
        assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
        assert_forced_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
        assert_forced_match_exists match_set, doc1.at_css("b3"), doc2.at_css("b3")
        assert_forced_match_exists match_set, doc1.at_css("b4"), doc2.at_css("b4")
      end

      it "force matches recursively" do
        doc1 = xml { root { a1 ; a2 { b2 "hello" } } }
        doc2 = xml { root { a1 ; a2 { b2 "goodbye" } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
        assert_forced_match_exists match_set, doc1.at_css("a2"), doc2.at_css("a2")
        assert_forced_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
        assert_forced_match_exists match_set, doc1.at_xpath("//b2/text()"), doc2.at_xpath("//b2/text()")
      end

      it "should match uniquely-named unmatched children" do
        doc1 = xml { root {
            a1 "hello"
            a2 "goodbye"
            a3 "natch"
          } }
        doc2 = xml { root {
            a1 "hello"
            a3 "not"
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
        assert_forced_match_exists match_set, doc1.at_css("a3"), doc2.at_css("a3")
      end

      it "should match same-named children in the same position, even if they are not uniquely named" do
        doc1 = xml { root {
            a1 {
              text "hello"
              b1 "foo"
              text "goodbye"
            }
          } }
        doc2 = xml { root {
            a1 {
              text "bonjour"
              b1 "foo"
              text "au revoir"
            }
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_forced_match_exists match_set, doc1.at_xpath("/root/a1/text()[1]"), doc2.at_xpath("/root/a1/text()[1]")
        assert_forced_match_exists match_set, doc1.at_xpath("/root/a1/text()[2]"), doc2.at_xpath("/root/a1/text()[2]")
      end

      it "large subtree matches force more parent matches than smaller subtree matches" do
        small_doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f2
                    } } } } } } }
        small_doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f3
                    } } } } } } }
        large_doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f2
                    } } } } } } }
        large_doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1
                      f3
                    } } } } } } }

        small_signature1 = Lorax::Signature.new(small_doc1.root)
        small_signature1.set_weight(small_doc1.at_css("f1"), 1)
        small_signature2 = Lorax::Signature.new(small_doc2.root)
        small_signature2.set_weight(small_doc2.at_css("f1"), 1)
        large_signature1 = Lorax::Signature.new(large_doc1.root)
        large_signature1.set_weight(large_doc1.at_css("f1"), 10)
        large_signature2 = Lorax::Signature.new(large_doc2.root)
        large_signature2.set_weight(large_doc2.at_css("f1"), 10)

        small_match_set = Lorax::FastMatcher.new(small_doc1, small_doc2,
          :match_set_signature1 => small_signature1, :match_set_signature2 => small_signature2).match
        large_match_set = Lorax::FastMatcher.new(large_doc1, large_doc2,
          :match_set_signature1 => large_signature1, :match_set_signature2 => large_signature2).match

        assert_forced_match_exists small_match_set, small_doc1.at_css("e1"), small_doc2.at_css("e1")
        assert_no_match_exists small_match_set, small_doc1.at_css("d1"), small_doc2.at_css("d1")

        assert_forced_match_exists large_match_set, large_doc1.at_css("e1"), large_doc2.at_css("e1")
        assert_forced_match_exists large_match_set, large_doc1.at_css("d1"), large_doc2.at_css("d1")
        assert_forced_match_exists large_match_set, large_doc1.at_css("c1"), large_doc2.at_css("c1")
        assert_no_match_exists large_match_set, large_doc1.at_css("b1"), large_doc2.at_css("b1")
      end
    end
  end

  describe "propagating matches to unmatched parents based on children's matches' parents" do
    context "when there is only one child" do
      it "should match parents all the way up the tree" do
        doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1 "hello"
                      f2
                    } } } } } } }
        doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
                      f1 "hello"
                      f3
                    } } } } } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_perfect_match_exists match_set, doc1.at_css("f1"), doc2.at_css("f1")
        %w[e1 d1 c1 b1 a1 root].each do |node_name|
          assert_forced_match_exists match_set, doc1.at_css(node_name), doc2.at_css(node_name)
        end
      end
    end

    context "there are many possible children" do
      it "should match via children with largest weight" do
        doc1 = xml { root {
            a1 { b1 ; b2 }
          } }
        doc2 = xml { root {
            a1 { b1 ; b3 }
            a1 { b2 ; b4 }
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature2 = Lorax::Signature.new(doc2.root)
        signature1.set_weight(doc1.at_css("b1"), 10)
        signature1.set_weight(doc1.at_css("b2"), 100)
        signature2.set_weight(doc2.at_css("b1"), 10)
        signature2.set_weight(doc2.at_css("b2"), 100)

        match_set = Lorax::MatchSet.new(doc1, doc2, :match_set_signature1 => signature1, :match_set_signature2 => signature2)
        match_set.add Lorax::Match.new(doc1.at_css("b1"), doc2.at_css("b1"))
        match_set.add Lorax::Match.new(doc1.at_css("b2"), doc2.at_css("b2"))

        match_set = Lorax::FastMatcher.new(doc1, doc2, :matcher_match_set => match_set).match
        assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_xpath("//a1[2]")
      end
    end
  end

  describe "choosing the best among multiple possible matches" do
    context "no match's parent is same-named" do
      it "we don't care which node we match, just pick one" do
        doc1 = xml { root {
            a1 { b1 }
          } }
        doc2 = xml { root {
            a2 { b1 }
            a3 { b1 }
          } }
        signature1 = Lorax::Signature.new(doc1.root)
        signature2 = Lorax::Signature.new(doc2.root)
        signature1.set_signature(doc1.at_xpath("//b1"), "b1")
        signature2.set_signature(doc2.at_xpath("//a2/b1"), "b1")
        signature2.set_signature(doc2.at_xpath("//a3/b1"), "b1")
        match_set = Lorax::FastMatcher.new(doc1, doc2,
          :match_set_signature1 => signature1, :match_set_signature2 => signature2).match
        match_set.match(doc1.at_css("b1")).other(doc1.at_css("b1")).name.should == "b1"
      end
    end

    context "one match's parent is same-named" do
      it "matches the node with the same-named parent" do
        doc1 = xml { root {
            a2 { b1 ; b2 }
          } }
        doc2 = xml { root {
            a1 { b1 }
            a2 { b1 }
            a3 { b1 }
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_forced_match_exists match_set, doc1.at_css("a2"), doc2.at_css("a2")
      end
    end

    context "multiple identical nodes exist in both documents" do
      it "should create one-to-one match relationships" do
        doc1 = xml { root1 {
            a1 ; a1 ; a1
          } }
        doc2 = xml { root2 {
            a1 ; a1
          } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        [doc1, doc2].each do |doc|
          others = doc.css("a1").collect do |node|
            m = match_set.match(node)
            m ? m.pair.last : nil
          end
          others.uniq.length.should == others.length
        end
      end
    end

    context "multiple matches' parents are same-named" do
      it "matches the node with the same-named grandparent" do
        doc1 = xml { root {
            wrap2 {
              a1 { b1 { 10.times { c1 "hello there" } } ; b2 }
            } } }
        doc2 = xml { root {
            wrap1 {
              a1 { b1 { 10.times { c1 "hello there" } } }
            }
            wrap2 {
              a1 { b1 { 10.times { c1 "hello there" } } }
            }
            wrap3 {
              a1 { b1 { 10.times { c1 "hello there" } } }
            } } }
        match_set = Lorax::FastMatcher.new(doc1, doc2).match
        assert_forced_match_exists match_set, doc1.at_css("wrap2"), doc2.at_css("wrap2")
      end
    end
  end
end