lorax 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/CHANGELOG.rdoc +6 -0
- data/LICENSE +20 -0
- data/Manifest.txt +37 -0
- data/README.rdoc +70 -0
- data/Rakefile +50 -0
- data/TODO +40 -0
- data/bin/lorax +15 -0
- data/lib/lorax.rb +35 -0
- data/lib/lorax/delta.rb +28 -0
- data/lib/lorax/delta/delete_delta.rb +19 -0
- data/lib/lorax/delta/insert_delta.rb +22 -0
- data/lib/lorax/delta/modify_delta.rb +51 -0
- data/lib/lorax/delta_set.rb +24 -0
- data/lib/lorax/delta_set_generator.rb +36 -0
- data/lib/lorax/fast_matcher.rb +108 -0
- data/lib/lorax/match.rb +22 -0
- data/lib/lorax/match_set.rb +30 -0
- data/lib/lorax/signature.rb +101 -0
- data/spec/fast_matcher_spec.rb +400 -0
- data/spec/files/Michael-Dalessio-200909.html +147 -0
- data/spec/files/Michael-Dalessio-201001.html +153 -0
- data/spec/files/slashdot-1.html +3236 -0
- data/spec/files/slashdot-2.html +3216 -0
- data/spec/files/slashdot-3.html +3228 -0
- data/spec/files/slashdot-4.html +3278 -0
- data/spec/integration/lorax_spec.rb +130 -0
- data/spec/match_spec.rb +54 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +42 -0
- data/spec/unit/delta/delete_delta_spec.rb +50 -0
- data/spec/unit/delta/insert_delta_spec.rb +109 -0
- data/spec/unit/delta/modify_delta_spec.rb +94 -0
- data/spec/unit/delta_set_generator_spec.rb +157 -0
- data/spec/unit/delta_set_spec.rb +40 -0
- data/spec/unit/lorax_spec.rb +9 -0
- data/spec/unit/match_set_spec.rb +93 -0
- data/spec/unit/signature_spec.rb +473 -0
- metadata +216 -0
- metadata.gz.sig +3 -0
@@ -0,0 +1,108 @@
|
|
1
|
+
module Lorax
  # Pairs up the nodes of two documents and records the pairs in a match set.
  #
  # Matching starts at the root of the first document. A node whose signature
  # is also present in the second document is recorded as a perfect match;
  # same-named ancestors, parents and children of matched nodes are then
  # paired up as ordinary (non-perfect) matches.
  class FastMatcher
    attr_accessor :match_set

    # doc1, doc2           - the two documents; matching starts at doc1.root.
    # dependency_injection - optional hash. :matcher_match_set supplies the
    #                        match set to use; without it a MatchSet is built
    #                        from both documents and this same hash.
    def initialize(doc1, doc2, dependency_injection={})
      @document1 = doc1
      @document2 = doc2
      @match_set = dependency_injection[:matcher_match_set] || MatchSet.new(doc1, doc2, dependency_injection)
    end

    # Runs the matcher from the root of the first document.
    #
    # Returns the match set, or nil when that root already had a match
    # before the call.
    def match
      match_node @document1.root
    end

    private

    # Matches node1 and, when node1 itself has no candidate in the second
    # document, its descendants. Returns the match set (nil if node1 was
    # already matched on entry).
    def match_node(node1)
      return if match_set.match(node1)
      signature1 = match_set.signature1.signature(node1) # assumes node1 is in signature1

      # Filter into a NEW array: the list handed back by signature2.nodes may
      # be the signature's own index, which must not be modified from here.
      candidates = (match_set.signature2.nodes(signature1) || []).reject { |node| match_set.match(node) }

      if candidates.empty?
        node1.children.each do |child|
          match_node(child)
        end
        # Matching the children may have matched node1 as one of their ancestors.
        match = match_set.match(node1)
        propagate_to_children(match.pair.first, match.pair.last) if match
      else
        match_candidate(node1, candidates)
      end
      propagate_to_parent(node1) unless match_set.match(node1)
      match_set
    end

    # Builds the ancestral match trail for every candidate and records the
    # longest one (the first such trail when several have the same length).
    def match_candidate(node1, candidates)
      ancestral_matches = candidates.map do |node2|
        ancestral_match(node1, node2, depth(node2, match_set.signature2))
      end
      longest_trail = ancestral_matches.max { |a, b| a.length <=> b.length }
      longest_trail.each do |ancestral_match|
        match_set.add ancestral_match
      end
    end

    # Returns a list of matches that starts with a perfect match between node1
    # and node2, followed by one non-perfect match per pair of same-named
    # ancestors. The walk up both parent chains takes at most max_depth steps
    # and stops at the first name mismatch or once a document node is reached.
    def ancestral_match(node1, node2, max_depth)
      matches = [Match.new(node1, node2, :perfect => true)]
      curr1, curr2 = node1.parent, node2.parent
      1.upto(max_depth) do
        break unless curr1.name == curr2.name && !curr1.is_a?(Nokogiri::XML::Document)
        matches << Match.new(curr1, curr2)
        curr1, curr2 = curr1.parent, curr2.parent
      end
      matches
    end

    # Matches the still-unmatched node1 with the parent of one of its
    # children's matches, provided that parent has the same name as node1.
    # Children are tried from the largest signature1 weight to the smallest;
    # the first suitable one wins.
    def propagate_to_parent(node1)
      node1.children.sort_by { |child| match_set.signature1.weight(child) }.reverse_each do |child|
        next unless match = match_set.match(child)
        match_parent = match.pair.last.parent
        if match_parent.name == node1.name
          match_set.add Match.new(node1, match_parent)
          return
        end
      end
    end

    # Recursively matches the unmatched children of two matched nodes.
    #
    # Children are grouped by name. When a name occurs exactly once on each
    # side, those two children are matched; otherwise same-named children are
    # matched only when they occupy the same position in their parents' child
    # lists.
    def propagate_to_children(node1, node2)
      children_set1 = collect_children_by_name(node1.children)
      children_set2 = collect_children_by_name(node2.children)

      children_set1.each do |name, children1|
        children2 = children_set2[name]
        next unless children2

        if children1.length == 1 && children2.length == 1
          match_set.add Match.new(children1.first, children2.first)
          propagate_to_children children1.first, children2.first
        else
          children1.each do |child1|
            position = node1.children.index(child1)
            children2.each do |child2|
              next unless position == node2.children.index(child2)
              match_set.add Match.new(child1, child2)
              propagate_to_children child1, child2
            end
          end
        end
      end
    end

    # Maximum number of ancestor levels an identical-subtree match may pull
    # along: 1 + ln(sig.size) * sig.weight(node) / sig.weight, truncated to an
    # integer (sig.weight without an argument is the signature's default).
    def depth(node, sig)
      depth = 1 + Math.log(sig.size) * sig.weight(node) / sig.weight
      depth.to_i
    end

    # Groups the not-yet-matched nodes of node_set by name.
    # Returns a hash of name => [node, ...].
    def collect_children_by_name(node_set)
      collection = {}
      node_set.each do |child|
        next if match_set.match(child)
        (collection[child.name] ||= []) << child
      end
      collection
    end
  end
end
|
data/lib/lorax/match.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module Lorax
  # A pairing of two nodes, optionally flagged as a perfect match.
  class Match
    attr_accessor :pair

    # node1, node2 - the two nodes being paired, stored in that order.
    # options      - optional hash; a truthy :perfect marks the match perfect.
    def initialize(node1, node2, options={})
      @perfect = !!options[:perfect]
      @pair    = [node1, node2]
    end

    # True only when the match was created with a truthy :perfect option.
    def perfect?
      @perfect
    end

    # Returns the opposite member of the pair for the given node, or nil when
    # the node is neither the first nor the last member.
    def other(node)
      head, tail = pair.first, pair.last
      case node
      when head then tail
      when tail then head
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Lorax
  # Holds the matches found between two documents, indexed by node, together
  # with the signature of each document.
  class MatchSet
    attr_accessor :signature1, :signature2

    # doc1, doc2           - the two documents being compared.
    # dependency_injection - optional hash; :match_set_signature1 and
    #                        :match_set_signature2 replace the signatures that
    #                        are otherwise built from each document's root.
    def initialize(doc1, doc2, dependency_injection={})
      @document1 = doc1
      @document2 = doc2
      @signature1 = dependency_injection[:match_set_signature1] || Lorax::Signature.new(@document1.root)
      @signature2 = dependency_injection[:match_set_signature2] || Lorax::Signature.new(@document2.root)
      @matches = {} # node => match, for both members of every added pair
    end

    # Returns the match recorded for node, or nil when there is none.
    def match(node)
      @matches[node]
    end

    # Debugging aid: returns a sorted list with one
    # [first node path, last node path, perfect?] entry per distinct match.
    # Writes nothing to stdout.
    def matches
      @matches.values.uniq.collect { |m| [m.pair.first.path, m.pair.last.path, m.perfect?] }.sort
    end

    # Records match under both nodes of its pair, replacing whatever match
    # either node had before.
    def add(match)
      match.pair.each { |node| @matches[node] = match }
    end

    # Hands this match set to DeltaSetGenerator.generate_delta_set and returns
    # its result.
    def to_delta_set
      DeltaSetGenerator.generate_delta_set(self)
    end
  end
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
|
3
|
+
module Lorax
  # Computes and caches, for every node of a tree, a SHA1 signature of the
  # node's subtree, a "monogram" (the same hash without the children) and a
  # weight, and indexes nodes by signature.
  class Signature
    SEP = "\0"

    # node - optional root node; when given, its whole subtree is processed
    #        immediately.
    def initialize(node=nil)
      @signatures = {} # node => signature
      @monograms  = {} # node => monogram (signature not including children)
      @nodes      = {} # signature => [node, ...]
      @weights    = {} # node => weight
      @size       = 0
      @node       = node
      signature(node) if node
    end

    # The node this signature was created with (nil if none).
    def root
      @node
    end

    # With a signature: the list of nodes recorded under it (nil if none).
    # Without: the root node.
    def nodes(sig=nil)
      sig ? @nodes[sig] : @node
    end

    # Number of nodes whose signature has been calculated.
    def size
      @size
    end

    # Returns the signature of node, calculating and caching it (and that of
    # every descendant) on first use.
    #
    # Raises ArgumentError when node is not a Nokogiri::XML::Node or is of an
    # unsupported node type.
    def signature(node=@node)
      return @signatures[node] if @signatures.key?(node)
      raise ArgumentError, "signature expects a Node, but received #{node.inspect}" unless node.is_a?(Nokogiri::XML::Node)

      if node.text? || node.cdata? || node.comment?
        monogram = digest = hashify(node.content)
      elsif node.type == Nokogiri::XML::Node::ENTITY_REF_NODE
        monogram = digest = hashify(node.to_html)
      elsif node.element?
        children_sig = hashify(node.children.collect { |child| signature(child) })
        attr_sig     = hashify(node.attributes.sort.collect { |k, v| [k, v.value] }.flatten)
        monogram     = hashify(node.name, attr_sig)
        digest       = hashify(node.name, attr_sig, children_sig)
      else
        raise ArgumentError, "signature expects an element, text, cdata or comment node, but received #{node.class}"
      end

      @size += 1
      weight(node)

      (@nodes[digest] ||= []) << node
      @monograms[node] = monogram
      @signatures[node] = digest
    end

    # Returns the weight of node, calculating and caching it on first use.
    #
    # Text, CDATA and comment nodes weigh 1 + ln(content length), entity
    # references weigh 1, and elements weigh 1 plus the weights of their
    # children. Empty content is treated like one-character content so the
    # weight is always finite (ln(0) would be -Infinity and would propagate
    # to every ancestor).
    #
    # Raises ArgumentError when node is not a Nokogiri::XML::Node or is of an
    # unsupported node type.
    def weight(node=@node)
      return @weights[node] if @weights.key?(node)
      raise ArgumentError, "weight expects a Node, but received #{node.inspect}" unless node.is_a?(Nokogiri::XML::Node)

      calculated_weight =
        if node.text? || node.cdata? || node.comment?
          1 + Math.log([node.content.length, 1].max)
        elsif node.type == Nokogiri::XML::Node::ENTITY_REF_NODE
          1
        elsif node.element?
          node.children.inject(1) { |sum, child| sum + weight(child) }
        else
          raise ArgumentError, "weight expects an element, text, cdata or comment node, but received #{node.class}"
        end

      @weights[node] = calculated_weight
    end

    # Returns the monogram of node, calculating the node's signature first
    # when nothing is cached yet.
    def monogram(node=@node)
      return @monograms[node] if @monograms.key?(node)
      signature(node)
      @monograms[node]
    end

    def set_signature(node, value) # :nodoc: for testing
      (@nodes[value] ||= []) << node
      @signatures[node] = value
    end

    def set_weight(node, value) # :nodoc: for testing
      @weights[node] = value
    end

    private

    # SHA1 hex digest of the arguments. A lone non-array argument is hashed
    # as is; a lone array, or several arguments, are joined with SEP first.
    def hashify(*args)
      subject = args.length == 1 ? args.first : args
      subject = subject.join(SEP) if subject.is_a?(Array)
      Digest::SHA1.hexdigest subject
    end
  end
end
|
@@ -0,0 +1,400 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe Lorax::FastMatcher do
|
4
|
+
describe ".new" do
|
5
|
+
context "normal usage" do
|
6
|
+
it "takes two arguments" do
|
7
|
+
proc { Lorax::FastMatcher.new(xml{root}) }.should raise_error(ArgumentError)
|
8
|
+
proc { Lorax::FastMatcher.new(xml{root}, xml{root}) }.should_not raise_error(ArgumentError)
|
9
|
+
end
|
10
|
+
|
11
|
+
it "builds a MatchSet for the documents" do
|
12
|
+
doc1 = xml { root1 }
|
13
|
+
doc2 = xml { root2 }
|
14
|
+
mock.proxy(Lorax::MatchSet).new(doc1, doc2, anything)
|
15
|
+
Lorax::FastMatcher.new(doc1, doc2)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
context "dependency injection" do
|
20
|
+
it "takes an optional third argument for dependency injection" do
|
21
|
+
proc { Lorax::FastMatcher.new(xml{root}, xml{root}, {:foo => :bar}) }.should_not raise_error(ArgumentError)
|
22
|
+
end
|
23
|
+
|
24
|
+
it "will use the value of ':matcher_match_set' for @match_set" do
|
25
|
+
matcher = Lorax::FastMatcher.new(xml{root}, xml{root}, {:matcher_match_set => :foo})
|
26
|
+
matcher.match_set.should == :foo
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe "basic node matching" do
|
32
|
+
context "simple matches" do
|
33
|
+
before do
|
34
|
+
@doc1 = xml { root1 {
|
35
|
+
a1
|
36
|
+
b1
|
37
|
+
} }
|
38
|
+
@doc2 = xml { root2 {
|
39
|
+
a1
|
40
|
+
b2
|
41
|
+
} }
|
42
|
+
@signature1 = Lorax::Signature.new(@doc1.root)
|
43
|
+
@signature1.set_signature(@doc1.at_css("root1"), "root1")
|
44
|
+
@signature1.set_signature(@doc1.at_css("a1"), "a1")
|
45
|
+
@signature1.set_signature(@doc1.at_css("b1"), "b1")
|
46
|
+
@signature2 = Lorax::Signature.new(@doc2.root)
|
47
|
+
@signature2.set_signature(@doc2.at_css("root2"), "root2")
|
48
|
+
@signature2.set_signature(@doc2.at_css("a1"), "a1")
|
49
|
+
@signature2.set_signature(@doc2.at_css("b2"), "b2")
|
50
|
+
end
|
51
|
+
|
52
|
+
it "matches identical nodes" do
|
53
|
+
match_set = Lorax::FastMatcher.new(@doc1, @doc2,
|
54
|
+
:match_set_signature1 => @signature1,
|
55
|
+
:match_set_signature2 => @signature2).match
|
56
|
+
assert_perfect_match_exists match_set, @doc1.at_css("a1"), @doc2.at_css("a1")
|
57
|
+
end
|
58
|
+
|
59
|
+
it "does not match different nodes" do
|
60
|
+
match_set = Lorax::FastMatcher.new(@doc1, @doc2,
|
61
|
+
:match_set_signature1 => @signature1,
|
62
|
+
:match_set_signature2 => @signature2).match
|
63
|
+
assert_no_match_exists match_set, @doc1.at_css("b1"), @doc2.at_css("b2")
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
context "sibling matches" do
|
68
|
+
it "matches all identical siblings" do
|
69
|
+
doc1 = xml { root {
|
70
|
+
a1_1 ; a1_3 ; a1_5
|
71
|
+
} }
|
72
|
+
doc2 = xml { root {
|
73
|
+
a2_1 ; a2_2 ; a2_3 ; a2_4 ; a2_5
|
74
|
+
} }
|
75
|
+
signature1 = Lorax::Signature.new(doc1.root)
|
76
|
+
signature1.set_signature(doc1.at_css("a1_1"), "a1")
|
77
|
+
signature1.set_signature(doc1.at_css("a1_3"), "a3")
|
78
|
+
signature1.set_signature(doc1.at_css("a1_5"), "a5")
|
79
|
+
|
80
|
+
signature2 = Lorax::Signature.new(doc2.root)
|
81
|
+
signature2.set_signature(doc2.at_css("a2_1"), "a1")
|
82
|
+
signature2.set_signature(doc2.at_css("a2_3"), "a3")
|
83
|
+
signature2.set_signature(doc2.at_css("a2_5"), "a5")
|
84
|
+
|
85
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2,
|
86
|
+
:match_set_signature1 => signature1, :match_set_signature2 => signature2).match
|
87
|
+
assert_perfect_match_exists match_set, doc1.at_css("a1_1"), doc2.at_css("a2_1")
|
88
|
+
assert_perfect_match_exists match_set, doc1.at_css("a1_3"), doc2.at_css("a2_3")
|
89
|
+
assert_perfect_match_exists match_set, doc1.at_css("a1_5"), doc2.at_css("a2_5")
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
context "matching children of an unmatched node" do
  # The parents (a1 / a2) are named differently and get different signatures,
  # so only their identical children are expected to be matched.
  it "matches those children" do
    doc1 = xml { root {
        a1 {
          b1 ; b2
        }
      } }
    doc2 = xml { root {
        a2 {
          b1 ; b2
        }
      } }
    signature1 = Lorax::Signature.new(doc1.root)
    signature1.set_signature(doc1.at_css("a1"), "a1")
    signature1.set_signature(doc1.at_css("b1"), "b1")
    signature1.set_signature(doc1.at_css("b2"), "b2")

    signature2 = Lorax::Signature.new(doc2.root)
    # a2 belongs to doc2, so it is registered in signature2 (not signature1).
    signature2.set_signature(doc2.at_css("a2"), "a2")
    signature2.set_signature(doc2.at_css("b1"), "b1")
    signature2.set_signature(doc2.at_css("b2"), "b2")

    match_set = Lorax::FastMatcher.new(doc1, doc2,
      :match_set_signature1 => signature1, :match_set_signature2 => signature2).match
    assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
    assert_perfect_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
  end
end
|
121
|
+
|
122
|
+
context "nested matches" do
|
123
|
+
before do
|
124
|
+
@doc1 = xml { root1 { a1 { b1 } } }
|
125
|
+
@doc2 = xml { root2 { a1 { b1 } } }
|
126
|
+
@signature1 = Lorax::Signature.new(@doc1.root)
|
127
|
+
@signature1.set_signature(@doc1.at_css("a1"), "a1")
|
128
|
+
@signature1.set_signature(@doc1.at_css("b1"), "b1")
|
129
|
+
@signature2 = Lorax::Signature.new(@doc2.root)
|
130
|
+
@signature2.set_signature(@doc2.at_css("a1"), "a1")
|
131
|
+
@signature2.set_signature(@doc2.at_css("b1"), "b2")
|
132
|
+
end
|
133
|
+
|
134
|
+
it "matches the root nodes of the largest identical subtree" do
|
135
|
+
match_set = Lorax::FastMatcher.new(@doc1, @doc2,
|
136
|
+
:match_set_signature1 => @signature1, :match_set_signature2 => @signature2).match
|
137
|
+
assert_perfect_match_exists match_set, @doc1.at_css("a1"), @doc2.at_css("a1")
|
138
|
+
end
|
139
|
+
|
140
|
+
it "does not match children of identical match nodes" do
|
141
|
+
match_set = Lorax::FastMatcher.new(@doc1, @doc2,
|
142
|
+
:match_set_signature1 => @signature1, :match_set_signature2 => @signature2).match
|
143
|
+
assert_no_match_exists match_set, @doc1.at_css("b1"), @doc2.at_css("b1")
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
describe "forced parent matching" do
|
149
|
+
before do
|
150
|
+
stub.instance_of(Lorax::FastMatcher).propagate_to_parent # we're not testing propagation to parent
|
151
|
+
end
|
152
|
+
|
153
|
+
it "forces a match when parent names are the same but attributes are different" do
|
154
|
+
doc1 = xml { root { a1(:foo => "bar") { b1 } } }
|
155
|
+
doc2 = xml { root { a1(:bazz => "quux") { b1 } } }
|
156
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2).match
|
157
|
+
assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
|
158
|
+
assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
|
159
|
+
end
|
160
|
+
|
161
|
+
it "forces a match when parent names and attributes are the same but siblings are different" do
|
162
|
+
doc1 = xml { root { a1(:foo => "bar") { b1 ; b2 } } }
|
163
|
+
doc2 = xml { root { a1(:foo => "bar") { b1 ; b3 } } }
|
164
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2).match
|
165
|
+
assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
|
166
|
+
assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
|
167
|
+
end
|
168
|
+
|
169
|
+
describe "subsequent forced child matching" do
|
170
|
+
it "force matches a uniquely-named sibling" do
|
171
|
+
doc1 = xml { root { a1 {
|
172
|
+
b2 "goodbye"
|
173
|
+
b1 "hello"
|
174
|
+
b3
|
175
|
+
b4
|
176
|
+
} } }
|
177
|
+
doc2 = xml { root { a1 {
|
178
|
+
b2 "good boy"
|
179
|
+
b1 "hello"
|
180
|
+
b3 "something"
|
181
|
+
b4 { c1 }
|
182
|
+
} } }
|
183
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2).match
|
184
|
+
assert_perfect_match_exists match_set, doc1.at_css("b1"), doc2.at_css("b1")
|
185
|
+
assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
|
186
|
+
assert_forced_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
|
187
|
+
assert_forced_match_exists match_set, doc1.at_css("b3"), doc2.at_css("b3")
|
188
|
+
assert_forced_match_exists match_set, doc1.at_css("b4"), doc2.at_css("b4")
|
189
|
+
end
|
190
|
+
|
191
|
+
it "force matches recursively" do
|
192
|
+
doc1 = xml { root { a1 ; a2 { b2 "hello" } } }
|
193
|
+
doc2 = xml { root { a1 ; a2 { b2 "goodbye" } } }
|
194
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2).match
|
195
|
+
assert_perfect_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
|
196
|
+
assert_forced_match_exists match_set, doc1.at_css("a2"), doc2.at_css("a2")
|
197
|
+
assert_forced_match_exists match_set, doc1.at_css("b2"), doc2.at_css("b2")
|
198
|
+
assert_forced_match_exists match_set, doc1.at_xpath("//b2/text()"), doc2.at_xpath("//b2/text()")
|
199
|
+
end
|
200
|
+
|
201
|
+
it "should match uniquely-named unmatched children" do
|
202
|
+
doc1 = xml { root {
|
203
|
+
a1 "hello"
|
204
|
+
a2 "goodbye"
|
205
|
+
a3 "natch"
|
206
|
+
} }
|
207
|
+
doc2 = xml { root {
|
208
|
+
a1 "hello"
|
209
|
+
a3 "not"
|
210
|
+
} }
|
211
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2).match
|
212
|
+
assert_perfect_match_exists match_set, doc1.at_css("a1"), doc2.at_css("a1")
|
213
|
+
assert_forced_match_exists match_set, doc1.at_css("a3"), doc2.at_css("a3")
|
214
|
+
end
|
215
|
+
|
216
|
+
it "should match same-named children in the same position, even if they are not uniquely named" do
|
217
|
+
doc1 = xml { root {
|
218
|
+
a1 {
|
219
|
+
text "hello"
|
220
|
+
b1 "foo"
|
221
|
+
text "goodbye"
|
222
|
+
}
|
223
|
+
} }
|
224
|
+
doc2 = xml { root {
|
225
|
+
a1 {
|
226
|
+
text "bonjour"
|
227
|
+
b1 "foo"
|
228
|
+
text "au revoir"
|
229
|
+
}
|
230
|
+
} }
|
231
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2).match
|
232
|
+
assert_forced_match_exists match_set, doc1.at_xpath("/root/a1/text()[1]"), doc2.at_xpath("/root/a1/text()[1]")
|
233
|
+
assert_forced_match_exists match_set, doc1.at_xpath("/root/a1/text()[2]"), doc2.at_xpath("/root/a1/text()[2]")
|
234
|
+
end
|
235
|
+
|
236
|
+
it "large subtree matches force more parent matches than smaller subtree matches" do
|
237
|
+
small_doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
|
238
|
+
f1
|
239
|
+
f2
|
240
|
+
} } } } } } }
|
241
|
+
small_doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
|
242
|
+
f1
|
243
|
+
f3
|
244
|
+
} } } } } } }
|
245
|
+
large_doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
|
246
|
+
f1
|
247
|
+
f2
|
248
|
+
} } } } } } }
|
249
|
+
large_doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
|
250
|
+
f1
|
251
|
+
f3
|
252
|
+
} } } } } } }
|
253
|
+
|
254
|
+
small_signature1 = Lorax::Signature.new(small_doc1.root)
|
255
|
+
small_signature1.set_weight(small_doc1.at_css("f1"), 1)
|
256
|
+
small_signature2 = Lorax::Signature.new(small_doc2.root)
|
257
|
+
small_signature2.set_weight(small_doc2.at_css("f1"), 1)
|
258
|
+
large_signature1 = Lorax::Signature.new(large_doc1.root)
|
259
|
+
large_signature1.set_weight(large_doc1.at_css("f1"), 10)
|
260
|
+
large_signature2 = Lorax::Signature.new(large_doc2.root)
|
261
|
+
large_signature2.set_weight(large_doc2.at_css("f1"), 10)
|
262
|
+
|
263
|
+
small_match_set = Lorax::FastMatcher.new(small_doc1, small_doc2,
|
264
|
+
:match_set_signature1 => small_signature1, :match_set_signature2 => small_signature2).match
|
265
|
+
large_match_set = Lorax::FastMatcher.new(large_doc1, large_doc2,
|
266
|
+
:match_set_signature1 => large_signature1, :match_set_signature2 => large_signature2).match
|
267
|
+
|
268
|
+
assert_forced_match_exists small_match_set, small_doc1.at_css("e1"), small_doc2.at_css("e1")
|
269
|
+
assert_no_match_exists small_match_set, small_doc1.at_css("d1"), small_doc2.at_css("d1")
|
270
|
+
|
271
|
+
assert_forced_match_exists large_match_set, large_doc1.at_css("e1"), large_doc2.at_css("e1")
|
272
|
+
assert_forced_match_exists large_match_set, large_doc1.at_css("d1"), large_doc2.at_css("d1")
|
273
|
+
assert_forced_match_exists large_match_set, large_doc1.at_css("c1"), large_doc2.at_css("c1")
|
274
|
+
assert_no_match_exists large_match_set, large_doc1.at_css("b1"), large_doc2.at_css("b1")
|
275
|
+
end
|
276
|
+
end
|
277
|
+
end
|
278
|
+
|
279
|
+
describe "propagating matches to unmatched parents based on children's matches' parents" do
|
280
|
+
context "when there is only one child" do
|
281
|
+
it "should match parents all the way up the tree" do
|
282
|
+
doc1 = xml { root { a1 { b1 { c1 { d1 { e1 {
|
283
|
+
f1 "hello"
|
284
|
+
f2
|
285
|
+
} } } } } } }
|
286
|
+
doc2 = xml { root { a1 { b1 { c1 { d1 { e1 {
|
287
|
+
f1 "hello"
|
288
|
+
f3
|
289
|
+
} } } } } } }
|
290
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2).match
|
291
|
+
assert_perfect_match_exists match_set, doc1.at_css("f1"), doc2.at_css("f1")
|
292
|
+
%w[e1 d1 c1 b1 a1 root].each do |node_name|
|
293
|
+
assert_forced_match_exists match_set, doc1.at_css(node_name), doc2.at_css(node_name)
|
294
|
+
end
|
295
|
+
end
|
296
|
+
end
|
297
|
+
|
298
|
+
context "there are many possible children" do
|
299
|
+
it "should match via children with largest weight" do
|
300
|
+
doc1 = xml { root {
|
301
|
+
a1 { b1 ; b2 }
|
302
|
+
} }
|
303
|
+
doc2 = xml { root {
|
304
|
+
a1 { b1 ; b3 }
|
305
|
+
a1 { b2 ; b4 }
|
306
|
+
} }
|
307
|
+
signature1 = Lorax::Signature.new(doc1.root)
|
308
|
+
signature2 = Lorax::Signature.new(doc2.root)
|
309
|
+
signature1.set_weight(doc1.at_css("b1"), 10)
|
310
|
+
signature1.set_weight(doc1.at_css("b2"), 100)
|
311
|
+
signature2.set_weight(doc2.at_css("b1"), 10)
|
312
|
+
signature2.set_weight(doc2.at_css("b2"), 100)
|
313
|
+
|
314
|
+
match_set = Lorax::MatchSet.new(doc1, doc2, :match_set_signature1 => signature1, :match_set_signature2 => signature2)
|
315
|
+
match_set.add Lorax::Match.new(doc1.at_css("b1"), doc2.at_css("b1"))
|
316
|
+
match_set.add Lorax::Match.new(doc1.at_css("b2"), doc2.at_css("b2"))
|
317
|
+
|
318
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2, :matcher_match_set => match_set).match
|
319
|
+
assert_forced_match_exists match_set, doc1.at_css("a1"), doc2.at_xpath("//a1[2]")
|
320
|
+
end
|
321
|
+
end
|
322
|
+
end
|
323
|
+
|
324
|
+
describe "choosing the best among multiple possible matches" do
|
325
|
+
context "no match's parent is same-named" do
|
326
|
+
it "we don't care which node we match, just pick one" do
|
327
|
+
doc1 = xml { root {
|
328
|
+
a1 { b1 }
|
329
|
+
} }
|
330
|
+
doc2 = xml { root {
|
331
|
+
a2 { b1 }
|
332
|
+
a3 { b1 }
|
333
|
+
} }
|
334
|
+
signature1 = Lorax::Signature.new(doc1.root)
|
335
|
+
signature2 = Lorax::Signature.new(doc2.root)
|
336
|
+
signature1.set_signature(doc1.at_xpath("//b1"), "b1")
|
337
|
+
signature2.set_signature(doc2.at_xpath("//a2/b1"), "b1")
|
338
|
+
signature2.set_signature(doc2.at_xpath("//a3/b1"), "b1")
|
339
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2,
|
340
|
+
:match_set_signature1 => signature1, :match_set_signature2 => signature2).match
|
341
|
+
match_set.match(doc1.at_css("b1")).other(doc1.at_css("b1")).name.should == "b1"
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
context "one match's parent is same-named" do
|
346
|
+
it "matches the node with the same-named parent" do
|
347
|
+
doc1 = xml { root {
|
348
|
+
a2 { b1 ; b2 }
|
349
|
+
} }
|
350
|
+
doc2 = xml { root {
|
351
|
+
a1 { b1 }
|
352
|
+
a2 { b1 }
|
353
|
+
a3 { b1 }
|
354
|
+
} }
|
355
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2).match
|
356
|
+
assert_forced_match_exists match_set, doc1.at_css("a2"), doc2.at_css("a2")
|
357
|
+
end
|
358
|
+
end
|
359
|
+
|
360
|
+
context "multiple identical nodes exist in both documents" do
  it "should create one-to-one match relationships" do
    doc1 = xml { root1 {
        a1 ; a1 ; a1
      } }
    doc2 = xml { root2 {
        a1 ; a1
      } }
    match_set = Lorax::FastMatcher.new(doc1, doc2).match
    [doc1, doc2].each do |doc|
      # Collect each node's counterpart in the OTHER document. Using
      # pair.last here would return the doc2 nodes themselves on the doc2
      # pass, making the uniqueness check below trivially true.
      others = doc.css("a1").collect do |node|
        m = match_set.match(node)
        m ? m.other(node) : nil
      end
      others.uniq.length.should == others.length
    end
  end
end
|
378
|
+
|
379
|
+
context "multiple matches' parents are same-named" do
|
380
|
+
it "matches the node with the same-named grandparent" do
|
381
|
+
doc1 = xml { root {
|
382
|
+
wrap2 {
|
383
|
+
a1 { b1 { 10.times { c1 "hello there" } } ; b2 }
|
384
|
+
} } }
|
385
|
+
doc2 = xml { root {
|
386
|
+
wrap1 {
|
387
|
+
a1 { b1 { 10.times { c1 "hello there" } } }
|
388
|
+
}
|
389
|
+
wrap2 {
|
390
|
+
a1 { b1 { 10.times { c1 "hello there" } } }
|
391
|
+
}
|
392
|
+
wrap3 {
|
393
|
+
a1 { b1 { 10.times { c1 "hello there" } } }
|
394
|
+
} } }
|
395
|
+
match_set = Lorax::FastMatcher.new(doc1, doc2).match
|
396
|
+
assert_forced_match_exists match_set, doc1.at_css("wrap2"), doc2.at_css("wrap2")
|
397
|
+
end
|
398
|
+
end
|
399
|
+
end
|
400
|
+
end
|