lorax 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/CHANGELOG.rdoc +6 -0
- data/LICENSE +20 -0
- data/Manifest.txt +37 -0
- data/README.rdoc +70 -0
- data/Rakefile +50 -0
- data/TODO +40 -0
- data/bin/lorax +15 -0
- data/lib/lorax.rb +35 -0
- data/lib/lorax/delta.rb +28 -0
- data/lib/lorax/delta/delete_delta.rb +19 -0
- data/lib/lorax/delta/insert_delta.rb +22 -0
- data/lib/lorax/delta/modify_delta.rb +51 -0
- data/lib/lorax/delta_set.rb +24 -0
- data/lib/lorax/delta_set_generator.rb +36 -0
- data/lib/lorax/fast_matcher.rb +108 -0
- data/lib/lorax/match.rb +22 -0
- data/lib/lorax/match_set.rb +30 -0
- data/lib/lorax/signature.rb +101 -0
- data/spec/fast_matcher_spec.rb +400 -0
- data/spec/files/Michael-Dalessio-200909.html +147 -0
- data/spec/files/Michael-Dalessio-201001.html +153 -0
- data/spec/files/slashdot-1.html +3236 -0
- data/spec/files/slashdot-2.html +3216 -0
- data/spec/files/slashdot-3.html +3228 -0
- data/spec/files/slashdot-4.html +3278 -0
- data/spec/integration/lorax_spec.rb +130 -0
- data/spec/match_spec.rb +54 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +42 -0
- data/spec/unit/delta/delete_delta_spec.rb +50 -0
- data/spec/unit/delta/insert_delta_spec.rb +109 -0
- data/spec/unit/delta/modify_delta_spec.rb +94 -0
- data/spec/unit/delta_set_generator_spec.rb +157 -0
- data/spec/unit/delta_set_spec.rb +40 -0
- data/spec/unit/lorax_spec.rb +9 -0
- data/spec/unit/match_set_spec.rb +93 -0
- data/spec/unit/signature_spec.rb +473 -0
- metadata +216 -0
- metadata.gz.sig +3 -0
@@ -0,0 +1,157 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
describe Lorax::DeltaSetGenerator do
|
4
|
+
describe "#generate_delta_set" do
|
5
|
+
# Examples describing when generate_delta_set produces InsertDeltas. Every
# example builds a MatchSet from two documents, generates the delta set and
# counts the deltas that are Lorax::InsertDelta instances.
context "InsertDeltas" do
  it "should be generated for an atomic node without a match" do
    old_doc = xml { root1 }
    new_doc = xml { root2 }
    matches = Lorax::MatchSet.new(old_doc, new_doc)
    generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
    generated.deltas.grep(Lorax::InsertDelta).length.should == 1
  end

  it "should be generated for a subtree without a match" do
    old_doc = xml { root1 }
    new_doc = xml { root2 { a1 ; a2 "hello" } }
    matches = Lorax::MatchSet.new(old_doc, new_doc)
    generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
    generated.deltas.grep(Lorax::InsertDelta).length.should == 1
  end

  it "should not be generated for children of a perfect match" do
    old_doc = xml { root { a1 { b1 "hello" } } }
    new_doc = xml { root { a1 { b1 "hello" } ; a2 } }
    matches = Lorax::MatchSet.new(old_doc, new_doc)
    # The roots are matched imperfectly; the a1 subtrees are a perfect match.
    matches.add Lorax::Match.new(old_doc.at_css("root"), new_doc.at_css("root"))
    matches.add Lorax::Match.new(old_doc.at_css("a1"), new_doc.at_css("a1"), :perfect => true)
    generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
    generated.deltas.grep(Lorax::InsertDelta).length.should == 1 # a2
  end

  it "should be generated for siblings without a match" do
    old_doc = xml { root {
      a1 "hello"
      a3 "goodbye"
      a5 "again"
    } }
    new_doc = xml { root {
      a1 "hello"
      a2 "middleman"
      a3 "goodbye"
      a4 "good boy"
      a5 "again"
    } }
    matches = Lorax::MatchSet.new(old_doc, new_doc)
    # Perfect matches are registered first, in document order, then the root.
    %w[a1 a3 a5].each do |tag|
      matches.add Lorax::Match.new(old_doc.at_css(tag), new_doc.at_css(tag), :perfect => true)
    end
    matches.add Lorax::Match.new(old_doc.at_css("root"), new_doc.at_css("root"))
    generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
    generated.deltas.grep(Lorax::InsertDelta).length.should == 2
  end
end
|
54
|
+
|
55
|
+
# Examples describing when generate_delta_set produces ModifyDeltas. Matches
# are added to the MatchSet by hand so each example controls which nodes are
# perfectly or imperfectly matched.
context "ModifyDeltas" do
  it "should be generated for nodes that are imperfectly matched" do
    old_doc = xml { root(:foo => :bar) }
    new_doc = xml { root(:foo => :quux) }
    matches = Lorax::MatchSet.new(old_doc, new_doc)
    matches.add Lorax::Match.new(old_doc.at_css("root"), new_doc.at_css("root"))
    generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
    generated.deltas.grep(Lorax::ModifyDelta).length.should == 1
  end

  context "imperfect self-same match with children" do
    it "should handle children as expected" do
      old_doc = xml { root {
        a1
        a2
        a4(:foo => :bar)
      } }
      new_doc = xml { root {
        a2
        a3
        a4(:foo => :quux)
      } }
      matches = Lorax::MatchSet.new(old_doc, new_doc)
      matches.add Lorax::Match.new(old_doc.root, new_doc.root)
      matches.add Lorax::Match.new(old_doc.at_css("a2"), new_doc.at_css("a2"), :perfect => true)
      matches.add Lorax::Match.new(old_doc.at_css("a4"), new_doc.at_css("a4"))
      generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
      generated.deltas.grep(Lorax::InsertDelta).length.should == 1 # a3
      generated.deltas.grep(Lorax::ModifyDelta).length.should == 1 # a4
      generated.deltas.grep(Lorax::DeleteDelta).length.should == 1 # a1
    end
  end

  it "should not be generated for nodes that are imperfectly matched but are self-same" do
    old_doc = xml { root(:foo => :bar) { a1 } }
    new_doc = xml { root(:foo => :bar) { a2 } }
    matches = Lorax::MatchSet.new(old_doc, new_doc)
    matches.add Lorax::Match.new(old_doc.at_css("root"), new_doc.at_css("root"))
    generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
    generated.deltas.grep(Lorax::ModifyDelta).length.should == 0
  end

  it "should not be generated for nodes that are perfectly matched" do
    old_doc = xml { root }
    new_doc = xml { root }
    matches = Lorax::MatchSet.new(old_doc, new_doc)
    matches.add Lorax::Match.new(old_doc.at_css("root"), new_doc.at_css("root"), :perfect => true)
    generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
    generated.deltas.grep(Lorax::ModifyDelta).length.should == 0
  end
end
|
106
|
+
|
107
|
+
# Examples describing when generate_delta_set produces DeleteDeltas. These
# mirror the InsertDelta examples with the two documents swapped.
context "DeleteDeltas" do
  it "should be generated for an atomic node without a match" do
    old_doc = xml { root1 }
    new_doc = xml { root2 }
    matches = Lorax::MatchSet.new(old_doc, new_doc)
    generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
    generated.deltas.grep(Lorax::DeleteDelta).length.should == 1
  end

  it "should be generated for a subtree without a match" do
    old_doc = xml { root1 { a1 ; a2 "hello" } }
    new_doc = xml { root2 }
    matches = Lorax::MatchSet.new(old_doc, new_doc)
    generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
    generated.deltas.grep(Lorax::DeleteDelta).length.should == 1
  end

  it "should not be generated for children of a deleted node" do
    old_doc = xml { root { a1 { b1 "hello" } ; a2 } }
    new_doc = xml { root { a2 } }
    matches = Lorax::MatchSet.new(old_doc, new_doc)
    matches.add Lorax::Match.new(old_doc.at_css("root"), new_doc.at_css("root"))
    matches.add Lorax::Match.new(old_doc.at_css("a2"), new_doc.at_css("a2"), :perfect => true)
    generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
    generated.deltas.grep(Lorax::DeleteDelta).length.should == 1 # a1
  end

  it "should be generated for siblings without a match" do
    old_doc = xml { root {
      a1 "hello"
      a2 "middleman"
      a3 "goodbye"
      a4 "good boy"
      a5 "again"
    } }
    new_doc = xml { root {
      a1 "hello"
      a3 "goodbye"
      a5 "again"
    } }
    matches = Lorax::MatchSet.new(old_doc, new_doc)
    # Perfect matches are registered first, in document order, then the root.
    %w[a1 a3 a5].each do |tag|
      matches.add Lorax::Match.new(old_doc.at_css(tag), new_doc.at_css(tag), :perfect => true)
    end
    matches.add Lorax::Match.new(old_doc.at_css("root"), new_doc.at_css("root"))
    generated = Lorax::DeltaSetGenerator.generate_delta_set(matches)
    generated.deltas.grep(Lorax::DeleteDelta).length.should == 2
  end
end
|
156
|
+
end
|
157
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
# Specs for Lorax::DeltaSet: deltas are kept in insertion order, #apply hands
# a duplicate of the document to #apply!, and #apply! applies every delta in
# the order it was added.
describe Lorax::DeltaSet do
  describe "#add / #deltas" do
    it "appends to and returns an ordered list of deltas" do
      set = Lorax::DeltaSet.new
      [:foo, :bar].each { |entry| set.add entry }
      set.deltas.should == [:foo, :bar]
    end
  end

  describe "#apply" do
    it "calls apply! on a duplicate document" do
      set = Lorax::DeltaSet.new
      source_doc = Nokogiri::XML::Document.new
      # dup is mocked to return :foo so the argument to apply! can be checked.
      mock(source_doc).dup { :foo }
      mock(set).apply!(:foo)
      set.apply source_doc
    end
  end

  describe "#apply!" do
    it "invokes apply! on each delta in order" do
      target = xml { root }
      set = Lorax::DeltaSet.new
      delta_a = Lorax::InsertDelta.new(:foo, :bar, :quux)
      delta_b = Lorax::InsertDelta.new(:foo, :bar, :quux)
      [delta_a, delta_b].each { |delta| set.add delta }

      # Each mocked apply! records its own marker so call order is observable.
      calls = []
      mock(delta_a).apply!(target) { calls << :delta1 }
      mock(delta_b).apply!(target) { calls << :delta2 }
      set.apply!(target)

      calls.should == [:delta1, :delta2]
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
describe Lorax::MatchSet do
|
4
|
+
# Specs for the MatchSet constructor: arity, Signature construction for both
# document roots, and the optional third argument used to inject signatures.
describe "#new" do
  context "normal usage" do
    it "takes two arguments" do
      proc { Lorax::MatchSet.new(xml{root}) }.should raise_error(ArgumentError)
      # Use the bare negative matcher: `should_not raise_error(ArgumentError)`
      # would also pass if the constructor raised some other exception.
      proc { Lorax::MatchSet.new(xml{root}, xml{root}) }.should_not raise_error
    end

    it "builds a Signature for each document root" do
      doc1 = xml { root1 }
      doc2 = xml { root2 }
      # Proxy mocks let the real Signature.new run while asserting it is called
      # once with each document's root.
      mock.proxy(Lorax::Signature).new(doc1.root)
      mock.proxy(Lorax::Signature).new(doc2.root)
      Lorax::MatchSet.new(doc1, doc2)
    end
  end

  context "with dependency injection" do
    it "takes an optional third argument for dependency injection" do
      # Bare negative matcher again, so any exception fails the example.
      proc { Lorax::MatchSet.new(xml{root}, xml{root}, {:foo => :bar}) }.should_not raise_error
    end

    it "will use the value of ':match_set_signature1' for @signature1" do
      match_set = Lorax::MatchSet.new(xml{root}, xml{root}, {:match_set_signature1 => :foo})
      match_set.signature1.should == :foo
    end

    it "will use the value of ':match_set_signature2' for @signature2" do
      match_set = Lorax::MatchSet.new(xml{root}, xml{root}, {:match_set_signature2 => :foo})
      match_set.signature2.should == :foo
    end
  end
end
|
36
|
+
|
37
|
+
# The first signature must exist and be rooted at the first document's root.
describe "#signature1" do
  it "returns the Signature of the first document" do
    first_doc = xml { root1 }
    second_doc = xml { root2 }
    pairing = Lorax::MatchSet.new(first_doc, second_doc)
    pairing.signature1.should_not be_nil
    pairing.signature1.root.should == first_doc.root
  end
end
|
46
|
+
|
47
|
+
# The second signature must exist and be rooted at the second document's root.
describe "#signature2" do
  it "returns the Signature of the second document" do
    first_doc = xml { root1 }
    second_doc = xml { root2 }
    pairing = Lorax::MatchSet.new(first_doc, second_doc)
    pairing.signature2.should_not be_nil
    pairing.signature2.root.should == second_doc.root
  end
end
|
56
|
+
|
57
|
+
# A Match added to the set can be looked up through either of its two nodes;
# a node with no registered match yields nil.
describe "#match and #add" do
  before do
    @doc1 = xml { root1 { a1 } }
    @doc2 = xml { root2 { a1 } }
    @match_set = Lorax::MatchSet.new(@doc1, @doc2)
  end

  context "when there is a match for the node" do
    before do
      @match = Lorax::Match.new(@doc1.at_css("a1"), @doc2.at_css("a1"))
      @match_set.add @match
    end

    it "returns the match" do
      [@doc1, @doc2].each do |document|
        @match_set.match(document.at_css("a1")).should == @match
      end
    end
  end

  context "when there is no match" do
    it "returns nil" do
      [@doc1, @doc2].each do |document|
        @match_set.match(document.at_css("a1")).should be_nil
      end
    end
  end
end
|
83
|
+
|
84
|
+
# to_delta_set is expected to hand the MatchSet itself to the generator.
describe "#to_delta_set" do
  it "invokes DeltaSetGenerator.generate_delta_set on itself" do
    first_doc = xml { root1 }
    second_doc = xml { root2 }
    pairing = Lorax::MatchSet.new(first_doc, second_doc)
    mock(Lorax::DeltaSetGenerator).generate_delta_set(pairing)
    pairing.to_delta_set
  end
end
|
93
|
+
end
|
@@ -0,0 +1,473 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
describe Lorax::Signature do
|
4
|
+
# Spec helper: builds a Lorax::Signature for each node and expects the two
# resulting signature values to be equal.
def assert_node_signature_equal(node1, node2)
  first_signature = Lorax::Signature.new(node1).signature
  second_signature = Lorax::Signature.new(node2).signature
  first_signature.should == second_signature
end
|
7
|
+
|
8
|
+
# Spec helper: builds a Lorax::Signature for each node and expects the two
# resulting signature values to differ.
def assert_node_signature_not_equal(node1, node2)
  first_signature = Lorax::Signature.new(node1).signature
  second_signature = Lorax::Signature.new(node2).signature
  first_signature.should_not == second_signature
end
|
11
|
+
|
12
|
+
# Constructor behaviour: the node argument is optional, and #signature is
# only expected to be invoked when a non-nil argument is given.
describe ".new" do
  it "accepts nil" do
    lambda { Lorax::Signature.new }.should_not raise_error
  end

  it "does not call signature if param is nil" do
    mock.instance_of(Lorax::Signature).signature(42).never
    Lorax::Signature.new(nil)
  end

  it "calls signature if a param is non-nil" do
    # signature is mocked, so 42 never has to be a real node here.
    mock.instance_of(Lorax::Signature).signature(42).once
    Lorax::Signature.new(42)
  end
end
|
27
|
+
|
28
|
+
# #root reports the node the Signature was constructed with.
describe "#root" do
  it "returns the subtree root" do
    tree = xml { root { a1 "hello" } }
    subtree_top = tree.at_css("a1")
    Lorax::Signature.new(subtree_top).root.should == subtree_top
  end
end
|
36
|
+
|
37
|
+
# #nodes maps a signature value back to every node in the tree that has it.
describe "#nodes" do
  it "returns an array of nodes matching the signature" do
    tree = xml { root {
      a1 "hello"
      a1 "hello"
      a1 "hello"
    } }
    identical = tree.css("a1")
    tree_sig = Lorax::Signature.new(tree.root)
    leaf_sig = Lorax::Signature.new(identical.first)
    # =~ compares array contents without regard to order.
    tree_sig.nodes(leaf_sig.signature).should =~ identical.to_a
  end
end
|
50
|
+
|
51
|
+
# #size counts every node in the signed subtree, text nodes included.
describe "#size" do
  it "returns the total number of nodes in the subtree" do
    doc = xml { root { a1 "hello" } }
    doc_sig = Lorax::Signature.new(doc.root)
    doc_sig.size.should == 3 # root, a1, hello
  end
end
|
59
|
+
|
60
|
+
# set_signature records a node => signature pairing that is visible through
# both #signature (by node) and #nodes (by signature value).
describe "#set_signature" do
  it "assigns values such that signature and nodes return the proper thing" do
    sig = Lorax::Signature.new
    # An array of pairs keeps the assignment order explicit.
    pairings = [[:foo, "a"], [:bar, "a"], [:bazz, "b"]]
    pairings.each { |key, value| sig.set_signature(key, value) }
    pairings.each { |key, value| sig.signature(key).should == value }
    sig.nodes("a").should =~ [:foo, :bar]
    sig.nodes("b").should == [:bazz]
  end
end
|
73
|
+
|
74
|
+
# set_weight stores a value that #weight later returns for the same key.
describe "#set_weight" do
  it "assigns values such that weight returns the proper thing" do
    sig = Lorax::Signature.new
    stored_weight = 2.2
    sig.set_weight(:foo, stored_weight)
    sig.weight(:foo).should == stored_weight
  end
end
|
81
|
+
|
82
|
+
# Specs for Signature#signature: argument handling per node type, caching of
# computed hashes, and which structural differences (name, text, child order,
# attributes) make two nodes' signatures equal or different.
describe "#signature" do
  context "passed no argument" do
    it "returns the subtree root's signature" do
      doc = xml { root { a1 "hello" } }
      sig = Lorax::Signature.new(doc.root)
      sig.signature.should == sig.signature(doc.root)
    end
  end

  context "passed a node" do
    it "returns the node's signature" do
      doc = xml { root { a1 "hello" } }
      node = doc.at_css("a1")
      doc_sig = Lorax::Signature.new(doc.root)
      node_sig = Lorax::Signature.new(node)
      doc_sig.signature(node).should == node_sig.signature
    end
  end

  context "passed a non-Node" do
    it "raises an error" do
      proc { Lorax::Signature.new.signature(42) }.should raise_error(ArgumentError, /signature expects a Node/)
    end
  end

  context "passed a cdata Node" do
    it "treats it like a leaf text node" do
      doc = xml { root { cdata "hello" } }
      node = doc.root.children.first
      doc_sig = Lorax::Signature.new(doc.root)
      node_sig = Lorax::Signature.new(node)
      doc_sig.signature(node).should == node_sig.signature
    end
  end

  context "passed a comment Node" do
    it "treats it like a leaf text node" do
      doc = Nokogiri::XML "<root><!-- hello --></root>"
      node = doc.root.children.first
      doc_sig = Lorax::Signature.new(doc.root)
      node_sig = Lorax::Signature.new(node)
      doc_sig.signature(node).should == node_sig.signature
    end
  end

  context "passed an entity reference Node" do
    it "treats it like a leaf text node" do
      # The span must contain an entity (&nbsp;), not a literal blank, for its
      # first child to be an entity reference rather than a plain text node.
      # NOTE(review): presumably the parser keeps the undeclared entity as a
      # reference because an external DTD is named -- confirm with Nokogiri.
      doc = Nokogiri::XML %q(<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><span>&nbsp;</span></html>)
      node = doc.at_css("span").children.first
      doc_sig = Lorax::Signature.new(doc.root)
      node_sig = Lorax::Signature.new(node)
      doc_sig.signature(node).should == node_sig.signature
    end
  end

  context "passed an invalid Node" do
    it "raises an error" do
      doc = xml { root { a1("foo" => "bar") } }
      # attributes is a name => attribute-node hash; take the attribute node.
      attr = doc.at_css("a1").attributes.first.last
      proc { Lorax::Signature.new.signature(attr) }.should raise_error(ArgumentError, /signature expects an element/)
    end
  end

  it "hashes each node only once" do
    doc = xml { root { a1 { b1 { c1 "hello" } } } }
    # Five nodes in the tree: root, a1, b1, c1 and the text node.
    mock.proxy.instance_of(Lorax::Signature).signature(anything).times(5)
    Lorax::Signature.new.signature(doc.root)
  end

  it "caches signatures" do
    doc = xml { root { a1 { b1 { c1 "hello" } } } }
    # Five calls for the first traversal plus one for the repeated top-level
    # call, which is expected to be answered without recursing again.
    mock.proxy.instance_of(Lorax::Signature).signature(anything).times(6)
    sig = Lorax::Signature.new
    sig.signature(doc.root)
    sig.signature(doc.root)
  end

  it "calculates weights along the way" do
    doc = xml { root { a1 } }
    node = doc.at_css "a1"
    sig = Lorax::Signature.new
    mock(sig).weight(node)
    sig.signature(node)
  end

  context "identical text nodes" do
    it "have equal signatures" do
      doc = xml { root {
        span "hello"
        span "hello"
      } }
      assert_node_signature_equal(*doc.css("span").collect { |n| n.children.first })
    end
  end

  context "different text nodes" do
    it "have inequal signatures" do
      doc = xml { root {
        span "hello"
        span "goodbye"
      } }
      assert_node_signature_not_equal(*doc.css("span").collect { |n| n.children.first })
    end
  end

  context "elements with same name (with no attributes and no content)" do
    it "have equal signatures" do
      doc = xml { root { a1 ; a1 } }
      assert_node_signature_equal(*doc.css("a1"))
    end
  end

  context "elements with different names" do
    it "have inequal signatures" do
      doc = xml { root { a1 ; a2 } }
      assert_node_signature_not_equal doc.at_css("a1"), doc.at_css("a2")
    end
  end

  context "same elements in different docs" do
    it "have equal signatures" do
      doc1 = xml { root { a1 } }
      doc2 = xml { root { a1 } }
      assert_node_signature_equal doc1.at_css("a1"), doc2.at_css("a1")
    end
  end

  context "elements with same name and content (with no attributes)" do
    context "and content is the same" do
      it "have equal signatures" do
        doc = xml { root {
          a1 "hello"
          a1 "hello"
        } }
        assert_node_signature_equal(*doc.css("a1"))
      end
    end

    context "and content is not the same" do
      it "have inequal signatures" do
        doc = xml { root {
          a1 "hello"
          a1 "goodbye"
        } }
        assert_node_signature_not_equal(*doc.css("a1"))
      end
    end
  end

  context "elements with same name and children (with no attributes)" do
    context "and children are in the same order" do
      it "have equal signatures" do
        doc = xml { root {
          a1 { b1 ; b2 }
          a1 { b1 ; b2 }
        } }
        assert_node_signature_equal(*doc.css("a1"))
      end
    end

    context "and children are not in the same order" do
      it "have inequal signatures" do
        doc = xml { root {
          a1 { b1 ; b2 }
          a1 { b2 ; b1 }
        } }
        assert_node_signature_not_equal(*doc.css("a1"))
      end
    end
  end

  context "elements with same name and same attributes (with no content)" do
    it "have equal signatures" do
      doc = xml { root {
        a1("foo" => "bar", "bazz" => "quux")
        a1("foo" => "bar", "bazz" => "quux")
      } }
      assert_node_signature_equal(*doc.css("a1"))
    end
  end

  context "elements with same name and different attributes (with no content)" do
    it "have inequal signatures" do
      doc = xml { root {
        a1("foo" => "bar", "bazz" => "quux")
        a1("foo" => "123", "bazz" => "456")
      } }
      assert_node_signature_not_equal(*doc.css("a1"))
    end
  end

  context "attributes reverse-engineered to be similar" do
    it "have inequal signatures" do
      # Moving the separator between attribute name and value must not let two
      # different attributes produce the same signature.
      doc = xml { root {
        a1("foo" => "bar#{Lorax::Signature::SEP}quux")
        a1("foo#{Lorax::Signature::SEP}bar" => "quux")
      } }
      assert_node_signature_not_equal(*doc.css("a1"))
    end
  end

  context "HTML" do
    it "should be case-insensitive" do
      doc1 = Nokogiri::HTML <<-EOH
        <html><body>
          <DIV FOO="BAR">hello</DIV>
        </body></html>
      EOH
      doc2 = Nokogiri::HTML <<-EOH
        <html><body>
          <div foo="BAR">hello</div>
        </body></html>
      EOH
      assert_node_signature_equal(doc1.at_css("body").children.first,
                                  doc2.at_css("body").children.first)
    end
  end
end
|
302
|
+
|
303
|
+
# Specs for Signature#weight: argument handling per node type, caching, and
# the weighting rules asserted below (an element weighs 1 plus its children;
# text weight grows with the log of its length).
describe "#weight" do
  context "passed no argument" do
    it "returns the subtree root's weight" do
      doc = xml { root { a1 { b1 { c1 { d1 } } } } }
      doc_sig = Lorax::Signature.new(doc.root)
      doc_sig.weight.should == 5
    end
  end

  context "passed a node" do
    it "returns the node's weight" do
      doc = xml { root { a1 "hello" } }
      node = doc.at_css("a1")
      doc_sig = Lorax::Signature.new(doc.root)
      node_sig = Lorax::Signature.new(node)
      doc_sig.weight(node).should == node_sig.weight
    end
  end

  context "passed a non-Node" do
    it "raises an error" do
      proc { Lorax::Signature.new.weight(42) }.should raise_error(ArgumentError, /weight expects a Node/)
    end
  end

  context "passed a cdata Node" do
    it "treats it like a leaf text node" do
      doc = xml { root { cdata "hello" } }
      node = doc.root.children.first
      doc_sig = Lorax::Signature.new(doc.root)
      node_sig = Lorax::Signature.new(node)
      doc_sig.weight(node).should == node_sig.weight
    end
  end

  context "passed a comment Node" do
    it "treats it like a leaf text node" do
      doc = Nokogiri::XML "<root><!-- hello --></root>"
      node = doc.root.children.first
      doc_sig = Lorax::Signature.new(doc.root)
      node_sig = Lorax::Signature.new(node)
      doc_sig.weight(node).should == node_sig.weight
    end
  end

  context "passed an entity reference Node" do
    it "treats it like a leaf text node" do
      # The span must contain an entity (&nbsp;), not a literal blank, for its
      # first child to be an entity reference rather than a plain text node.
      # NOTE(review): presumably the parser keeps the undeclared entity as a
      # reference because an external DTD is named -- confirm with Nokogiri.
      doc = Nokogiri::XML %q(<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><span>&nbsp;</span></html>)
      node = doc.at_css("span").children.first
      doc_sig = Lorax::Signature.new(doc.root)
      node_sig = Lorax::Signature.new(node)
      doc_sig.weight(node).should == node_sig.weight
    end
  end

  context "passed an invalid Node" do
    it "raises an error" do
      doc = xml { root { a1("foo" => "bar") } }
      # attributes is a name => attribute-node hash; take the attribute node.
      attr = doc.at_css("a1").attributes.first.last
      proc { Lorax::Signature.new.weight(attr) }.should raise_error(ArgumentError, /weight expects an element/)
    end
  end

  it "weighs each node only once" do
    doc = xml { root { a1 { b1 { c1 "hello" } } } }
    # Five nodes in the tree: root, a1, b1, c1 and the text node.
    mock.proxy.instance_of(Lorax::Signature).weight(anything).times(5)
    Lorax::Signature.new.weight(doc.root)
  end

  it "caches weights" do
    doc = xml { root { a1 { b1 { c1 "hello" } } } }
    # Five calls for the first traversal plus one for the repeated top-level
    # call, which is expected to be answered without recursing again.
    mock.proxy.instance_of(Lorax::Signature).weight(anything).times(6)
    sig = Lorax::Signature.new
    sig.weight(doc.root)
    sig.weight(doc.root)
  end

  it "weighs empty nodes with no children as 1" do
    doc = xml { root { a1 } }
    sig = Lorax::Signature.new(doc.root)
    sig.weight(doc.at_css("a1")).should == 1
  end

  it "weighs nodes with children as 1 + sum(weight(children))" do
    doc = xml { root {
      a1 { b1 ; b2 }
      a2 { b1 ; b2 ; b3 ; b4 }
    } }
    sig = Lorax::Signature.new(doc.root)
    sig.weight(doc.at_css("a1")).should == 3
    sig.weight(doc.at_css("a2")).should == 5
    sig.weight.should == 9
  end

  describe "text nodes" do
    it "scores as 1 + log(length)" do
      doc = xml { root {
        a1 "x"
        a2("x" * 500)
        a3("x" * 50_000)
      } }
      sig = Lorax::Signature.new(doc.root)
      # The weights are taken on the enclosing elements, so each expectation
      # is 1 (element) + 1 + log(length) (its text child).
      sig.weight(doc.at_css("a1")).should be_close(2, 0.0005)
      sig.weight(doc.at_css("a2")).should be_close(2 + Math.log(500), 0.0005)
      sig.weight(doc.at_css("a3")).should be_close(2 + Math.log(50_000), 0.0005)
    end
  end
end
|
414
|
+
|
415
|
+
# Specs for Signature#monogram. Per the examples below, a text node's monogram
# equals its signature, while an element's monogram depends on its name and
# attributes but not on its children.
describe "#monogram" do
  context "passed no argument" do
    it "returns the subtree root's monogram" do
      doc = xml { root { a1(:foo => :bar) } }
      sig = Lorax::Signature.new(doc.root)
      sig.monogram.should == sig.monogram(doc.root)
    end
  end

  context "passed a node" do
    it "returns the node's monogram" do
      doc = xml { root { a1(:foo => :bar) } }
      node = doc.at_css("a1")
      doc_sig = Lorax::Signature.new(doc.root)
      node_sig = Lorax::Signature.new(node)
      doc_sig.monogram(node).should == node_sig.monogram
    end
  end

  context "passed a non-Node" do
    it "raises an error" do
      # NOTE(review): the expected message names `signature`, not `monogram`;
      # presumably monogram shares signature's argument check -- confirm
      # against lib/lorax/signature.rb.
      proc { Lorax::Signature.new.monogram(42) }.should raise_error(ArgumentError, /signature expects a Node/)
    end
  end

  context "text nodes" do
    it "returns the signature as the monogram" do
      doc = xml { root { text "hello" } }
      node = doc.root.children.first
      sig = Lorax::Signature.new(doc.root)
      sig.monogram(node).should == sig.signature(node)
    end
  end

  context "element nodes" do
    it "is equal for nodes with equal names and attributes" do
      # Same name and attributes, but three different kinds of content.
      doc = xml { root {
        a1(:foo => :bar, :bazz => :quux) { text "hello" }
        a1(:foo => :bar, :bazz => :quux) { b1 }
        a1(:foo => :bar, :bazz => :quux)
      } }
      nodes = doc.css("a1")
      sig = Lorax::Signature.new(doc.root)
      sig.monogram(nodes[0]).should == sig.monogram(nodes[1])
      sig.monogram(nodes[1]).should == sig.monogram(nodes[2])
    end

    it "is inequal for nodes with different attributes" do
      doc = xml { root {
        a1(:foo => :bar)
        a1(:foo => :bar, :bazz => :quux)
      } }
      nodes = doc.css("a1")
      sig = Lorax::Signature.new(doc.root)
      sig.monogram(nodes[0]).should_not == sig.monogram(nodes[1])
    end
  end
end
|
473
|
+
end
|