lorax 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/README.rdoc +2 -0
- data/Rakefile +1 -0
- data/TODO +1 -6
- data/lib/lorax.rb +1 -1
- data/lib/lorax/signature.rb +25 -12
- data/spec/unit/signature_spec.rb +41 -5
- metadata +47 -44
- metadata.gz.sig +0 -0
data.tar.gz.sig
CHANGED
Binary file
|
data/README.rdoc
CHANGED
@@ -15,6 +15,8 @@ generates deltas in less than O(n * log n) time, accepting some
|
|
15
15
|
tradeoffs in the size of the delta set. You can find his paper at
|
16
16
|
http://gregory.cobena.free.fr/www/Publications/thesis.html.
|
17
17
|
|
18
|
+
"I am the Lorax, I speak for the trees."
|
19
|
+
|
18
20
|
== Features / Problems
|
19
21
|
|
20
22
|
* Detect differences between documents, or tell whether two documents are the same.
|
data/Rakefile
CHANGED
data/TODO
CHANGED
@@ -1,13 +1,8 @@
|
|
1
1
|
# -*-org-*-
|
2
|
-
|
2
|
+
Lorax TODO
|
3
3
|
|
4
|
-
* gem
|
5
|
-
*** gemspec
|
6
|
-
*** license
|
7
|
-
*** gemcutter
|
8
4
|
* docs
|
9
5
|
*** rdocs
|
10
|
-
*** readme
|
11
6
|
*** class description notes
|
12
7
|
- Signature: calculate and persist signatures and weights for nodes in a single document
|
13
8
|
- Match: represents a match between two nodes
|
data/lib/lorax.rb
CHANGED
data/lib/lorax/signature.rb
CHANGED
@@ -19,7 +19,7 @@ module Lorax
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def nodes(sig=nil)
|
22
|
-
sig ? @nodes[sig] : @node
|
22
|
+
sig ? @nodes[sig] : [@node]
|
23
23
|
end
|
24
24
|
|
25
25
|
def size
|
@@ -30,12 +30,19 @@ module Lorax
|
|
30
30
|
return @signatures[node] if @signatures.key?(node)
|
31
31
|
raise ArgumentError, "signature expects a Node, but received #{node.inspect}" unless node.is_a?(Nokogiri::XML::Node)
|
32
32
|
|
33
|
-
if node.text?
|
33
|
+
if node.text?
|
34
|
+
content = node.content.strip
|
35
|
+
if content.empty?
|
36
|
+
return nil
|
37
|
+
else
|
38
|
+
monogram = signature = hashify(content)
|
39
|
+
end
|
40
|
+
elsif node.cdata? || node.comment?
|
34
41
|
monogram = signature = hashify(node.content)
|
35
42
|
elsif node.type == Nokogiri::XML::Node::ENTITY_REF_NODE
|
36
43
|
monogram = signature = hashify(node.to_html)
|
37
44
|
elsif node.element?
|
38
|
-
children_sig = hashify(node.children .collect { |child| signature(child) })
|
45
|
+
children_sig = hashify(node.children .collect { |child| signature(child) }.compact)
|
39
46
|
attr_sig = hashify(node.attributes.sort.collect { |k,v| [k, v.value] }.flatten)
|
40
47
|
monogram = hashify(node.name, attr_sig)
|
41
48
|
signature = hashify(node.name, attr_sig, children_sig)
|
@@ -55,16 +62,22 @@ module Lorax
|
|
55
62
|
return @weights[node] if @weights.key?(node)
|
56
63
|
raise ArgumentError, "weight expects a Node, but received #{node.inspect}" unless node.is_a?(Nokogiri::XML::Node)
|
57
64
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
node.children.inject(1) { |sum, child| sum += weight(child) }
|
65
|
-
else
|
66
|
-
raise ArgumentError, "weight expects an element, text, cdata or comment node, but received #{node.class}"
|
65
|
+
if node.text?
|
66
|
+
content = node.content.strip
|
67
|
+
if content.empty?
|
68
|
+
calculated_weight = 0
|
69
|
+
else
|
70
|
+
calculated_weight = 1 + Math.log(content.length)
|
67
71
|
end
|
72
|
+
elsif node.cdata? || node.comment?
|
73
|
+
calculated_weight = 1 + Math.log(node.content.length)
|
74
|
+
elsif node.type == Nokogiri::XML::Node::ENTITY_REF_NODE
|
75
|
+
calculated_weight = 1
|
76
|
+
elsif node.element?
|
77
|
+
calculated_weight = node.children.inject(1) { |sum, child| sum += weight(child) }
|
78
|
+
else
|
79
|
+
raise ArgumentError, "weight expects an element, text, cdata or comment node, but received #{node.class}"
|
80
|
+
end
|
68
81
|
|
69
82
|
@weights[node] = calculated_weight
|
70
83
|
end
|
data/spec/unit/signature_spec.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
2
|
|
3
3
|
describe Lorax::Signature do
|
4
|
+
WHITESPACES = ["\n"," ","\t","\r","\f"]
|
5
|
+
|
4
6
|
def assert_node_signature_equal(node1, node2)
|
5
7
|
Lorax::Signature.new(node1).signature.should == Lorax::Signature.new(node2).signature
|
6
8
|
end
|
@@ -46,6 +48,18 @@ describe Lorax::Signature do
|
|
46
48
|
node_sig = Lorax::Signature.new(nodes.first)
|
47
49
|
doc_sig.nodes(node_sig.signature).should =~ nodes.to_a
|
48
50
|
end
|
51
|
+
|
52
|
+
it "returns the node if I pass nil" do
|
53
|
+
doc = xml { root {
|
54
|
+
a1 "hello1"
|
55
|
+
a1 "hello2"
|
56
|
+
a1 "hello3"
|
57
|
+
} }
|
58
|
+
nodes = doc.css("a1")
|
59
|
+
doc_sig = Lorax::Signature.new(doc.root)
|
60
|
+
node_sig = Lorax::Signature.new(nodes.first)
|
61
|
+
doc_sig.nodes(nil).should == [doc.root]
|
62
|
+
end
|
49
63
|
end
|
50
64
|
|
51
65
|
describe "#size" do
|
@@ -166,24 +180,46 @@ describe Lorax::Signature do
|
|
166
180
|
sig.signature(node)
|
167
181
|
end
|
168
182
|
|
169
|
-
context "
|
170
|
-
it "
|
183
|
+
context "passed a text Node" do
|
184
|
+
it "returns equal signatures for identical text nodes" do
|
171
185
|
doc = xml { root {
|
172
186
|
span "hello"
|
173
187
|
span "hello"
|
174
188
|
} }
|
175
189
|
assert_node_signature_equal(*doc.css("span").collect { |n| n.children.first })
|
176
190
|
end
|
177
|
-
end
|
178
191
|
|
179
|
-
|
180
|
-
it "have inequal signatures" do
|
192
|
+
it "returns inequal signatures for different text nodes" do
|
181
193
|
doc = xml { root {
|
182
194
|
span "hello"
|
183
195
|
span "goodbye"
|
184
196
|
} }
|
185
197
|
assert_node_signature_not_equal(*doc.css("span").collect { |n| n.children.first })
|
186
198
|
end
|
199
|
+
|
200
|
+
it "ignores leading whitespace" do
|
201
|
+
doc = xml { root {
|
202
|
+
span "hello"
|
203
|
+
span "#{WHITESPACES.join}hello"
|
204
|
+
} }
|
205
|
+
assert_node_signature_equal(*doc.css("span").collect { |n| n.children.first })
|
206
|
+
end
|
207
|
+
|
208
|
+
it "ignores trailing whitespace" do
|
209
|
+
doc = xml { root {
|
210
|
+
span "hello"
|
211
|
+
span "hello#{WHITESPACES.join}"
|
212
|
+
} }
|
213
|
+
assert_node_signature_equal(*doc.css("span").collect { |n| n.children.first })
|
214
|
+
end
|
215
|
+
|
216
|
+
it "treats empty text nodes the same as no text node" do
|
217
|
+
doc = xml { root {
|
218
|
+
span WHITESPACES.join
|
219
|
+
span
|
220
|
+
} }
|
221
|
+
assert_node_signature_equal(*doc.css("span"))
|
222
|
+
end
|
187
223
|
end
|
188
224
|
|
189
225
|
context "elements with same name (with no attributes and no content)" do
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lorax
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
|
-
- 1
|
8
|
+
- 2
|
8
9
|
- 0
|
9
|
-
version: 0.1.0
|
10
|
+
version: 0.2.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Mike Dalessio
|
@@ -17,34 +18,36 @@ cert_chain:
|
|
17
18
|
-----BEGIN CERTIFICATE-----
|
18
19
|
MIIDPDCCAiSgAwIBAgIBADANBgkqhkiG9w0BAQUFADBEMRYwFAYDVQQDDA1taWtl
|
19
20
|
LmRhbGVzc2lvMRUwEwYKCZImiZPyLGQBGRYFZ21haWwxEzARBgoJkiaJk/IsZAEZ
|
20
|
-
|
21
|
+
FgNjb20wHhcNMTAwOTMwMDYyNjQ3WhcNMTEwOTMwMDYyNjQ3WjBEMRYwFAYDVQQD
|
21
22
|
DA1taWtlLmRhbGVzc2lvMRUwEwYKCZImiZPyLGQBGRYFZ21haWwxEzARBgoJkiaJ
|
22
|
-
k/
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
23
|
+
k/IsZAEZFgNjb20wggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDLv4nl
|
24
|
+
BGRtliYy5s5MhlFO88UvkkETFcS79OCaGFKorxPTmcfDrR2/2x0mAySXJ6I1uPEU
|
25
|
+
WSAWaPb1at61NEOvp5kRNzUNdwGakBA/fd1vZ1N2rwHRtjk/8t6DX8yiflr6T761
|
26
|
+
9ZMYPE+t85NvlPt0/WpT778imNZXwGQNcQJwNESDiBTgyjN8bOWpvRrVADVdOCme
|
27
|
+
DW3AfJnF/kdMYuSiUuFMZpyOlULEbOsrvOfUoEKjoFaVNv7FJ28/kLH1UgmtucOD
|
28
|
+
m5bZ/qy5b2+CWzzsmUfysaGnLQ4LjvAFpmgZGAjIE9TnyjU0jw+2e7dq8uRjdnFJ
|
29
|
+
gfWQlnJuwAlZXR1nAgMBAAGjOTA3MAkGA1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0G
|
30
|
+
A1UdDgQWBBRbc4XnK6it228clp2DjyqaVjKW+DANBgkqhkiG9w0BAQUFAAOCAQEA
|
31
|
+
xPtSPtMl9qsgNGcnSDLSTjwGouwsjOB19IbtdODFTabUpRPCk7OFHeYGdJik4iiZ
|
32
|
+
fk10t3vzr6uWMAyOfwpWWFRnEYAvy9ZaMGDIZPKD8xWxaRTLwmi+pQsS8Lo2IpDC
|
33
|
+
Lb+l0lUiRiYS3/Ez7tA6pS122cvuQroWfuqh5Mi3pNAi1nuBTlhCNJuR5XUaOjqs
|
34
|
+
DAoZLfYEEW+4bmkAb6ky2TPUslaln56PO3/JG+IfWZwCvTFFVdKRBKXqLaAxO9rv
|
35
|
+
7nflCv7xpUSUGGZ6hoPG8dil+Mp/kKV8cb1kxZz+C8660hC93dJ3FQ3adX30ylvZ
|
36
|
+
C4THW+6HEQDCdOkiArif8A==
|
36
37
|
-----END CERTIFICATE-----
|
37
38
|
|
38
|
-
date: 2010-
|
39
|
+
date: 2010-10-14 00:00:00 -04:00
|
39
40
|
default_executable:
|
40
41
|
dependencies:
|
41
42
|
- !ruby/object:Gem::Dependency
|
42
43
|
name: nokogiri
|
43
44
|
prerelease: false
|
44
45
|
requirement: &id001 !ruby/object:Gem::Requirement
|
46
|
+
none: false
|
45
47
|
requirements:
|
46
48
|
- - ">="
|
47
49
|
- !ruby/object:Gem::Version
|
50
|
+
hash: 7
|
48
51
|
segments:
|
49
52
|
- 1
|
50
53
|
- 4
|
@@ -56,72 +59,66 @@ dependencies:
|
|
56
59
|
name: rubyforge
|
57
60
|
prerelease: false
|
58
61
|
requirement: &id002 !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
59
63
|
requirements:
|
60
64
|
- - ">="
|
61
65
|
- !ruby/object:Gem::Version
|
66
|
+
hash: 7
|
62
67
|
segments:
|
63
68
|
- 2
|
64
69
|
- 0
|
65
|
-
-
|
66
|
-
version: 2.0.
|
70
|
+
- 4
|
71
|
+
version: 2.0.4
|
67
72
|
type: :development
|
68
73
|
version_requirements: *id002
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: gemcutter
|
71
|
-
prerelease: false
|
72
|
-
requirement: &id003 !ruby/object:Gem::Requirement
|
73
|
-
requirements:
|
74
|
-
- - ">="
|
75
|
-
- !ruby/object:Gem::Version
|
76
|
-
segments:
|
77
|
-
- 0
|
78
|
-
- 3
|
79
|
-
- 0
|
80
|
-
version: 0.3.0
|
81
|
-
type: :development
|
82
|
-
version_requirements: *id003
|
83
74
|
- !ruby/object:Gem::Dependency
|
84
75
|
name: rspec
|
85
76
|
prerelease: false
|
86
|
-
requirement: &id004 !ruby/object:Gem::Requirement
|
77
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
87
79
|
requirements:
|
88
80
|
- - ">="
|
89
81
|
- !ruby/object:Gem::Version
|
82
|
+
hash: 13
|
90
83
|
segments:
|
91
84
|
- 1
|
92
85
|
- 2
|
93
86
|
- 9
|
94
87
|
version: 1.2.9
|
95
88
|
type: :development
|
96
|
-
version_requirements: *id004
|
89
|
+
version_requirements: *id003
|
97
90
|
- !ruby/object:Gem::Dependency
|
98
91
|
name: rr
|
99
92
|
prerelease: false
|
100
|
-
requirement: &id005 !ruby/object:Gem::Requirement
|
93
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
101
95
|
requirements:
|
102
96
|
- - ">="
|
103
97
|
- !ruby/object:Gem::Version
|
98
|
+
hash: 63
|
104
99
|
segments:
|
105
100
|
- 0
|
106
101
|
- 10
|
107
102
|
- 4
|
108
103
|
version: 0.10.4
|
109
104
|
type: :development
|
110
|
-
version_requirements: *id005
|
105
|
+
version_requirements: *id004
|
111
106
|
- !ruby/object:Gem::Dependency
|
112
107
|
name: hoe
|
113
108
|
prerelease: false
|
114
|
-
requirement: &id006 !ruby/object:Gem::Requirement
|
109
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
110
|
+
none: false
|
115
111
|
requirements:
|
116
112
|
- - ">="
|
117
113
|
- !ruby/object:Gem::Version
|
114
|
+
hash: 21
|
118
115
|
segments:
|
119
116
|
- 2
|
120
|
-
-
|
121
|
-
-
|
122
|
-
version: 2.
|
117
|
+
- 6
|
118
|
+
- 1
|
119
|
+
version: 2.6.1
|
123
120
|
type: :development
|
124
|
-
version_requirements: *id006
|
121
|
+
version_requirements: *id005
|
125
122
|
description: |-
|
126
123
|
The Lorax is a full diff and patch library for XML/HTML documents, based on Nokogiri.
|
127
124
|
|
@@ -133,6 +130,8 @@ description: |-
|
|
133
130
|
generates deltas in less than O(n * log n) time, accepting some
|
134
131
|
tradeoffs in the size of the delta set. You can find his paper at
|
135
132
|
http://gregory.cobena.free.fr/www/Publications/thesis.html.
|
133
|
+
|
134
|
+
"I am the Lorax, I speak for the trees."
|
136
135
|
email:
|
137
136
|
- mike.dalessio@gmail.com
|
138
137
|
executables:
|
@@ -192,23 +191,27 @@ rdoc_options:
|
|
192
191
|
require_paths:
|
193
192
|
- lib
|
194
193
|
required_ruby_version: !ruby/object:Gem::Requirement
|
194
|
+
none: false
|
195
195
|
requirements:
|
196
196
|
- - ">="
|
197
197
|
- !ruby/object:Gem::Version
|
198
|
+
hash: 3
|
198
199
|
segments:
|
199
200
|
- 0
|
200
201
|
version: "0"
|
201
202
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
203
|
+
none: false
|
202
204
|
requirements:
|
203
205
|
- - ">="
|
204
206
|
- !ruby/object:Gem::Version
|
207
|
+
hash: 3
|
205
208
|
segments:
|
206
209
|
- 0
|
207
210
|
version: "0"
|
208
211
|
requirements: []
|
209
212
|
|
210
213
|
rubyforge_project: lorax
|
211
|
-
rubygems_version: 1.3.
|
214
|
+
rubygems_version: 1.3.7
|
212
215
|
signing_key:
|
213
216
|
specification_version: 3
|
214
217
|
summary: The Lorax is a full diff and patch library for XML/HTML documents, based on Nokogiri
|
metadata.gz.sig
CHANGED
Binary file
|