saxxy 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +5 -0
- data/Gemfile +13 -0
- data/LICENSE +22 -0
- data/README.md +117 -0
- data/Rakefile +12 -0
- data/lib/saxxy.rb +2 -0
- data/lib/saxxy/activatable.rb +160 -0
- data/lib/saxxy/callbacks/libxml.rb +26 -0
- data/lib/saxxy/callbacks/nokogiri.rb +30 -0
- data/lib/saxxy/callbacks/ox.rb +66 -0
- data/lib/saxxy/callbacks/sax.rb +86 -0
- data/lib/saxxy/context.rb +88 -0
- data/lib/saxxy/context_tree.rb +85 -0
- data/lib/saxxy/event.rb +83 -0
- data/lib/saxxy/event_registry.rb +122 -0
- data/lib/saxxy/node_action.rb +59 -0
- data/lib/saxxy/node_rule.rb +90 -0
- data/lib/saxxy/parsers/base.rb +28 -0
- data/lib/saxxy/parsers/libxml.rb +52 -0
- data/lib/saxxy/parsers/nokogiri.rb +28 -0
- data/lib/saxxy/parsers/ox.rb +30 -0
- data/lib/saxxy/service.rb +47 -0
- data/lib/saxxy/utils/agent.rb +66 -0
- data/lib/saxxy/utils/callback_array.rb +27 -0
- data/lib/saxxy/utils/helpers.rb +13 -0
- data/lib/saxxy/version.rb +3 -0
- data/saxxy.gemspec +21 -0
- data/spec/saxxy/activatable_spec.rb +344 -0
- data/spec/saxxy/callbacks/sax_spec.rb +456 -0
- data/spec/saxxy/context_spec.rb +51 -0
- data/spec/saxxy/context_tree_spec.rb +68 -0
- data/spec/saxxy/event_registry_spec.rb +137 -0
- data/spec/saxxy/event_spec.rb +49 -0
- data/spec/saxxy/node_action_spec.rb +46 -0
- data/spec/saxxy/node_rule_spec.rb +99 -0
- data/spec/saxxy/parsers/libxml_spec.rb +104 -0
- data/spec/saxxy/parsers/nokogiri_spec.rb +200 -0
- data/spec/saxxy/parsers/ox_spec.rb +175 -0
- data/spec/saxxy/utils/agent_spec.rb +63 -0
- data/spec/spec_helper.rb +28 -0
- data/spec/support/agent_macros.rb +24 -0
- metadata +155 -0
@@ -0,0 +1,104 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "saxxy/utils/agent"
|
3
|
+
require "saxxy/context_tree"
|
4
|
+
|
5
|
+
|
6
|
+
# Specs for Saxxy::Parsers::Libxml: constructor state plus integration
# examples that count callbacks fired while parsing small HTML snippets.
# The parser is named with a string and the group is tagged :not_jruby.
describe "Saxxy::Parsers::Libxml", :not_jruby do

  # Builds a Libxml parser, forwarding every argument to the constructor.
  def parser(*args)
    Saxxy::Parsers::Libxml.new(*args)
  end


  describe "#initialize" do
    let(:tree) { Saxxy::ContextTree.new {} }

    # Declared with `subject` instead of `let(:subject)` so RSpec's own
    # subject helper is used rather than shadowed by a `let` definition.
    subject { parser(tree, {foo: :bar}) }

    it "should set the options" do
      subject.options.should == {foo: :bar}
    end

    it "should set the context tree" do
      subject.context_tree.should == tree
    end
  end


  context "integration" do
    # Well-formed markup: every opened tag is closed.
    let(:valid) do
      "<html><div><span class='fo'></span></div><div class='f'></div></html>"
    end

    # The first <div> has no closing tag.
    let(:not_closed) do
      "<html><div><span class='fo'></span><div class='f'></div></html>"
    end

    # Contains a closing </span> with no matching opening tag.
    let(:not_opened) do
      "<html><div></span></div><div></div></html>"
    end

    # Context tree whose callbacks increment the @counts tallies.
    # NOTE(review): `class: nil` presumably selects divs without a class
    # attribute -- confirm against the node rule implementation.
    let(:tree) do
      Saxxy::ContextTree.new do
        on("div", class: nil) do |text, elem, attrs|
          @counts[:div] += 1
        end
        under("div") do
          on("span", class: /foo?/) do |text, elem, attrs|
            @counts[:span] += 1
          end
        end
      end
    end

    # Parses +string+ with a fresh parser built around +tree+.
    def parse(string, tree)
      parser(tree).parse_string(string)
    end


    # Reset the tallies before every example.
    before { @counts = { div: 0, span: 0 } }

    # Every expectation below is wrapped in `pending`; the message records
    # the Libxml limitation that keeps it from passing.
    describe "valid html" do
      it "should change the div count" do
        pending("Libxml generates double callbacks") do
          expect { parse(valid, tree) }.to change { @counts[:div] }.from(0).to(1)
        end
      end

      it "should change the span count" do
        pending("Libxml generates double callbacks") do
          expect { parse(valid, tree) }.to change { @counts[:span] }.from(0).to(1)
        end
      end
    end

    describe "not closed div" do
      it "should change the div count" do
        pending("Libxml does not handle malformed html") do
          expect { parse(not_closed, tree) }.to change { @counts[:div] }.from(0).to(1)
        end
      end

      it "should change the span count" do
        pending("Libxml does not handle malformed html") do
          expect { parse(not_closed, tree) }.to change { @counts[:span] }.from(0).to(1)
        end
      end
    end

    describe "not opened span" do
      it "should change the div count" do
        pending("Libxml does not handle malformed html") do
          expect { parse(not_opened, tree) }.to change { @counts[:div] }.from(0).to(2)
        end
      end

      it "should not change the span count" do
        pending("Libxml does not handle malformed html") do
          expect { parse(not_opened, tree) }.to_not change { @counts[:span] }
        end
      end
    end

  end

end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "saxxy/utils/agent"
|
3
|
+
require "saxxy/context_tree"
|
4
|
+
|
5
|
+
|
6
|
+
# Specs for Saxxy::Parsers::Nokogiri: constructor state, delegation of the
# parse_* entry points to the underlying parser, and integration examples
# covering callback counts and text aggregation.
describe Saxxy::Parsers::Nokogiri do

  # Builds a Nokogiri parser, forwarding every argument to the constructor.
  def parser(*args)
    Saxxy::Parsers::Nokogiri.new(*args)
  end


  describe "#initialize" do
    let(:tree) { Saxxy::ContextTree.new {} }

    # Declared with `subject` instead of `let(:subject)` so RSpec's own
    # subject helper is used rather than shadowed by a `let` definition.
    subject { parser(tree, {foo: :bar}) }

    it "should set the options" do
      subject.options.should == {foo: :bar}
    end

    it "should set the context tree" do
      subject.context_tree.should == tree
    end
  end


  describe "#parse_io" do
    let(:tree) { Saxxy::ContextTree.new {} }
    subject { parser(tree) }

    it "should delegate the call to the underlying parser" do
      obj = Object.new
      # Use a real, block-scoped IO on this spec file instead of wrapping
      # file descriptor 0, so the example does not depend on the process
      # having an open stdin and the handle is closed when the block exits.
      File.open(__FILE__) do |io|
        subject.stub(new_parser: obj)
        obj.should_receive(:parse_io).with(io, 'UTF-8')
        subject.parse_io(io)
      end
    end
  end


  describe "#parse_file" do
    let(:tree) { Saxxy::ContextTree.new {} }
    subject { parser(tree) }

    it "should delegate the call to the underlying parser" do
      obj = Object.new
      # Pass the path of an existing file (this spec) rather than the path
      # of a File built from descriptor 0, which names no file on disk.
      path = __FILE__
      subject.stub(new_parser: obj)
      obj.should_receive(:parse_file).with(path, 'UTF-8')
      subject.parse_file(path)
    end
  end


  describe "#parse_string" do
    let(:tree) { Saxxy::ContextTree.new {} }
    subject { parser(tree) }

    it "should delegate the call to the underlying parsers' parse_memory" do
      obj = Object.new
      string = ""
      subject.stub(new_parser: obj)
      obj.should_receive(:parse_memory).with(string, 'UTF-8')
      subject.parse_string(string)
    end
  end



  context "node count" do
    # Well-formed markup: every opened tag is closed.
    let(:valid) do
      "<html><div><span class='fo'></span></div><div class='f'></div></html>"
    end

    # The first <div> has no closing tag.
    let(:not_closed) do
      "<html><div><span class='fo'></span><div class='f'></div></html>"
    end

    # Contains a closing </span> with no matching opening tag.
    let(:not_opened) do
      "<html><div></span></div><div></div></html>"
    end

    # Context tree whose callbacks increment the @counts tallies.
    # NOTE(review): `class: nil` presumably selects divs without a class
    # attribute -- confirm against the node rule implementation.
    let(:tree) do
      Saxxy::ContextTree.new do
        on("div", class: nil) do |text, elem, attrs|
          @counts[:div] += 1
        end
        under("div") do
          on("span", class: /foo?/) do |text, elem, attrs|
            @counts[:span] += 1
          end
        end
      end
    end

    # Parses +string+ with a fresh parser built around +tree+.
    def parse(string, tree)
      parser(tree).parse_string(string)
    end


    # Reset the tallies before every example.
    before { @counts = { div: 0, span: 0 } }

    describe "valid html" do
      it "should change the div count" do
        expect { parse(valid, tree) }.to change { @counts[:div] }.from(0).to(1)
      end

      it "should change the span count" do
        expect { parse(valid, tree) }.to change { @counts[:span] }.from(0).to(1)
      end
    end

    describe "not closed div" do
      it "should change the div count" do
        expect { parse(not_closed, tree) }.to change { @counts[:div] }.from(0).to(1)
      end

      it "should change the span count" do
        expect { parse(not_closed, tree) }.to change { @counts[:span] }.from(0).to(1)
      end
    end

    describe "not opened span" do
      it "should change the div count" do
        expect { parse(not_opened, tree) }.to change { @counts[:div] }.from(0).to(2)
      end

      it "should not change the span count" do
        expect { parse(not_opened, tree) }.to_not change { @counts[:span] }
      end
    end
  end



  context "text aggregation" do
    # Same shapes as the "node count" fixtures, with digit text nodes added
    # so the examples can assert which text reaches each callback.
    let(:valid) do
      "<html>0<div>1<span class='fo'>2</span>3</div><div class='f'>4</div></html>"
    end

    let(:not_closed) do
      "<html>0<div>1<span class='fo'>2</span>3<div>4</div></html>"
    end

    let(:not_opened) do
      "<html>0<div>1</span>23</div><div>4</div></html>"
    end

    # Context tree whose callbacks append the received text to @texts,
    # starting from an empty string when the slot is still nil.
    let(:tree) do
      Saxxy::ContextTree.new do
        on("div", class: nil) do |text, elem, attrs|
          @texts[:div] = (@texts[:div] || "") + text
        end
        under("div") do
          on("span", class: /foo?/) do |text, elem, attrs|
            @texts[:span] = (@texts[:span] || "") + text
          end
        end
      end
    end

    # Parses +string+ with a fresh parser built around +tree+.
    def parse(string, tree)
      parser(tree).parse_string(string)
    end


    # Start every example with no collected text.
    before { @texts = { div: nil, span: nil } }

    describe "valid html" do
      it "should change the div text" do
        expect { parse(valid, tree) }.to change { @texts[:div] }.to("123")
      end

      it "should change the span text" do
        expect { parse(valid, tree) }.to change { @texts[:span] }.to("2")
      end
    end

    describe "not closed div" do
      it "should change the div text" do
        expect { parse(not_closed, tree) }.to change { @texts[:div] }.to("41234")
      end

      it "should change the span text" do
        expect { parse(not_closed, tree) }.to change { @texts[:span] }.to("2")
      end
    end

    describe "not opened span" do
      it "should change the div text" do
        expect { parse(not_opened, tree) }.to change { @texts[:div] }.to("1234")
      end

      it "should not change the span text" do
        expect { parse(not_opened, tree) }.to_not change { @texts[:span] }
      end
    end
  end

end
|
@@ -0,0 +1,175 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
require "saxxy/utils/agent"
|
3
|
+
require "saxxy/context_tree"
|
4
|
+
|
5
|
+
|
6
|
+
# We have :not_jruby here because travis-ci does not
|
7
|
+
# support C extensions for jruby.
|
8
|
+
# Specs for Saxxy::Parsers::Ox: constructor state, delegation of the parse_*
# entry points to ::Ox.sax_parse, and integration examples covering callback
# counts and text aggregation.
describe "Saxxy::Parsers::Ox", :not_jruby do

  # Builds an Ox parser, forwarding every argument to the constructor.
  def parser(*args)
    Saxxy::Parsers::Ox.new(*args)
  end


  describe "#initialize" do
    let(:tree) { Saxxy::ContextTree.new {} }

    # Declared with `subject` instead of `let(:subject)` so RSpec's own
    # subject helper is used rather than shadowed by a `let` definition.
    subject { parser(tree, {foo: :bar}) }

    it "should set the options" do
      subject.options.should == {foo: :bar}
    end

    it "should set the context tree" do
      subject.context_tree.should == tree
    end
  end


  describe "#parse_*" do
    let(:tree) { Saxxy::ContextTree.new {} }
    subject { parser(tree) }

    # Both entry points are expected to end up calling ::Ox.sax_parse.
    it "#parse_io should delegate the call to parse" do
      ::Ox.should_receive(:sax_parse)
      subject.parse_io(StringIO.new(""))
    end

    it "#parse_string should delegate the call to parse" do
      ::Ox.should_receive(:sax_parse)
      subject.parse_string("")
    end
  end



  context "node count" do
    # Well-formed markup: every opened tag is closed.
    let(:valid) do
      "<html><div><span class='fo'></span></div><div class='f'></div></html>"
    end

    # The first <div> has no closing tag.
    let(:not_closed) do
      "<html><div><span class='fo'></span><div class='f'></div></html>"
    end

    # Contains a closing </span> with no matching opening tag.
    let(:not_opened) do
      "<html><div></span></div><div></div></html>"
    end

    # Context tree whose callbacks increment the @counts tallies.
    # NOTE(review): `class: nil` presumably selects divs without a class
    # attribute -- confirm against the node rule implementation.
    let(:tree) do
      Saxxy::ContextTree.new do
        on("div", class: nil) do |text, elem, attrs|
          @counts[:div] += 1
        end
        under("div") do
          on("span", class: /foo?/) do |text, elem, attrs|
            @counts[:span] += 1
          end
        end
      end
    end

    # Parses +string+ with a fresh parser built around +tree+.
    def parse(string, tree)
      parser(tree).parse_string(string)
    end


    # Reset the tallies before every example.
    before { @counts = { div: 0, span: 0 } }

    describe "valid html" do
      it "should change the div count" do
        expect { parse(valid, tree) }.to change { @counts[:div] }.from(0).to(1)
      end

      it "should change the span count" do
        expect { parse(valid, tree) }.to change { @counts[:span] }.from(0).to(1)
      end
    end

    describe "not closed div" do
      it "should change the div count" do
        expect { parse(not_closed, tree) }.to change { @counts[:div] }.from(0).to(1)
      end

      it "should change the span count" do
        expect { parse(not_closed, tree) }.to change { @counts[:span] }.from(0).to(1)
      end
    end

    describe "not opened span" do
      it "should change the div count" do
        expect { parse(not_opened, tree) }.to change { @counts[:div] }.from(0).to(2)
      end

      it "should not change the span count" do
        expect { parse(not_opened, tree) }.to_not change { @counts[:span] }
      end
    end
  end


  context "text aggregation" do
    # Same shapes as the "node count" fixtures, with digit text nodes added
    # so the examples can assert which text reaches each callback.
    let(:valid) do
      "<html>0<div>1<span class='fo'>2</span>3</div><div class='f'>4</div></html>"
    end

    let(:not_closed) do
      "<html>0<div>1<span class='fo'>2</span>3<div>4</div></html>"
    end

    let(:not_opened) do
      "<html>0<div>1</span>23</div><div>4</div></html>"
    end

    # Context tree whose callbacks append the received text to @texts,
    # starting from an empty string when the slot is still nil.
    let(:tree) do
      Saxxy::ContextTree.new do
        on("div", class: nil) do |text, elem, attrs|
          @texts[:div] = (@texts[:div] || "") + text
        end
        under("div") do
          on("span", class: /foo?/) do |text, elem, attrs|
            @texts[:span] = (@texts[:span] || "") + text
          end
        end
      end
    end

    # Parses +string+ with a fresh parser built around +tree+.
    def parse(string, tree)
      parser(tree).parse_string(string)
    end


    # Start every example with no collected text.
    before { @texts = { div: nil, span: nil } }

    describe "valid html" do
      it "should change the div text" do
        expect { parse(valid, tree) }.to change { @texts[:div] }.to("123")
      end

      it "should change the span text" do
        expect { parse(valid, tree) }.to change { @texts[:span] }.to("2")
      end
    end

    describe "not closed div" do
      it "should change the div text" do
        expect { parse(not_closed, tree) }.to change { @texts[:div] }.to("41234")
      end

      it "should change the span text" do
        expect { parse(not_closed, tree) }.to change { @texts[:span] }.to("2")
      end
    end

    describe "not opened span" do
      it "should change the div text" do
        expect { parse(not_opened, tree) }.to change { @texts[:div] }.to("1234")
      end

      it "should not change the span text" do
        expect { parse(not_opened, tree) }.to_not change { @texts[:span] }
      end
    end
  end

end
|