spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -0,0 +1,27 @@
1
+ require 'rspec'
2
+ require 'sinatra/base'
3
+ require 'webmock/rspec'
4
+
5
+ require 'spidr/agent'
6
+
7
+ RSpec.shared_context "example App" do
8
+ let(:host) { 'example.com' }
9
+
10
+ subject { Agent.new(host: host) }
11
+
12
+ def self.app(&block)
13
+ let(:app) do
14
+ klass = Class.new(Sinatra::Base)
15
+ klass.set :host, host
16
+ klass.set :port, 80
17
+ klass.class_eval(&block)
18
+ return klass
19
+ end
20
+
21
+ before do
22
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
23
+
24
+ subject.start_at("http://#{host}/")
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,33 @@
1
+ require 'rspec'
2
+
3
+ RSpec.shared_context "example Page" do
4
+ let(:code) { 200 }
5
+ let(:msg) { 'OK' }
6
+ let(:content_type) { 'text/html' }
7
+ let(:headers) { {} }
8
+ let(:body) { '' }
9
+
10
+ let(:response) do
11
+ Net::HTTPResponse.new('1.1', code.to_s, msg).tap do |response|
12
+ response.set_content_type(content_type) if content_type
13
+
14
+ headers.each do |name,values|
15
+ if values
16
+ Array(values).each do |value|
17
+ response.add_field(name,value)
18
+ end
19
+ else
20
+ response.remove_field(name)
21
+ end
22
+ end
23
+
24
+ # stub #body, otherwise Net::HTTP will check @socket
25
+ allow(response).to receive(:body).and_return(body)
26
+ end
27
+ end
28
+
29
+ let(:host) { 'example.com' }
30
+ let(:url) { URI::HTTP.build(host: host) }
31
+
32
+ subject { described_class.new(url,response) }
33
+ end
@@ -0,0 +1,150 @@
1
+ require 'spec_helper'
2
+ require 'example_page'
3
+
4
+ require 'spidr/page'
5
+
6
+ describe Page do
7
+ include_context "example Page"
8
+
9
+ describe "#content_type" do
10
+ it "should return the Content-Type as a String" do
11
+ expect(subject.content_type).to be == content_type
12
+ end
13
+
14
+ context "when content_type is missing" do
15
+ let(:content_type) { nil }
16
+
17
+ it "should return an empty String" do
18
+ expect(subject.content_type).to be == ''
19
+ end
20
+ end
21
+ end
22
+
23
+ describe "#content_types" do
24
+ it "should return the Content-Type as an Array" do
25
+ expect(subject.content_types).to be == [content_type]
26
+ end
27
+
28
+ context "when content_type is missing" do
29
+ let(:content_type) { nil }
30
+
31
+ it "should return an empty Array" do
32
+ expect(subject.content_types).to be == []
33
+ end
34
+ end
35
+ end
36
+
37
+ describe "#content_charset" do
38
+ let(:charset) { 'utf8' }
39
+ let(:content_type) { "text/html;charset=#{charset}" }
40
+
41
+ it "should extract the 'charset=' param" do
42
+ expect(subject.content_charset).to be == charset
43
+ end
44
+
45
+ context "when there is no 'charset='" do
46
+ let(:content_type) { 'text/html' }
47
+
48
+ it { expect(subject.content_charset).to be nil }
49
+ end
50
+ end
51
+
52
+ describe "#is_content_type?" do
53
+ let(:charset) { 'utf8' }
54
+ let(:sub_type) { 'html' }
55
+ let(:mime_type) { "text/#{sub_type}" }
56
+ let(:content_type) { "#{mime_type};charset=#{charset}" }
57
+
58
+ context "when given a full mime-type" do
59
+ context "and it matches the Content-Type's mime-type" do
60
+ it { expect(subject.is_content_type?(mime_type)).to be true }
61
+ end
62
+
63
+ context "but it doesn't match the Content-Type's mime-type" do
64
+ it { expect(subject.is_content_type?('text/plain')).to be false }
65
+ end
66
+ end
67
+
68
+ context "when given a sub-type" do
69
+ context "and it matches the Content-Type's sub-type" do
70
+ it { expect(subject.is_content_type?(sub_type)).to be true }
71
+ end
72
+
73
+ context "but it doesn't match the Content-Type's sub-type" do
74
+ it { expect(subject.is_content_type?('plain')).to be false }
75
+ end
76
+ end
77
+ end
78
+
79
+ shared_examples "Content-Type method" do |method,*content_types|
80
+ content_types.each do |content_type|
81
+ context "when Content-Type includes #{content_type}" do
82
+ let(:content_type) { content_type }
83
+
84
+ it { expect(subject.send(method)).to be true }
85
+ end
86
+ end
87
+
88
+ context "when Content-Type does not include #{content_types.join(', ')}" do
89
+ let(:content_type) { 'unknown/unknown' }
90
+
91
+ it { expect(subject.send(method)).to be false }
92
+ end
93
+ end
94
+
95
+ describe "#plain_text?" do
96
+ include_examples "Content-Type method", :plain_text?, 'text/plain'
97
+ end
98
+
99
+ describe "#directory?" do
100
+ include_examples "Content-Type method", :directory?, 'text/directory'
101
+ end
102
+
103
+ describe "#directory?" do
104
+ include_examples "Content-Type method", :html?, 'text/html'
105
+ end
106
+
107
+ describe "#html?" do
108
+ include_examples "Content-Type method", :html?, 'text/html'
109
+ end
110
+
111
+ describe "#xml?" do
112
+ include_examples "Content-Type method", :xml?, 'text/xml', 'application/xml'
113
+ end
114
+
115
+ describe "#xsl?" do
116
+ include_examples "Content-Type method", :xsl?, 'text/xsl'
117
+ end
118
+
119
+ describe "#javascript?" do
120
+ include_examples "Content-Type method", :javascript?, 'text/javascript', 'application/javascript'
121
+ end
122
+
123
+ describe "#json?" do
124
+ include_examples "Content-Type method", :json?, 'application/json'
125
+ end
126
+
127
+ describe "#css?" do
128
+ include_examples "Content-Type method", :css?, 'text/css'
129
+ end
130
+
131
+ describe "#rss?" do
132
+ include_examples "Content-Type method", :rss?, 'application/rss+xml', 'application/rdf+xml'
133
+ end
134
+
135
+ describe "#atom?" do
136
+ include_examples "Content-Type method", :atom?, 'application/atom+xml'
137
+ end
138
+
139
+ describe "#ms_word?" do
140
+ include_examples "Content-Type method", :ms_word?, 'application/msword'
141
+ end
142
+
143
+ describe "#pdf?" do
144
+ include_examples "Content-Type method", :pdf?, 'application/pdf'
145
+ end
146
+
147
+ describe "#zip?" do
148
+ include_examples "Content-Type method", :zip?, 'application/zip'
149
+ end
150
+ end
@@ -0,0 +1,58 @@
1
+ require 'spec_helper'
2
+ require 'example_page'
3
+
4
+ require 'spidr/page'
5
+
6
+ describe Page do
7
+ include_context "example Page"
8
+
9
+ let(:name) { 'foo' }
10
+ let(:value) { 'bar' }
11
+ let(:path) { '/' }
12
+ let(:cookie) { "#{name}=#{value}; Path=#{path}; Domain=#{host}; Secure; HTTPOnly" }
13
+ let(:headers) do
14
+ {'Set-Cookie' => cookie}
15
+ end
16
+
17
+ describe "#cookie" do
18
+ it "should return the Set-Cookie header as a String" do
19
+ expect(subject.cookie).to be == cookie
20
+ end
21
+
22
+ context "when Set-Cookie is not set" do
23
+ let(:headers) { {} }
24
+
25
+ it "should return an empty String" do
26
+ expect(subject.cookie).to be == ''
27
+ end
28
+ end
29
+ end
30
+
31
+ describe "#cookies" do
32
+ it "should return the Set-Cookie header as an Array" do
33
+ expect(subject.cookies).to be == [cookie]
34
+ end
35
+
36
+ context "when Set-Cookie is not set" do
37
+ let(:headers) { {} }
38
+
39
+ it "should return an empty Array" do
40
+ expect(subject.cookies).to be == []
41
+ end
42
+ end
43
+ end
44
+
45
+ describe "#cookie_params" do
46
+ it "should parse the cookie params into a Hash" do
47
+ expect(subject.cookie_params).to be == {name => value}
48
+ end
49
+
50
+ context "when the cookie has no value" do
51
+ let(:value) { '' }
52
+
53
+ it "should default the value to an empty String" do
54
+ expect(subject.cookie_params[name]).to be == ''
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,524 @@
1
+ require 'spec_helper'
2
+ require 'example_page'
3
+
4
+ require 'spidr/page'
5
+
6
+ describe Page do
7
+ include_context "example Page"
8
+
9
+ let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
10
+
11
+ describe "#title" do
12
+ context "when there is a title" do
13
+ it "should return the title inner_text" do
14
+ expect(subject.title).to be == 'example'
15
+ end
16
+ end
17
+
18
+ context "when there is no title" do
19
+ let(:body) { %{<html><head></head><body><p>hello</p></body></html>} }
20
+
21
+ it "should return nil" do
22
+ expect(subject.title).to be nil
23
+ end
24
+ end
25
+ end
26
+
27
+ describe "#each_meta_redirect" do
28
+ context "when the Content-Type is text/html" do
29
+ let(:content_type) { 'text/html' }
30
+
31
+ context "and the HTML is valid" do
32
+ let(:link) { '/link' }
33
+ let(:refresh) { 'refresh' }
34
+ let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /></head><body>Redirecting...</body></html>} }
35
+
36
+ it "should yield each meta http-equiv='refresh' URL" do
37
+ expect { |b|
38
+ subject.each_meta_redirect(&b)
39
+ }.to yield_successive_args(link)
40
+ end
41
+
42
+ context "but when http-equiv is REFRESH" do
43
+ let(:refresh) { 'REFRESH' }
44
+
45
+ it "should ignore the case of refresh" do
46
+ expect { |b|
47
+ subject.each_meta_redirect(&b)
48
+ }.to yield_successive_args(link)
49
+ end
50
+ end
51
+
52
+ context "but the http-equiv attribute is missing" do
53
+ let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta content="4; url=#{link}2" /></head><body>Redirecting...</body></html>} }
54
+
55
+ it "should ignore those meta tags" do
56
+ expect { |b|
57
+ subject.each_meta_redirect(&b)
58
+ }.to yield_successive_args(link)
59
+ end
60
+ end
61
+
62
+ context "but http-equiv is not refresh" do
63
+ let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="content-type" content="#{content_type}" /></head><body></body></html>} }
64
+
65
+ it "should ignore those meta tags" do
66
+ expect { |b|
67
+ subject.each_meta_redirect(&b)
68
+ }.to yield_successive_args(link)
69
+ end
70
+ end
71
+
72
+ context "but the content attribute is missing" do
73
+ let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="#{refresh}" /></head><body>Redirecting...</body></html>} }
74
+
75
+ it "should ignore those meta tags" do
76
+ expect { |b|
77
+ subject.each_meta_redirect(&b)
78
+ }.to yield_successive_args(link)
79
+ end
80
+ end
81
+
82
+ context "but the content attribute does not contain url=..." do
83
+ let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="#{refresh}" content="0" /></head><body>Redirecting...</body></html>} }
84
+
85
+ it "should ignore those meta tags" do
86
+ expect { |b|
87
+ subject.each_meta_redirect(&b)
88
+ }.to yield_successive_args(link)
89
+ end
90
+ end
91
+ end
92
+
93
+ context "but the HTML cannot be parsed" do
94
+ let(:body) { "<html></" }
95
+
96
+ it "should yield nothing" do
97
+ expect { |b| subject(&b) }.not_to yield_control
98
+ end
99
+ end
100
+ end
101
+
102
+ context "when the Content-Type is not text/html" do
103
+ let(:content_type) { 'text/xml' }
104
+
105
+ it "should yield nothing" do
106
+ expect { |b| subject(&b) }.not_to yield_control
107
+ end
108
+ end
109
+
110
+ context "when not given a block" do
111
+ it "should return an Enumerator" do
112
+ expect(subject.each_meta_redirect).to be_kind_of(Enumerator)
113
+ end
114
+ end
115
+ end
116
+
117
+ describe "#meta_redirect?" do
118
+ context "when there are meta refresh redirects" do
119
+ let(:body) { %{<html><head><meta http-equiv="refresh" content="4; url=/link" /></head><body>Redirecting...</body></html>} }
120
+
121
+ it { expect(subject.meta_redirect?).to be true }
122
+ end
123
+
124
+ context "when there are no meta refresh redirects" do
125
+ let(:body) { %{<html><head><meta http-equiv="content-type" content="text/html" /></head><body>Redirecting...</body></html>} }
126
+
127
+ it { expect(subject.meta_redirect?).to be false }
128
+ end
129
+ end
130
+
131
+ describe "#meta_redirects" do
132
+ context "when there are meta refresh redirects" do
133
+ let(:link1) { "/link1" }
134
+ let(:link2) { "/link2" }
135
+ let(:body) { %{<html><head><meta http-equiv="refresh" content="4; url=#{link1}" /><meta http-equiv="refresh" content="1; url=#{link2}" /></head><body>Redirecting...</body></html>} }
136
+
137
+ it "should return each meta refresh redirect URL" do
138
+ expect(subject.meta_redirects).to be == [link1, link2]
139
+ end
140
+ end
141
+
142
+ context "when there are no meta refresh redirects" do
143
+ let(:body) { %{<html><head><meta http-equiv="content-type" content="text/html" /></head><body>Redirecting...</body></html>} }
144
+
145
+ it { expect(subject.meta_redirects).to be == [] }
146
+ end
147
+ end
148
+
149
+ describe "#each_redirect" do
150
+ context "when the Location header is set" do
151
+ let(:link) { "http://#{host}/link" }
152
+ let(:headers) { {'Location' => link} }
153
+
154
+ it "should yield the Location header" do
155
+ expect { |b|
156
+ subject.each_redirect(&b)
157
+ }.to yield_successive_args(link)
158
+ end
159
+ end
160
+
161
+ context "when there are multiple Location headers" do
162
+ let(:link1) { "http://#{host}/link1" }
163
+ let(:link2) { "http://#{host}/link2" }
164
+ let(:headers) { {'Location' => [link1, link2]} }
165
+
166
+ it "should yield each Location header value" do
167
+ expect { |b|
168
+ subject.each_redirect(&b)
169
+ }.to yield_successive_args(link1, link2)
170
+ end
171
+ end
172
+
173
+ context "when there is no Location header set" do
174
+ context "but there are meta refresh redirects" do
175
+ let(:link1) { "/link1" }
176
+ let(:link2) { "/link2" }
177
+ let(:body) { %{<html><head><meta http-equiv="refresh" content="4; url=#{link1}" /><meta http-equiv="refresh" content="1; url=#{link2}" /></head><body>Redirecting...</body></html>} }
178
+
179
+ it "should yield each meta refresh redirect URL" do
180
+ expect { |b|
181
+ subject.each_redirect(&b)
182
+ }.to yield_successive_args(link1, link2)
183
+ end
184
+ end
185
+
186
+ context "and there are no meta refresh redirects" do
187
+ it do
188
+ expect { |b|
189
+ subject.each_redirect(&b)
190
+ }.not_to yield_control
191
+ end
192
+ end
193
+ end
194
+
195
+ context "when not given a block" do
196
+ it "should return an Enumerator" do
197
+ expect(subject.each_redirect).to be_kind_of(Enumerator)
198
+ end
199
+ end
200
+ end
201
+
202
+ context "#redirects_to" do
203
+ context "when there are redirects" do
204
+ let(:link1) { "http://#{host}/link1" }
205
+ let(:link2) { "http://#{host}/link2" }
206
+ let(:headers) { {'Location' => [link1, link2]} }
207
+
208
+ it "should return the redirects as an Array" do
209
+ expect(subject.redirects_to).to be == [link1, link2]
210
+ end
211
+ end
212
+
213
+ context "when there are no redirects" do
214
+ it { expect(subject.redirects_to).to be == [] }
215
+ end
216
+ end
217
+
218
+ describe "#each_mailto" do
219
+ context "when the Content-Type is text/html" do
220
+ let(:content_type) { 'text/html' }
221
+
222
+ context "and the HTML is valid" do
223
+ let(:email1) { "bob@example.com" }
224
+ let(:email2) { "jim@example.com" }
225
+ let(:body) { %{<html><body><a href="mailto:#{email1}">email1</a> <a href="/link">link</a> <a href="mailto:#{email2}">email2</a></body></html>} }
226
+
227
+ it "should yield each a link where the href starts with 'mailto:'" do
228
+ expect { |b|
229
+ subject.each_mailto(&b)
230
+ }.to yield_successive_args(email1, email2)
231
+ end
232
+ end
233
+
234
+ context "but the HTML is not valid" do
235
+ let(:body) { "<html" }
236
+
237
+ it "should yield nothing" do
238
+ expect { |b|
239
+ subject.each_mailto(&b)
240
+ }.not_to yield_control
241
+ end
242
+ end
243
+ end
244
+
245
+ context "when the Content-Type is not text/html" do
246
+ let(:content_type) { 'text/plain' }
247
+
248
+ it "should yield nothing" do
249
+ expect { |b|
250
+ subject.each_mailto(&b)
251
+ }.not_to yield_control
252
+ end
253
+ end
254
+ end
255
+
256
+ describe "#mailtos" do
257
+ context "when there are 'mailto:' links" do
258
+ let(:email1) { "bob@example.com" }
259
+ let(:email2) { "jim@example.com" }
260
+ let(:body) { %{<html><body><a href="mailto:#{email1}">email1</a> <a href="/link">link</a> <a href="mailto:#{email2}">email2</a></body></html>} }
261
+
262
+ it "should return all 'mailto:' links" do
263
+ expect(subject.mailtos).to be == [email1, email2]
264
+ end
265
+ end
266
+
267
+ context "when there are no 'mailto:' links" do
268
+ it { expect(subject.mailtos).to be == [] }
269
+ end
270
+ end
271
+
272
+ describe "#each_link" do
273
+ context "when the page contains a links" do
274
+ let(:link1) { '/link1' }
275
+ let(:link2) { '/link2' }
276
+ let(:body) { %{<html><body><a href="#{link1}">link1</a> <a href="#{link2}">link2</a></body></html>} }
277
+
278
+ it "should yield each a/@href value" do
279
+ expect { |b|
280
+ subject.each_link(&b)
281
+ }.to yield_successive_args(link1, link2)
282
+ end
283
+ end
284
+
285
+ context "when the page contains frames" do
286
+ let(:frame1) { '/frame1' }
287
+ let(:frame2) { '/frame2' }
288
+ let(:body) { %{<html><body><frameset><frame src="#{frame1}" /><frame src="#{frame2}" /></frameset></body></html>} }
289
+
290
+ it "should yield each frame/@src value" do
291
+ expect { |b|
292
+ subject.each_link(&b)
293
+ }.to yield_successive_args(frame1, frame2)
294
+ end
295
+ end
296
+
297
+ context "when the page contains iframes" do
298
+ let(:iframe1) { '/iframe1' }
299
+ let(:iframe2) { '/iframe2' }
300
+ let(:body) { %{<html><body><iframe src="#{iframe1}" /><iframe src="#{iframe2}" /></body></html>} }
301
+
302
+ it "should yield each iframe/@src value" do
303
+ expect { |b|
304
+ subject.each_link(&b)
305
+ }.to yield_successive_args(iframe1, iframe2)
306
+ end
307
+ end
308
+
309
+ context "when the page contains remote stylesheets" do
310
+ let(:stylesheet1) { '/stylesheet1.css' }
311
+ let(:stylesheet2) { '/stylesheet2.css' }
312
+ let(:body) { %{<html><head><link rel="stylesheet" type="text/css" href="#{stylesheet1}" /><link rel="stylesheet" type="text/css" href="#{stylesheet2}" /><body><p>hello</p></body></html>} }
313
+
314
+ it "should yield each link/@href value" do
315
+ expect { |b|
316
+ subject.each_link(&b)
317
+ }.to yield_successive_args(stylesheet1, stylesheet2)
318
+ end
319
+ end
320
+
321
+ context "when the page contains remote javascript" do
322
+ let(:javascript1) { '/script1.js' }
323
+ let(:javascript2) { '/script2.js' }
324
+ let(:body) { %{<html><head><script type="text/javascript" src="#{javascript1}"></script><script type="text/javascript" src="#{javascript2}"></script><body><p>hello</p></body></html>} }
325
+
326
+ it "should yield each script/@src value" do
327
+ expect { |b|
328
+ subject.each_link(&b)
329
+ }.to yield_successive_args(javascript1, javascript2)
330
+ end
331
+ end
332
+ end
333
+
334
+ describe "#links" do
335
+ context "when the page contains links" do
336
+ let(:link) { '/link' }
337
+ let(:frame) { '/frame' }
338
+ let(:iframe) { '/iframe' }
339
+ let(:stylesheet) { '/stylesheet.css' }
340
+ let(:javascript) { '/script.js' }
341
+ let(:body) do
342
+ %{<html>} +
343
+ %{<head>} +
344
+ %{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
345
+ %{<script type="text/javascript" src="#{javascript}"></script>} +
346
+ %{</head>} +
347
+ %{<body>} +
348
+ %{<a href="#{link}">link</a>} +
349
+ %{<frameset><frame src="#{frame}" /></frameset>} +
350
+ %{<iframe src="#{iframe}" />} +
351
+ %{</body>} +
352
+ %{</html>}
353
+ end
354
+
355
+ it "should return an Array of links" do
356
+ expect(subject.links).to be == [
357
+ link,
358
+ frame,
359
+ iframe,
360
+ stylesheet,
361
+ javascript
362
+ ]
363
+ end
364
+ end
365
+
366
+ context "when the page does not contain any links" do
367
+ it { expect(subject.links).to be == [] }
368
+ end
369
+ end
370
+
371
+ describe "#each_url" do
372
+ context "when the page contains links" do
373
+ let(:link) { '/link' }
374
+ let(:frame) { '/frame' }
375
+ let(:iframe) { '/iframe' }
376
+ let(:stylesheet) { '/stylesheet.css' }
377
+ let(:javascript) { '/script.js' }
378
+ let(:body) do
379
+ %{<html>} +
380
+ %{<head>} +
381
+ %{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
382
+ %{<script type="text/javascript" src="#{javascript}"></script>} +
383
+ %{</head>} +
384
+ %{<body>} +
385
+ %{<a href="#{link}">link</a>} +
386
+ %{<frameset><frame src="#{frame}" /></frameset>} +
387
+ %{<iframe src="#{iframe}" />} +
388
+ %{</body>} +
389
+ %{</html>}
390
+ end
391
+
392
+ it "should return an Array of absolute URIs" do
393
+ expect { |b| subject.each_url(&b) }.to yield_successive_args(
394
+ URI("http://#{host}#{link}"),
395
+ URI("http://#{host}#{frame}"),
396
+ URI("http://#{host}#{iframe}"),
397
+ URI("http://#{host}#{stylesheet}"),
398
+ URI("http://#{host}#{javascript}")
399
+ )
400
+ end
401
+ end
402
+
403
+ context "when the page contains no links" do
404
+ it do
405
+ expect { |b|
406
+ subject.each_url(&b)
407
+ }.not_to yield_control
408
+ end
409
+ end
410
+ end
411
+
412
+ describe "#urls" do
413
+ context "when the page contains links" do
414
+ let(:link) { '/link' }
415
+ let(:frame) { '/frame' }
416
+ let(:iframe) { '/iframe' }
417
+ let(:stylesheet) { '/stylesheet.css' }
418
+ let(:javascript) { '/script.js' }
419
+ let(:body) do
420
+ %{<html>} +
421
+ %{<head>} +
422
+ %{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
423
+ %{<script type="text/javascript" src="#{javascript}"></script>} +
424
+ %{</head>} +
425
+ %{<body>} +
426
+ %{<a href="#{link}">link</a>} +
427
+ %{<frameset><frame src="#{frame}" /></frameset>} +
428
+ %{<iframe src="#{iframe}" />} +
429
+ %{</body>} +
430
+ %{</html>}
431
+ end
432
+
433
+ it "should return an Array of absolute URIs" do
434
+ expect(subject.urls).to be == [
435
+ URI("http://#{host}#{link}"),
436
+ URI("http://#{host}#{frame}"),
437
+ URI("http://#{host}#{iframe}"),
438
+ URI("http://#{host}#{stylesheet}"),
439
+ URI("http://#{host}#{javascript}")
440
+ ]
441
+ end
442
+ end
443
+
444
+ context "when the page contains no links" do
445
+ it { expect(subject.urls).to be == [] }
446
+ end
447
+ end
448
+
449
+ describe "#to_absolute" do
450
+ context "when given an relative path" do
451
+ let(:path) { '/foo/' }
452
+ let(:url) { URI("http://#{host}#{path}") }
453
+
454
+ let(:relative_path) { 'bar' }
455
+
456
+ subject { super().to_absolute(relative_path) }
457
+
458
+ it "should merge it with the page's URI" do
459
+ expect(subject).to be == URI("http://#{host}#{path}#{relative_path}")
460
+ end
461
+
462
+ context "when given a relative path with directory traversal" do
463
+ let(:expanded_path) { '/bar' }
464
+ let(:relative_path) { "../../.././../#{expanded_path}" }
465
+
466
+ it "should expand the relative path before merging it" do
467
+ expect(subject).to be == URI("http://#{host}#{expanded_path}")
468
+ end
469
+ end
470
+ end
471
+
472
+ context "when given an absolute path" do
473
+ let(:path) { '/foo/' }
474
+ let(:url) { URI("http://#{host}#{path}") }
475
+
476
+ let(:absolute_path) { '/bar/' }
477
+
478
+ subject { super().to_absolute(absolute_path) }
479
+
480
+ it "should override the page URI's path" do
481
+ expect(subject).to be == URI("http://#{host}#{absolute_path}")
482
+ end
483
+
484
+ context "when given an absolute path with directory traversal" do
485
+ let(:expanded_path) { '/bar/' }
486
+ let(:absolute_path) { "/../../.././../#{expanded_path}" }
487
+
488
+ it "should expand the absolute path before merging it" do
489
+ expect(subject).to be == URI("http://#{host}#{expanded_path}")
490
+ end
491
+ end
492
+ end
493
+
494
+ context "when given a remote link" do
495
+ let(:remote_host) { 'foo.example.com' }
496
+ let(:remote_path) { '/bar' }
497
+ let(:link) { "http://#{remote_host}#{remote_path}" }
498
+
499
+ subject { super().to_absolute(link) }
500
+
501
+ it "should override the page's URI" do
502
+ expect(subject).to be == URI(link)
503
+ end
504
+
505
+ context "when the remote link contains directory traversal" do
506
+ let(:expanded_path) { '/bar' }
507
+ let(:remote_path) { "/../.././../../#{expanded_path}" }
508
+
509
+ it "should expand the remote link's path" do
510
+ expect(subject).to be == URI("http://#{remote_host}#{expanded_path}")
511
+ end
512
+ end
513
+
514
+ context "when the remote link ftp://" do
515
+ let(:remote_path) { "/pub" }
516
+ let(:link) { "ftp://#{remote_host}#{remote_path}" }
517
+
518
+ it "should preserve the leading '/' of the path" do
519
+ expect(subject.path).to be == remote_path
520
+ end
521
+ end
522
+ end
523
+ end
524
+ end