spidr 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -0,0 +1,27 @@
require 'rspec'
require 'sinatra/base'
require 'webmock/rspec'

require 'spidr/agent'

#
# Shared context which serves an in-memory Sinatra application through
# WebMock and uses an Agent restricted to `host` as the subject.
#
# Including example groups call `app { ... }`; the given block is evaluated
# inside a fresh `Sinatra::Base` subclass.
#
RSpec.shared_context "example App" do
  let(:host) { 'example.com' }

  subject { Agent.new(host: host) }

  # Defines the `app` let from the given block, plus a `before` hook that
  # routes every request matching the host to that app and then starts the
  # subject at the host's root URL.
  def self.app(&routes)
    let(:app) do
      Class.new(Sinatra::Base).tap do |sinatra_app|
        sinatra_app.set :host, host
        sinatra_app.set :port, 80
        sinatra_app.class_eval(&routes)
      end
    end

    before do
      host_pattern = /#{Regexp.escape(host)}/
      stub_request(:any, host_pattern).to_rack(app)

      subject.start_at("http://#{host}/")
    end
  end
end
@@ -0,0 +1,33 @@
require 'rspec'
require 'net/http'

#
# Shared context which builds the subject as
# `described_class.new(url,response)`, where `response` is a
# Net::HTTPResponse assembled from the overridable lets below.
#
# Overridable lets:
#
# * `code` / `msg` - the status code and message of the response.
# * `content_type` - the Content-Type to set; skipped when `nil`.
# * `headers` - Hash of header name => value(s); an Array adds one field per
#   element, a `nil` value deletes the header.
# * `body` - the value returned by the stubbed `response.body`.
# * `host` - the host used to build `url`.
#
RSpec.shared_context "example Page" do
  let(:code)         { 200 }
  let(:msg)          { 'OK' }
  let(:content_type) { 'text/html' }
  let(:headers)      { {} }
  let(:body)         { '' }

  let(:response) do
    Net::HTTPResponse.new('1.1', code.to_s, msg).tap do |res|
      res.set_content_type(content_type) if content_type

      headers.each do |name,values|
        if values
          Array(values).each do |value|
            res.add_field(name,value)
          end
        else
          # Net::HTTPHeader does not define #remove_field; #delete is the
          # method that removes a header field.
          res.delete(name)
        end
      end

      # stub #body, otherwise Net::HTTP will check @socket
      allow(res).to receive(:body).and_return(body)
    end
  end

  let(:host) { 'example.com' }
  let(:url)  { URI::HTTP.build(host: host) }

  subject { described_class.new(url,response) }
end
@@ -0,0 +1,150 @@
require 'spec_helper'
require 'example_page'

require 'spidr/page'

# Specs for the Content-Type related methods of Page. The page under test is
# built by the "example Page" shared context; each group overrides the
# `content_type` let to control the response's Content-Type header.
describe Page do
  include_context "example Page"

  describe "#content_type" do
    it "should return the Content-Type as a String" do
      expect(subject.content_type).to be == content_type
    end

    context "when content_type is missing" do
      let(:content_type) { nil }

      it "should return an empty String" do
        expect(subject.content_type).to be == ''
      end
    end
  end

  describe "#content_types" do
    it "should return the Content-Type as an Array" do
      expect(subject.content_types).to be == [content_type]
    end

    context "when content_type is missing" do
      let(:content_type) { nil }

      it "should return an empty Array" do
        expect(subject.content_types).to be == []
      end
    end
  end

  describe "#content_charset" do
    let(:charset)      { 'utf8' }
    let(:content_type) { "text/html;charset=#{charset}" }

    it "should extract the 'charset=' param" do
      expect(subject.content_charset).to be == charset
    end

    context "when there is no 'charset='" do
      let(:content_type) { 'text/html' }

      it { expect(subject.content_charset).to be nil }
    end
  end

  describe "#is_content_type?" do
    let(:charset)      { 'utf8' }
    let(:sub_type)     { 'html' }
    let(:mime_type)    { "text/#{sub_type}" }
    let(:content_type) { "#{mime_type};charset=#{charset}" }

    context "when given a full mime-type" do
      context "and it matches the Content-Type's mime-type" do
        it { expect(subject.is_content_type?(mime_type)).to be true }
      end

      context "but it doesn't match the Content-Type's mime-type" do
        it { expect(subject.is_content_type?('text/plain')).to be false }
      end
    end

    context "when given a sub-type" do
      context "and it matches the Content-Type's sub-type" do
        it { expect(subject.is_content_type?(sub_type)).to be true }
      end

      context "but it doesn't match the Content-Type's sub-type" do
        it { expect(subject.is_content_type?('plain')).to be false }
      end
    end
  end

  # Shared examples for the Content-Type predicate methods: `method` must
  # return true for every listed content type and false for the placeholder
  # type 'unknown/unknown'.
  shared_examples "Content-Type method" do |method,*content_types|
    content_types.each do |content_type|
      context "when Content-Type includes #{content_type}" do
        # the block-local content_type is captured by the let's closure
        let(:content_type) { content_type }

        it { expect(subject.send(method)).to be true }
      end
    end

    context "when Content-Type does not include #{content_types.join(', ')}" do
      let(:content_type) { 'unknown/unknown' }

      it { expect(subject.send(method)).to be false }
    end
  end

  describe "#plain_text?" do
    include_examples "Content-Type method", :plain_text?, 'text/plain'
  end

  describe "#directory?" do
    include_examples "Content-Type method", :directory?, 'text/directory'
  end

  # NOTE: a second "#directory?" group that actually exercised :html? used to
  # sit here; it duplicated the "#html?" group below and has been removed.

  describe "#html?" do
    include_examples "Content-Type method", :html?, 'text/html'
  end

  describe "#xml?" do
    include_examples "Content-Type method", :xml?, 'text/xml', 'application/xml'
  end

  describe "#xsl?" do
    include_examples "Content-Type method", :xsl?, 'text/xsl'
  end

  describe "#javascript?" do
    include_examples "Content-Type method", :javascript?, 'text/javascript', 'application/javascript'
  end

  describe "#json?" do
    include_examples "Content-Type method", :json?, 'application/json'
  end

  describe "#css?" do
    include_examples "Content-Type method", :css?, 'text/css'
  end

  describe "#rss?" do
    include_examples "Content-Type method", :rss?, 'application/rss+xml', 'application/rdf+xml'
  end

  describe "#atom?" do
    include_examples "Content-Type method", :atom?, 'application/atom+xml'
  end

  describe "#ms_word?" do
    include_examples "Content-Type method", :ms_word?, 'application/msword'
  end

  describe "#pdf?" do
    include_examples "Content-Type method", :pdf?, 'application/pdf'
  end

  describe "#zip?" do
    include_examples "Content-Type method", :zip?, 'application/zip'
  end
end
@@ -0,0 +1,58 @@
require 'spec_helper'
require 'example_page'

require 'spidr/page'

# Specs for how Page exposes the Set-Cookie response header: as a String
# (#cookie), as an Array (#cookies) and as a Hash of cookie params
# (#cookie_params). The header is injected through the `headers` let of the
# "example Page" shared context.
describe Page do
  include_context "example Page"

  let(:name)  { 'foo' }
  let(:value) { 'bar' }
  let(:path)  { '/' }

  let(:cookie) do
    "#{name}=#{value}; Path=#{path}; Domain=#{host}; Secure; HTTPOnly"
  end

  let(:headers) { {'Set-Cookie' => cookie} }

  describe "#cookie" do
    it "should return the Set-Cookie header as a String" do
      expect(subject.cookie).to eq(cookie)
    end

    context "when Set-Cookie is not set" do
      let(:headers) { Hash.new }

      it "should return an empty String" do
        expect(subject.cookie).to eq('')
      end
    end
  end

  describe "#cookies" do
    it "should return the Set-Cookie header as an Array" do
      expect(subject.cookies).to eq([cookie])
    end

    context "when Set-Cookie is not set" do
      let(:headers) { Hash.new }

      it "should return an empty Array" do
        expect(subject.cookies).to eq([])
      end
    end
  end

  describe "#cookie_params" do
    it "should parse the cookie params into a Hash" do
      expected_params = {name => value}

      expect(subject.cookie_params).to eq(expected_params)
    end

    context "when the cookie has no value" do
      let(:value) { '' }

      it "should default the value to an empty String" do
        parsed_value = subject.cookie_params[name]

        expect(parsed_value).to eq('')
      end
    end
  end
end
@@ -0,0 +1,524 @@
require 'spec_helper'
require 'example_page'

require 'spidr/page'

# Specs for the HTML-related methods of Page: title extraction, meta-refresh
# and Location redirects, mailto: links, link/URL enumeration and
# #to_absolute. The page under test is built by the "example Page" shared
# context; groups override the `body`, `content_type`, `headers` and `url`
# lets to shape the response.
describe Page do
  include_context "example Page"

  let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }

  describe "#title" do
    context "when there is a title" do
      it "should return the title inner_text" do
        expect(subject.title).to be == 'example'
      end
    end

    context "when there is no title" do
      let(:body) { %{<html><head></head><body><p>hello</p></body></html>} }

      it "should return nil" do
        expect(subject.title).to be nil
      end
    end
  end

  describe "#each_meta_redirect" do
    context "when the Content-Type is text/html" do
      let(:content_type) { 'text/html' }

      context "and the HTML is valid" do
        let(:link)    { '/link' }
        let(:refresh) { 'refresh' }
        let(:body)    { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /></head><body>Redirecting...</body></html>} }

        it "should yield each meta http-equiv='refresh' URL" do
          expect { |b|
            subject.each_meta_redirect(&b)
          }.to yield_successive_args(link)
        end

        context "but when http-equiv is REFRESH" do
          let(:refresh) { 'REFRESH' }

          it "should ignore the case of refresh" do
            expect { |b|
              subject.each_meta_redirect(&b)
            }.to yield_successive_args(link)
          end
        end

        context "but the http-equiv attribute is missing" do
          let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta content="4; url=#{link}2" /></head><body>Redirecting...</body></html>} }

          it "should ignore those meta tags" do
            expect { |b|
              subject.each_meta_redirect(&b)
            }.to yield_successive_args(link)
          end
        end

        context "but http-equiv is not refresh" do
          let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="content-type" content="#{content_type}" /></head><body></body></html>} }

          it "should ignore those meta tags" do
            expect { |b|
              subject.each_meta_redirect(&b)
            }.to yield_successive_args(link)
          end
        end

        context "but the content attribute is missing" do
          let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="#{refresh}" /></head><body>Redirecting...</body></html>} }

          it "should ignore those meta tags" do
            expect { |b|
              subject.each_meta_redirect(&b)
            }.to yield_successive_args(link)
          end
        end

        context "but the content attribute does not contain url=..." do
          let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="#{refresh}" content="0" /></head><body>Redirecting...</body></html>} }

          it "should ignore those meta tags" do
            expect { |b|
              subject.each_meta_redirect(&b)
            }.to yield_successive_args(link)
          end
        end
      end

      context "but the HTML cannot be parsed" do
        let(:body) { "<html></" }

        it "should yield nothing" do
          # must call the method under test; `subject(&b)` never used the
          # block, so the expectation passed without exercising anything
          expect { |b|
            subject.each_meta_redirect(&b)
          }.not_to yield_control
        end
      end
    end

    context "when the Content-Type is not text/html" do
      let(:content_type) { 'text/xml' }

      it "should yield nothing" do
        # see above: the method itself has to receive the probe block
        expect { |b|
          subject.each_meta_redirect(&b)
        }.not_to yield_control
      end
    end

    context "when not given a block" do
      it "should return an Enumerator" do
        expect(subject.each_meta_redirect).to be_kind_of(Enumerator)
      end
    end
  end

  describe "#meta_redirect?" do
    context "when there are meta refresh redirects" do
      let(:body) { %{<html><head><meta http-equiv="refresh" content="4; url=/link" /></head><body>Redirecting...</body></html>} }

      it { expect(subject.meta_redirect?).to be true }
    end

    context "when there are no meta refresh redirects" do
      let(:body) { %{<html><head><meta http-equiv="content-type" content="text/html" /></head><body>Redirecting...</body></html>} }

      it { expect(subject.meta_redirect?).to be false }
    end
  end

  describe "#meta_redirects" do
    context "when there are meta refresh redirects" do
      let(:link1) { "/link1" }
      let(:link2) { "/link2" }
      let(:body)  { %{<html><head><meta http-equiv="refresh" content="4; url=#{link1}" /><meta http-equiv="refresh" content="1; url=#{link2}" /></head><body>Redirecting...</body></html>} }

      it "should return each meta refresh redirect URL" do
        expect(subject.meta_redirects).to be == [link1, link2]
      end
    end

    context "when there are no meta refresh redirects" do
      let(:body) { %{<html><head><meta http-equiv="content-type" content="text/html" /></head><body>Redirecting...</body></html>} }

      it { expect(subject.meta_redirects).to be == [] }
    end
  end

  describe "#each_redirect" do
    context "when the Location header is set" do
      let(:link)    { "http://#{host}/link" }
      let(:headers) { {'Location' => link} }

      it "should yield the Location header" do
        expect { |b|
          subject.each_redirect(&b)
        }.to yield_successive_args(link)
      end
    end

    context "when there are multiple Location headers" do
      let(:link1)   { "http://#{host}/link1" }
      let(:link2)   { "http://#{host}/link2" }
      let(:headers) { {'Location' => [link1, link2]} }

      it "should yield each Location header value" do
        expect { |b|
          subject.each_redirect(&b)
        }.to yield_successive_args(link1, link2)
      end
    end

    context "when there is no Location header set" do
      context "but there are meta refresh redirects" do
        let(:link1) { "/link1" }
        let(:link2) { "/link2" }
        let(:body)  { %{<html><head><meta http-equiv="refresh" content="4; url=#{link1}" /><meta http-equiv="refresh" content="1; url=#{link2}" /></head><body>Redirecting...</body></html>} }

        it "should yield each meta refresh redirect URL" do
          expect { |b|
            subject.each_redirect(&b)
          }.to yield_successive_args(link1, link2)
        end
      end

      context "and there are no meta refresh redirects" do
        it do
          expect { |b|
            subject.each_redirect(&b)
          }.not_to yield_control
        end
      end
    end

    context "when not given a block" do
      it "should return an Enumerator" do
        expect(subject.each_redirect).to be_kind_of(Enumerator)
      end
    end
  end

  describe "#redirects_to" do
    context "when there are redirects" do
      let(:link1)   { "http://#{host}/link1" }
      let(:link2)   { "http://#{host}/link2" }
      let(:headers) { {'Location' => [link1, link2]} }

      it "should return the redirects as an Array" do
        expect(subject.redirects_to).to be == [link1, link2]
      end
    end

    context "when there are no redirects" do
      it { expect(subject.redirects_to).to be == [] }
    end
  end

  describe "#each_mailto" do
    context "when the Content-Type is text/html" do
      let(:content_type) { 'text/html' }

      context "and the HTML is valid" do
        let(:email1) { "bob@example.com" }
        let(:email2) { "jim@example.com" }
        let(:body)   { %{<html><body><a href="mailto:#{email1}">email1</a> <a href="/link">link</a> <a href="mailto:#{email2}">email2</a></body></html>} }

        it "should yield each a link where the href starts with 'mailto:'" do
          expect { |b|
            subject.each_mailto(&b)
          }.to yield_successive_args(email1, email2)
        end
      end

      context "but the HTML is not valid" do
        let(:body) { "<html" }

        it "should yield nothing" do
          expect { |b|
            subject.each_mailto(&b)
          }.not_to yield_control
        end
      end
    end

    context "when the Content-Type is not text/html" do
      let(:content_type) { 'text/plain' }

      it "should yield nothing" do
        expect { |b|
          subject.each_mailto(&b)
        }.not_to yield_control
      end
    end
  end

  describe "#mailtos" do
    context "when there are 'mailto:' links" do
      let(:email1) { "bob@example.com" }
      let(:email2) { "jim@example.com" }
      let(:body)   { %{<html><body><a href="mailto:#{email1}">email1</a> <a href="/link">link</a> <a href="mailto:#{email2}">email2</a></body></html>} }

      it "should return all 'mailto:' links" do
        expect(subject.mailtos).to be == [email1, email2]
      end
    end

    context "when there are no 'mailto:' links" do
      it { expect(subject.mailtos).to be == [] }
    end
  end

  describe "#each_link" do
    context "when the page contains a links" do
      let(:link1) { '/link1' }
      let(:link2) { '/link2' }
      let(:body)  { %{<html><body><a href="#{link1}">link1</a> <a href="#{link2}">link2</a></body></html>} }

      it "should yield each a/@href value" do
        expect { |b|
          subject.each_link(&b)
        }.to yield_successive_args(link1, link2)
      end
    end

    context "when the page contains frames" do
      let(:frame1) { '/frame1' }
      let(:frame2) { '/frame2' }
      let(:body)   { %{<html><body><frameset><frame src="#{frame1}" /><frame src="#{frame2}" /></frameset></body></html>} }

      it "should yield each frame/@src value" do
        expect { |b|
          subject.each_link(&b)
        }.to yield_successive_args(frame1, frame2)
      end
    end

    context "when the page contains iframes" do
      let(:iframe1) { '/iframe1' }
      let(:iframe2) { '/iframe2' }
      let(:body)    { %{<html><body><iframe src="#{iframe1}" /><iframe src="#{iframe2}" /></body></html>} }

      it "should yield each iframe/@src value" do
        expect { |b|
          subject.each_link(&b)
        }.to yield_successive_args(iframe1, iframe2)
      end
    end

    context "when the page contains remote stylesheets" do
      let(:stylesheet1) { '/stylesheet1.css' }
      let(:stylesheet2) { '/stylesheet2.css' }
      let(:body)        { %{<html><head><link rel="stylesheet" type="text/css" href="#{stylesheet1}" /><link rel="stylesheet" type="text/css" href="#{stylesheet2}" /><body><p>hello</p></body></html>} }

      it "should yield each link/@href value" do
        expect { |b|
          subject.each_link(&b)
        }.to yield_successive_args(stylesheet1, stylesheet2)
      end
    end

    context "when the page contains remote javascript" do
      let(:javascript1) { '/script1.js' }
      let(:javascript2) { '/script2.js' }
      let(:body)        { %{<html><head><script type="text/javascript" src="#{javascript1}"></script><script type="text/javascript" src="#{javascript2}"></script><body><p>hello</p></body></html>} }

      it "should yield each script/@src value" do
        expect { |b|
          subject.each_link(&b)
        }.to yield_successive_args(javascript1, javascript2)
      end
    end
  end

  # Fixture shared by the #links, #each_url and #urls groups: a page whose
  # body contains one stylesheet, one script, one anchor, one frame and one
  # iframe, each pointing at the path held by the let of the same name.
  shared_context "page with links" do
    let(:link)       { '/link' }
    let(:frame)      { '/frame' }
    let(:iframe)     { '/iframe' }
    let(:stylesheet) { '/stylesheet.css' }
    let(:javascript) { '/script.js' }

    let(:body) do
      %{<html>} +
        %{<head>} +
          %{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
          %{<script type="text/javascript" src="#{javascript}"></script>} +
        %{</head>} +
        %{<body>} +
          %{<a href="#{link}">link</a>} +
          %{<frameset><frame src="#{frame}" /></frameset>} +
          %{<iframe src="#{iframe}" />} +
        %{</body>} +
      %{</html>}
    end
  end

  describe "#links" do
    context "when the page contains links" do
      include_context "page with links"

      it "should return an Array of links" do
        expect(subject.links).to be == [
          link,
          frame,
          iframe,
          stylesheet,
          javascript
        ]
      end
    end

    context "when the page does not contain any links" do
      it { expect(subject.links).to be == [] }
    end
  end

  describe "#each_url" do
    context "when the page contains links" do
      include_context "page with links"

      it "should yield each absolute URI" do
        expect { |b| subject.each_url(&b) }.to yield_successive_args(
          URI("http://#{host}#{link}"),
          URI("http://#{host}#{frame}"),
          URI("http://#{host}#{iframe}"),
          URI("http://#{host}#{stylesheet}"),
          URI("http://#{host}#{javascript}")
        )
      end
    end

    context "when the page contains no links" do
      it do
        expect { |b|
          subject.each_url(&b)
        }.not_to yield_control
      end
    end
  end

  describe "#urls" do
    context "when the page contains links" do
      include_context "page with links"

      it "should return an Array of absolute URIs" do
        expect(subject.urls).to be == [
          URI("http://#{host}#{link}"),
          URI("http://#{host}#{frame}"),
          URI("http://#{host}#{iframe}"),
          URI("http://#{host}#{stylesheet}"),
          URI("http://#{host}#{javascript}")
        ]
      end
    end

    context "when the page contains no links" do
      it { expect(subject.urls).to be == [] }
    end
  end

  describe "#to_absolute" do
    context "when given a relative path" do
      let(:path) { '/foo/' }
      let(:url)  { URI("http://#{host}#{path}") }

      let(:relative_path) { 'bar' }

      # super() is the page from the shared context; the subject of these
      # examples is the URI returned by #to_absolute
      subject { super().to_absolute(relative_path) }

      it "should merge it with the page's URI" do
        expect(subject).to be == URI("http://#{host}#{path}#{relative_path}")
      end

      context "when given a relative path with directory traversal" do
        let(:expanded_path) { '/bar' }
        let(:relative_path) { "../../.././../#{expanded_path}" }

        it "should expand the relative path before merging it" do
          expect(subject).to be == URI("http://#{host}#{expanded_path}")
        end
      end
    end

    context "when given an absolute path" do
      let(:path) { '/foo/' }
      let(:url)  { URI("http://#{host}#{path}") }

      let(:absolute_path) { '/bar/' }

      subject { super().to_absolute(absolute_path) }

      it "should override the page URI's path" do
        expect(subject).to be == URI("http://#{host}#{absolute_path}")
      end

      context "when given an absolute path with directory traversal" do
        let(:expanded_path) { '/bar/' }
        let(:absolute_path) { "/../../.././../#{expanded_path}" }

        it "should expand the absolute path before merging it" do
          expect(subject).to be == URI("http://#{host}#{expanded_path}")
        end
      end
    end

    context "when given a remote link" do
      let(:remote_host) { 'foo.example.com' }
      let(:remote_path) { '/bar' }
      let(:link)        { "http://#{remote_host}#{remote_path}" }

      subject { super().to_absolute(link) }

      it "should override the page's URI" do
        expect(subject).to be == URI(link)
      end

      context "when the remote link contains directory traversal" do
        let(:expanded_path) { '/bar' }
        let(:remote_path)   { "/../.././../../#{expanded_path}" }

        it "should expand the remote link's path" do
          expect(subject).to be == URI("http://#{remote_host}#{expanded_path}")
        end
      end

      context "when the remote link is ftp://" do
        let(:remote_path) { "/pub" }
        let(:link)        { "ftp://#{remote_host}#{remote_path}" }

        it "should preserve the leading '/' of the path" do
          expect(subject.path).to be == remote_path
        end
      end
    end
  end
end