spidr 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
data/spec/example_app.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'sinatra/base'
|
3
|
+
require 'webmock/rspec'
|
4
|
+
|
5
|
+
require 'spidr/agent'
|
6
|
+
|
7
|
+
#
# Shared context that serves a throw-away Sinatra application for `host`
# through WebMock and spiders it with a `Spidr::Agent` before each example.
#
# Including example groups describe the application by calling
# `app { ... }` with the body of the Sinatra class (routes, settings).
#
RSpec.shared_context "example App" do
  let(:host) { 'example.com' }

  # Fully qualified, so this file does not rely on the Spidr namespace
  # having been included into the top level by some other file.
  subject { Spidr::Agent.new(host: host) }

  #
  # Defines the `app` helper and the `before` hook for the calling group.
  #
  # @yield
  #   The body of the anonymous `Sinatra::Base` subclass; it is evaluated
  #   with `class_eval` after `host` and `port` have been set.
  #
  def self.app(&block)
    let(:app) do
      Class.new(Sinatra::Base).tap do |klass|
        klass.set :host, host
        klass.set :port, 80
        klass.class_eval(&block)
      end
    end

    before do
      # route every request whose URL mentions the host to the Rack app
      stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)

      subject.start_at("http://#{host}/")
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
|
3
|
+
#
# Shared context that builds the subject (`described_class.new(url,response)`)
# from a hand-made `Net::HTTPResponse`, without performing any network I/O.
#
# Including example groups customise the response by overriding `code`,
# `msg`, `content_type`, `headers`, `body`, `host` or `url` with `let`.
#
RSpec.shared_context "example Page" do
  let(:code)         { 200 }
  let(:msg)          { 'OK' }
  let(:content_type) { 'text/html' }
  let(:headers)      { {} }
  let(:body)         { '' }

  let(:response) do
    Net::HTTPResponse.new('1.1', code.to_s, msg).tap do |response|
      # a nil content_type leaves the Content-Type header unset
      response.set_content_type(content_type) if content_type

      # each header maps to a single value or an Array of values;
      # a nil/false value removes that header from the response instead
      headers.each do |name,values|
        if values
          Array(values).each do |value|
            response.add_field(name,value)
          end
        else
          # Net::HTTPHeader has no #remove_field; #delete removes a header
          response.delete(name)
        end
      end

      # stub #body, otherwise Net::HTTP will check @socket
      allow(response).to receive(:body).and_return(body)
    end
  end

  let(:host) { 'example.com' }
  let(:url)  { URI::HTTP.build(host: host) }

  subject { described_class.new(url,response) }
end
|
@@ -0,0 +1,150 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'example_page'
|
3
|
+
|
4
|
+
require 'spidr/page'
|
5
|
+
|
6
|
+
describe Page do
|
7
|
+
include_context "example Page"
|
8
|
+
|
9
|
+
describe "#content_type" do
|
10
|
+
it "should return the Content-Type as a String" do
|
11
|
+
expect(subject.content_type).to be == content_type
|
12
|
+
end
|
13
|
+
|
14
|
+
context "when content_type is missing" do
|
15
|
+
let(:content_type) { nil }
|
16
|
+
|
17
|
+
it "should return an empty String" do
|
18
|
+
expect(subject.content_type).to be == ''
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe "#content_types" do
|
24
|
+
it "should return the Content-Type as an Array" do
|
25
|
+
expect(subject.content_types).to be == [content_type]
|
26
|
+
end
|
27
|
+
|
28
|
+
context "when content_type is missing" do
|
29
|
+
let(:content_type) { nil }
|
30
|
+
|
31
|
+
it "should return an empty Array" do
|
32
|
+
expect(subject.content_types).to be == []
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe "#content_charset" do
|
38
|
+
let(:charset) { 'utf8' }
|
39
|
+
let(:content_type) { "text/html;charset=#{charset}" }
|
40
|
+
|
41
|
+
it "should extract the 'charset=' param" do
|
42
|
+
expect(subject.content_charset).to be == charset
|
43
|
+
end
|
44
|
+
|
45
|
+
context "when there is no 'charset='" do
|
46
|
+
let(:content_type) { 'text/html' }
|
47
|
+
|
48
|
+
it { expect(subject.content_charset).to be nil }
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
describe "#is_content_type?" do
|
53
|
+
let(:charset) { 'utf8' }
|
54
|
+
let(:sub_type) { 'html' }
|
55
|
+
let(:mime_type) { "text/#{sub_type}" }
|
56
|
+
let(:content_type) { "#{mime_type};charset=#{charset}" }
|
57
|
+
|
58
|
+
context "when given a full mime-type" do
|
59
|
+
context "and it matches the Content-Type's mime-type" do
|
60
|
+
it { expect(subject.is_content_type?(mime_type)).to be true }
|
61
|
+
end
|
62
|
+
|
63
|
+
context "but it doesn't match the Content-Type's mime-type" do
|
64
|
+
it { expect(subject.is_content_type?('text/plain')).to be false }
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
context "when given a sub-type" do
|
69
|
+
context "and it matches the Content-Type's sub-type" do
|
70
|
+
it { expect(subject.is_content_type?(sub_type)).to be true }
|
71
|
+
end
|
72
|
+
|
73
|
+
context "but it doesn't match the Content-Type's sub-type" do
|
74
|
+
it { expect(subject.is_content_type?('plain')).to be false }
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
shared_examples "Content-Type method" do |method,*content_types|
|
80
|
+
content_types.each do |content_type|
|
81
|
+
context "when Content-Type includes #{content_type}" do
|
82
|
+
let(:content_type) { content_type }
|
83
|
+
|
84
|
+
it { expect(subject.send(method)).to be true }
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
context "when Content-Type does not include #{content_types.join(', ')}" do
|
89
|
+
let(:content_type) { 'unknown/unknown' }
|
90
|
+
|
91
|
+
it { expect(subject.send(method)).to be false }
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
describe "#plain_text?" do
|
96
|
+
include_examples "Content-Type method", :plain_text?, 'text/plain'
|
97
|
+
end
|
98
|
+
|
99
|
+
describe "#directory?" do
|
100
|
+
include_examples "Content-Type method", :directory?, 'text/directory'
|
101
|
+
end
|
102
|
+
|
103
|
+
# Runs the shared "Content-Type method" examples against #html?:
# true for text/html, false for an unknown Content-Type.
describe "#html?" do
  include_examples "Content-Type method", :html?, 'text/html'
end
|
110
|
+
|
111
|
+
describe "#xml?" do
|
112
|
+
include_examples "Content-Type method", :xml?, 'text/xml', 'application/xml'
|
113
|
+
end
|
114
|
+
|
115
|
+
describe "#xsl?" do
|
116
|
+
include_examples "Content-Type method", :xsl?, 'text/xsl'
|
117
|
+
end
|
118
|
+
|
119
|
+
describe "#javascript?" do
|
120
|
+
include_examples "Content-Type method", :javascript?, 'text/javascript', 'application/javascript'
|
121
|
+
end
|
122
|
+
|
123
|
+
describe "#json?" do
|
124
|
+
include_examples "Content-Type method", :json?, 'application/json'
|
125
|
+
end
|
126
|
+
|
127
|
+
describe "#css?" do
|
128
|
+
include_examples "Content-Type method", :css?, 'text/css'
|
129
|
+
end
|
130
|
+
|
131
|
+
describe "#rss?" do
|
132
|
+
include_examples "Content-Type method", :rss?, 'application/rss+xml', 'application/rdf+xml'
|
133
|
+
end
|
134
|
+
|
135
|
+
describe "#atom?" do
|
136
|
+
include_examples "Content-Type method", :atom?, 'application/atom+xml'
|
137
|
+
end
|
138
|
+
|
139
|
+
describe "#ms_word?" do
|
140
|
+
include_examples "Content-Type method", :ms_word?, 'application/msword'
|
141
|
+
end
|
142
|
+
|
143
|
+
describe "#pdf?" do
|
144
|
+
include_examples "Content-Type method", :pdf?, 'application/pdf'
|
145
|
+
end
|
146
|
+
|
147
|
+
describe "#zip?" do
|
148
|
+
include_examples "Content-Type method", :zip?, 'application/zip'
|
149
|
+
end
|
150
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'example_page'
|
3
|
+
|
4
|
+
require 'spidr/page'
|
5
|
+
|
6
|
+
describe Page do
|
7
|
+
include_context "example Page"
|
8
|
+
|
9
|
+
let(:name) { 'foo' }
|
10
|
+
let(:value) { 'bar' }
|
11
|
+
let(:path) { '/' }
|
12
|
+
let(:cookie) { "#{name}=#{value}; Path=#{path}; Domain=#{host}; Secure; HTTPOnly" }
|
13
|
+
let(:headers) do
|
14
|
+
{'Set-Cookie' => cookie}
|
15
|
+
end
|
16
|
+
|
17
|
+
describe "#cookie" do
|
18
|
+
it "should return the Set-Cookie header as a String" do
|
19
|
+
expect(subject.cookie).to be == cookie
|
20
|
+
end
|
21
|
+
|
22
|
+
context "when Set-Cookie is not set" do
|
23
|
+
let(:headers) { {} }
|
24
|
+
|
25
|
+
it "should return an empty String" do
|
26
|
+
expect(subject.cookie).to be == ''
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe "#cookies" do
|
32
|
+
it "should return the Set-Cookie header as an Array" do
|
33
|
+
expect(subject.cookies).to be == [cookie]
|
34
|
+
end
|
35
|
+
|
36
|
+
context "when Set-Cookie is not set" do
|
37
|
+
let(:headers) { {} }
|
38
|
+
|
39
|
+
it "should return an empty Array" do
|
40
|
+
expect(subject.cookies).to be == []
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
describe "#cookie_params" do
|
46
|
+
it "should parse the cookie params into a Hash" do
|
47
|
+
expect(subject.cookie_params).to be == {name => value}
|
48
|
+
end
|
49
|
+
|
50
|
+
context "when the cookie has no value" do
|
51
|
+
let(:value) { '' }
|
52
|
+
|
53
|
+
it "should default the value to an empty String" do
|
54
|
+
expect(subject.cookie_params[name]).to be == ''
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
@@ -0,0 +1,524 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'example_page'
|
3
|
+
|
4
|
+
require 'spidr/page'
|
5
|
+
|
6
|
+
describe Page do
|
7
|
+
include_context "example Page"
|
8
|
+
|
9
|
+
let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
|
10
|
+
|
11
|
+
describe "#title" do
|
12
|
+
context "when there is a title" do
|
13
|
+
it "should return the title inner_text" do
|
14
|
+
expect(subject.title).to be == 'example'
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
context "when there is no title" do
|
19
|
+
let(:body) { %{<html><head></head><body><p>hello</p></body></html>} }
|
20
|
+
|
21
|
+
it "should return nil" do
|
22
|
+
expect(subject.title).to be nil
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
describe "#each_meta_redirect" do
|
28
|
+
context "when the Content-Type is text/html" do
|
29
|
+
let(:content_type) { 'text/html' }
|
30
|
+
|
31
|
+
context "and the HTML is valid" do
|
32
|
+
let(:link) { '/link' }
|
33
|
+
let(:refresh) { 'refresh' }
|
34
|
+
let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /></head><body>Redirecting...</body></html>} }
|
35
|
+
|
36
|
+
it "should yield each meta http-equiv='refresh' URL" do
|
37
|
+
expect { |b|
|
38
|
+
subject.each_meta_redirect(&b)
|
39
|
+
}.to yield_successive_args(link)
|
40
|
+
end
|
41
|
+
|
42
|
+
context "but when http-equiv is REFRESH" do
|
43
|
+
let(:refresh) { 'REFRESH' }
|
44
|
+
|
45
|
+
it "should ignore the case of refresh" do
|
46
|
+
expect { |b|
|
47
|
+
subject.each_meta_redirect(&b)
|
48
|
+
}.to yield_successive_args(link)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
context "but the http-equiv attribute is missing" do
|
53
|
+
let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta content="4; url=#{link}2" /></head><body>Redirecting...</body></html>} }
|
54
|
+
|
55
|
+
it "should ignore those meta tags" do
|
56
|
+
expect { |b|
|
57
|
+
subject.each_meta_redirect(&b)
|
58
|
+
}.to yield_successive_args(link)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
context "but http-equiv is not refresh" do
|
63
|
+
let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="content-type" content="#{content_type}" /></head><body></body></html>} }
|
64
|
+
|
65
|
+
it "should ignore those meta tags" do
|
66
|
+
expect { |b|
|
67
|
+
subject.each_meta_redirect(&b)
|
68
|
+
}.to yield_successive_args(link)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
context "but the content attribute is missing" do
|
73
|
+
let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="#{refresh}" /></head><body>Redirecting...</body></html>} }
|
74
|
+
|
75
|
+
it "should ignore those meta tags" do
|
76
|
+
expect { |b|
|
77
|
+
subject.each_meta_redirect(&b)
|
78
|
+
}.to yield_successive_args(link)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
context "but the content attribute does not contain url=..." do
|
83
|
+
let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="#{refresh}" content="0" /></head><body>Redirecting...</body></html>} }
|
84
|
+
|
85
|
+
it "should ignore those meta tags" do
|
86
|
+
expect { |b|
|
87
|
+
subject.each_meta_redirect(&b)
|
88
|
+
}.to yield_successive_args(link)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
context "but the HTML cannot be parsed" do
  let(:body) { "<html></" }

  it "should yield nothing" do
    # the probe block has to be handed to #each_meta_redirect itself;
    # passing it to `subject` never invokes the method under test
    expect { |b|
      subject.each_meta_redirect(&b)
    }.not_to yield_control
  end
end
|
100
|
+
end
|
101
|
+
|
102
|
+
context "when the Content-Type is not text/html" do
  let(:content_type) { 'text/xml' }

  it "should yield nothing" do
    # the probe block has to be handed to #each_meta_redirect itself;
    # passing it to `subject` never invokes the method under test
    expect { |b|
      subject.each_meta_redirect(&b)
    }.not_to yield_control
  end
end
|
109
|
+
|
110
|
+
context "when not given a block" do
|
111
|
+
it "should return an Enumerator" do
|
112
|
+
expect(subject.each_meta_redirect).to be_kind_of(Enumerator)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
describe "#meta_redirect?" do
|
118
|
+
context "when there are meta refresh redirects" do
|
119
|
+
let(:body) { %{<html><head><meta http-equiv="refresh" content="4; url=/link" /></head><body>Redirecting...</body></html>} }
|
120
|
+
|
121
|
+
it { expect(subject.meta_redirect?).to be true }
|
122
|
+
end
|
123
|
+
|
124
|
+
context "when there are no meta refresh redirects" do
|
125
|
+
let(:body) { %{<html><head><meta http-equiv="content-type" content="text/html" /></head><body>Redirecting...</body></html>} }
|
126
|
+
|
127
|
+
it { expect(subject.meta_redirect?).to be false }
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
describe "#meta_redirects" do
|
132
|
+
context "when there are meta refresh redirects" do
|
133
|
+
let(:link1) { "/link1" }
|
134
|
+
let(:link2) { "/link2" }
|
135
|
+
let(:body) { %{<html><head><meta http-equiv="refresh" content="4; url=#{link1}" /><meta http-equiv="refresh" content="1; url=#{link2}" /></head><body>Redirecting...</body></html>} }
|
136
|
+
|
137
|
+
it "should return each meta refresh redirect URL" do
|
138
|
+
expect(subject.meta_redirects).to be == [link1, link2]
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
context "when there are no meta refresh redirects" do
|
143
|
+
let(:body) { %{<html><head><meta http-equiv="content-type" content="text/html" /></head><body>Redirecting...</body></html>} }
|
144
|
+
|
145
|
+
it { expect(subject.meta_redirects).to be == [] }
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
describe "#each_redirect" do
|
150
|
+
context "when the Location header is set" do
|
151
|
+
let(:link) { "http://#{host}/link" }
|
152
|
+
let(:headers) { {'Location' => link} }
|
153
|
+
|
154
|
+
it "should yield the Location header" do
|
155
|
+
expect { |b|
|
156
|
+
subject.each_redirect(&b)
|
157
|
+
}.to yield_successive_args(link)
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
context "when there are multiple Location headers" do
|
162
|
+
let(:link1) { "http://#{host}/link1" }
|
163
|
+
let(:link2) { "http://#{host}/link2" }
|
164
|
+
let(:headers) { {'Location' => [link1, link2]} }
|
165
|
+
|
166
|
+
it "should yield each Location header value" do
|
167
|
+
expect { |b|
|
168
|
+
subject.each_redirect(&b)
|
169
|
+
}.to yield_successive_args(link1, link2)
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
context "when there is no Location header set" do
|
174
|
+
context "but there are meta refresh redirects" do
|
175
|
+
let(:link1) { "/link1" }
|
176
|
+
let(:link2) { "/link2" }
|
177
|
+
let(:body) { %{<html><head><meta http-equiv="refresh" content="4; url=#{link1}" /><meta http-equiv="refresh" content="1; url=#{link2}" /></head><body>Redirecting...</body></html>} }
|
178
|
+
|
179
|
+
it "should yield each meta refresh redirect URL" do
|
180
|
+
expect { |b|
|
181
|
+
subject.each_redirect(&b)
|
182
|
+
}.to yield_successive_args(link1, link2)
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
context "and there are no meta refresh redirects" do
|
187
|
+
it do
|
188
|
+
expect { |b|
|
189
|
+
subject.each_redirect(&b)
|
190
|
+
}.not_to yield_control
|
191
|
+
end
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
context "when not given a block" do
|
196
|
+
it "should return an Enumerator" do
|
197
|
+
expect(subject.each_redirect).to be_kind_of(Enumerator)
|
198
|
+
end
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
# Method-level group: uses `describe` like every other "#method" group here.
describe "#redirects_to" do
  context "when there are redirects" do
    let(:link1) { "http://#{host}/link1" }
    let(:link2) { "http://#{host}/link2" }
    let(:headers) { {'Location' => [link1, link2]} }

    it "should return the redirects as an Array" do
      expect(subject.redirects_to).to be == [link1, link2]
    end
  end

  context "when there are no redirects" do
    it { expect(subject.redirects_to).to be == [] }
  end
end
|
217
|
+
|
218
|
+
describe "#each_mailto" do
|
219
|
+
context "when the Content-Type is text/html" do
|
220
|
+
let(:content_type) { 'text/html' }
|
221
|
+
|
222
|
+
context "and the HTML is valid" do
|
223
|
+
let(:email1) { "bob@example.com" }
|
224
|
+
let(:email2) { "jim@example.com" }
|
225
|
+
let(:body) { %{<html><body><a href="mailto:#{email1}">email1</a> <a href="/link">link</a> <a href="mailto:#{email2}">email2</a></body></html>} }
|
226
|
+
|
227
|
+
it "should yield each a link where the href starts with 'mailto:'" do
|
228
|
+
expect { |b|
|
229
|
+
subject.each_mailto(&b)
|
230
|
+
}.to yield_successive_args(email1, email2)
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
context "but the HTML is not valid" do
|
235
|
+
let(:body) { "<html" }
|
236
|
+
|
237
|
+
it "should yield nothing" do
|
238
|
+
expect { |b|
|
239
|
+
subject.each_mailto(&b)
|
240
|
+
}.not_to yield_control
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
context "when the Content-Type is not text/html" do
|
246
|
+
let(:content_type) { 'text/plain' }
|
247
|
+
|
248
|
+
it "should yield nothing" do
|
249
|
+
expect { |b|
|
250
|
+
subject.each_mailto(&b)
|
251
|
+
}.not_to yield_control
|
252
|
+
end
|
253
|
+
end
|
254
|
+
end
|
255
|
+
|
256
|
+
describe "#mailtos" do
|
257
|
+
context "when there are 'mailto:' links" do
|
258
|
+
let(:email1) { "bob@example.com" }
|
259
|
+
let(:email2) { "jim@example.com" }
|
260
|
+
let(:body) { %{<html><body><a href="mailto:#{email1}">email1</a> <a href="/link">link</a> <a href="mailto:#{email2}">email2</a></body></html>} }
|
261
|
+
|
262
|
+
it "should return all 'mailto:' links" do
|
263
|
+
expect(subject.mailtos).to be == [email1, email2]
|
264
|
+
end
|
265
|
+
end
|
266
|
+
|
267
|
+
context "when there are no 'mailto:' links" do
|
268
|
+
it { expect(subject.mailtos).to be == [] }
|
269
|
+
end
|
270
|
+
end
|
271
|
+
|
272
|
+
describe "#each_link" do
|
273
|
+
context "when the page contains a links" do
|
274
|
+
let(:link1) { '/link1' }
|
275
|
+
let(:link2) { '/link2' }
|
276
|
+
let(:body) { %{<html><body><a href="#{link1}">link1</a> <a href="#{link2}">link2</a></body></html>} }
|
277
|
+
|
278
|
+
it "should yield each a/@href value" do
|
279
|
+
expect { |b|
|
280
|
+
subject.each_link(&b)
|
281
|
+
}.to yield_successive_args(link1, link2)
|
282
|
+
end
|
283
|
+
end
|
284
|
+
|
285
|
+
context "when the page contains frames" do
|
286
|
+
let(:frame1) { '/frame1' }
|
287
|
+
let(:frame2) { '/frame2' }
|
288
|
+
let(:body) { %{<html><body><frameset><frame src="#{frame1}" /><frame src="#{frame2}" /></frameset></body></html>} }
|
289
|
+
|
290
|
+
it "should yield each frame/@src value" do
|
291
|
+
expect { |b|
|
292
|
+
subject.each_link(&b)
|
293
|
+
}.to yield_successive_args(frame1, frame2)
|
294
|
+
end
|
295
|
+
end
|
296
|
+
|
297
|
+
context "when the page contains iframes" do
|
298
|
+
let(:iframe1) { '/iframe1' }
|
299
|
+
let(:iframe2) { '/iframe2' }
|
300
|
+
let(:body) { %{<html><body><iframe src="#{iframe1}" /><iframe src="#{iframe2}" /></body></html>} }
|
301
|
+
|
302
|
+
it "should yield each iframe/@src value" do
|
303
|
+
expect { |b|
|
304
|
+
subject.each_link(&b)
|
305
|
+
}.to yield_successive_args(iframe1, iframe2)
|
306
|
+
end
|
307
|
+
end
|
308
|
+
|
309
|
+
context "when the page contains remote stylesheets" do
|
310
|
+
let(:stylesheet1) { '/stylesheet1.css' }
|
311
|
+
let(:stylesheet2) { '/stylesheet2.css' }
|
312
|
+
let(:body) { %{<html><head><link rel="stylesheet" type="text/css" href="#{stylesheet1}" /><link rel="stylesheet" type="text/css" href="#{stylesheet2}" /><body><p>hello</p></body></html>} }
|
313
|
+
|
314
|
+
it "should yield each link/@href value" do
|
315
|
+
expect { |b|
|
316
|
+
subject.each_link(&b)
|
317
|
+
}.to yield_successive_args(stylesheet1, stylesheet2)
|
318
|
+
end
|
319
|
+
end
|
320
|
+
|
321
|
+
context "when the page contains remote javascript" do
|
322
|
+
let(:javascript1) { '/script1.js' }
|
323
|
+
let(:javascript2) { '/script2.js' }
|
324
|
+
let(:body) { %{<html><head><script type="text/javascript" src="#{javascript1}"></script><script type="text/javascript" src="#{javascript2}"></script><body><p>hello</p></body></html>} }
|
325
|
+
|
326
|
+
it "should yield each script/@src value" do
|
327
|
+
expect { |b|
|
328
|
+
subject.each_link(&b)
|
329
|
+
}.to yield_successive_args(javascript1, javascript2)
|
330
|
+
end
|
331
|
+
end
|
332
|
+
end
|
333
|
+
|
334
|
+
describe "#links" do
|
335
|
+
context "when the page contains links" do
|
336
|
+
let(:link) { '/link' }
|
337
|
+
let(:frame) { '/frame' }
|
338
|
+
let(:iframe) { '/iframe' }
|
339
|
+
let(:stylesheet) { '/stylesheet.css' }
|
340
|
+
let(:javascript) { '/script.js' }
|
341
|
+
let(:body) do
|
342
|
+
%{<html>} +
|
343
|
+
%{<head>} +
|
344
|
+
%{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
|
345
|
+
%{<script type="text/javascript" src="#{javascript}"></script>} +
|
346
|
+
%{</head>} +
|
347
|
+
%{<body>} +
|
348
|
+
%{<a href="#{link}">link</a>} +
|
349
|
+
%{<frameset><frame src="#{frame}" /></frameset>} +
|
350
|
+
%{<iframe src="#{iframe}" />} +
|
351
|
+
%{</body>} +
|
352
|
+
%{</html>}
|
353
|
+
end
|
354
|
+
|
355
|
+
it "should return an Array of links" do
|
356
|
+
expect(subject.links).to be == [
|
357
|
+
link,
|
358
|
+
frame,
|
359
|
+
iframe,
|
360
|
+
stylesheet,
|
361
|
+
javascript
|
362
|
+
]
|
363
|
+
end
|
364
|
+
end
|
365
|
+
|
366
|
+
context "when the page does not contain any links" do
|
367
|
+
it { expect(subject.links).to be == [] }
|
368
|
+
end
|
369
|
+
end
|
370
|
+
|
371
|
+
describe "#each_url" do
  context "when the page contains links" do
    let(:link)       { '/link' }
    let(:frame)      { '/frame' }
    let(:iframe)     { '/iframe' }
    let(:stylesheet) { '/stylesheet.css' }
    let(:javascript) { '/script.js' }

    # one link of every kind covered by this example
    let(:body) do
      %{<html>} +
        %{<head>} +
          %{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
          %{<script type="text/javascript" src="#{javascript}"></script>} +
        %{</head>} +
        %{<body>} +
          %{<a href="#{link}">link</a>} +
          %{<frameset><frame src="#{frame}" /></frameset>} +
          %{<iframe src="#{iframe}" />} +
        %{</body>} +
      %{</html>}
    end

    # #each_url yields; the Array-returning variant is covered by "#urls"
    it "should yield each absolute URI" do
      expect { |b| subject.each_url(&b) }.to yield_successive_args(
        URI("http://#{host}#{link}"),
        URI("http://#{host}#{frame}"),
        URI("http://#{host}#{iframe}"),
        URI("http://#{host}#{stylesheet}"),
        URI("http://#{host}#{javascript}")
      )
    end
  end

  context "when the page contains no links" do
    it do
      expect { |b|
        subject.each_url(&b)
      }.not_to yield_control
    end
  end
end
|
411
|
+
|
412
|
+
describe "#urls" do
|
413
|
+
context "when the page contains links" do
|
414
|
+
let(:link) { '/link' }
|
415
|
+
let(:frame) { '/frame' }
|
416
|
+
let(:iframe) { '/iframe' }
|
417
|
+
let(:stylesheet) { '/stylesheet.css' }
|
418
|
+
let(:javascript) { '/script.js' }
|
419
|
+
let(:body) do
|
420
|
+
%{<html>} +
|
421
|
+
%{<head>} +
|
422
|
+
%{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
|
423
|
+
%{<script type="text/javascript" src="#{javascript}"></script>} +
|
424
|
+
%{</head>} +
|
425
|
+
%{<body>} +
|
426
|
+
%{<a href="#{link}">link</a>} +
|
427
|
+
%{<frameset><frame src="#{frame}" /></frameset>} +
|
428
|
+
%{<iframe src="#{iframe}" />} +
|
429
|
+
%{</body>} +
|
430
|
+
%{</html>}
|
431
|
+
end
|
432
|
+
|
433
|
+
it "should return an Array of absolute URIs" do
|
434
|
+
expect(subject.urls).to be == [
|
435
|
+
URI("http://#{host}#{link}"),
|
436
|
+
URI("http://#{host}#{frame}"),
|
437
|
+
URI("http://#{host}#{iframe}"),
|
438
|
+
URI("http://#{host}#{stylesheet}"),
|
439
|
+
URI("http://#{host}#{javascript}")
|
440
|
+
]
|
441
|
+
end
|
442
|
+
end
|
443
|
+
|
444
|
+
context "when the page contains no links" do
|
445
|
+
it { expect(subject.urls).to be == [] }
|
446
|
+
end
|
447
|
+
end
|
448
|
+
|
449
|
+
describe "#to_absolute" do
  context "when given a relative path" do
    let(:path) { '/foo/' }
    let(:url)  { URI("http://#{host}#{path}") }

    let(:relative_path) { 'bar' }

    # the subject becomes the converted URI rather than the page
    subject { super().to_absolute(relative_path) }

    it "should merge it with the page's URI" do
      expect(subject).to be == URI("http://#{host}#{path}#{relative_path}")
    end

    context "when given a relative path with directory traversal" do
      let(:expanded_path) { '/bar' }
      let(:relative_path) { "../../.././../#{expanded_path}" }

      it "should expand the relative path before merging it" do
        expect(subject).to be == URI("http://#{host}#{expanded_path}")
      end
    end
  end

  context "when given an absolute path" do
    let(:path) { '/foo/' }
    let(:url)  { URI("http://#{host}#{path}") }

    let(:absolute_path) { '/bar/' }

    subject { super().to_absolute(absolute_path) }

    it "should override the page URI's path" do
      expect(subject).to be == URI("http://#{host}#{absolute_path}")
    end

    context "when given an absolute path with directory traversal" do
      let(:expanded_path) { '/bar/' }
      let(:absolute_path) { "/../../.././../#{expanded_path}" }

      it "should expand the absolute path before merging it" do
        expect(subject).to be == URI("http://#{host}#{expanded_path}")
      end
    end
  end

  context "when given a remote link" do
    let(:remote_host) { 'foo.example.com' }
    let(:remote_path) { '/bar' }
    let(:link)        { "http://#{remote_host}#{remote_path}" }

    subject { super().to_absolute(link) }

    it "should override the page's URI" do
      expect(subject).to be == URI(link)
    end

    context "when the remote link contains directory traversal" do
      let(:expanded_path) { '/bar' }
      let(:remote_path)   { "/../.././../../#{expanded_path}" }

      it "should expand the remote link's path" do
        expect(subject).to be == URI("http://#{remote_host}#{expanded_path}")
      end
    end

    context "when the remote link is ftp://" do
      let(:remote_path) { "/pub" }
      let(:link)        { "ftp://#{remote_host}#{remote_path}" }

      it "should preserve the leading '/' of the path" do
        expect(subject.path).to be == remote_path
      end
    end
  end
end
|
524
|
+
end
|