spidr 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
data/spec/example_app.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'sinatra/base'
|
3
|
+
require 'webmock/rspec'
|
4
|
+
|
5
|
+
require 'spidr/agent'
|
6
|
+
|
7
|
+
RSpec.shared_context "example App" do
|
8
|
+
let(:host) { 'example.com' }
|
9
|
+
|
10
|
+
subject { Agent.new(host: host) }
|
11
|
+
|
12
|
+
def self.app(&block)
|
13
|
+
let(:app) do
|
14
|
+
klass = Class.new(Sinatra::Base)
|
15
|
+
klass.set :host, host
|
16
|
+
klass.set :port, 80
|
17
|
+
klass.class_eval(&block)
|
18
|
+
return klass
|
19
|
+
end
|
20
|
+
|
21
|
+
before do
|
22
|
+
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
|
23
|
+
|
24
|
+
subject.start_at("http://#{host}/")
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/spec/example_page.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
|
3
|
+
RSpec.shared_context "example Page" do
|
4
|
+
let(:code) { 200 }
|
5
|
+
let(:msg) { 'OK' }
|
6
|
+
let(:content_type) { 'text/html' }
|
7
|
+
let(:headers) { {} }
|
8
|
+
let(:body) { '' }
|
9
|
+
|
10
|
+
let(:response) do
|
11
|
+
Net::HTTPResponse.new('1.1', code.to_s, msg).tap do |response|
|
12
|
+
response.set_content_type(content_type) if content_type
|
13
|
+
|
14
|
+
headers.each do |name,values|
|
15
|
+
if values
|
16
|
+
Array(values).each do |value|
|
17
|
+
response.add_field(name,value)
|
18
|
+
end
|
19
|
+
else
|
20
|
+
response.remove_field(name)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# stub #body, otherwise Net::HTTP will check @socket
|
25
|
+
allow(response).to receive(:body).and_return(body)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
let(:host) { 'example.com' }
|
30
|
+
let(:url) { URI::HTTP.build(host: host) }
|
31
|
+
|
32
|
+
subject { described_class.new(url,response) }
|
33
|
+
end
|
data/spec/page/content_types_spec.rb
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'example_page'
|
3
|
+
|
4
|
+
require 'spidr/page'
|
5
|
+
|
6
|
+
describe Page do
|
7
|
+
include_context "example Page"
|
8
|
+
|
9
|
+
describe "#content_type" do
|
10
|
+
it "should return the Content-Type as a String" do
|
11
|
+
expect(subject.content_type).to be == content_type
|
12
|
+
end
|
13
|
+
|
14
|
+
context "when content_type is missing" do
|
15
|
+
let(:content_type) { nil }
|
16
|
+
|
17
|
+
it "should return an empty String" do
|
18
|
+
expect(subject.content_type).to be == ''
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe "#content_types" do
|
24
|
+
it "should return the Content-Type as an Array" do
|
25
|
+
expect(subject.content_types).to be == [content_type]
|
26
|
+
end
|
27
|
+
|
28
|
+
context "when content_type is missing" do
|
29
|
+
let(:content_type) { nil }
|
30
|
+
|
31
|
+
it "should return an empty Array" do
|
32
|
+
expect(subject.content_types).to be == []
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe "#content_charset" do
|
38
|
+
let(:charset) { 'utf8' }
|
39
|
+
let(:content_type) { "text/html;charset=#{charset}" }
|
40
|
+
|
41
|
+
it "should extract the 'charset=' param" do
|
42
|
+
expect(subject.content_charset).to be == charset
|
43
|
+
end
|
44
|
+
|
45
|
+
context "when there is no 'charset='" do
|
46
|
+
let(:content_type) { 'text/html' }
|
47
|
+
|
48
|
+
it { expect(subject.content_charset).to be nil }
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
describe "#is_content_type?" do
|
53
|
+
let(:charset) { 'utf8' }
|
54
|
+
let(:sub_type) { 'html' }
|
55
|
+
let(:mime_type) { "text/#{sub_type}" }
|
56
|
+
let(:content_type) { "#{mime_type};charset=#{charset}" }
|
57
|
+
|
58
|
+
context "when given a full mime-type" do
|
59
|
+
context "and it matches the Content-Type's mime-type" do
|
60
|
+
it { expect(subject.is_content_type?(mime_type)).to be true }
|
61
|
+
end
|
62
|
+
|
63
|
+
context "but it doesn't match the Content-Type's mime-type" do
|
64
|
+
it { expect(subject.is_content_type?('text/plain')).to be false }
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
context "when given a sub-type" do
|
69
|
+
context "and it matches the Content-Type's sub-type" do
|
70
|
+
it { expect(subject.is_content_type?(sub_type)).to be true }
|
71
|
+
end
|
72
|
+
|
73
|
+
context "but it doesn't match the Content-Type's sub-type" do
|
74
|
+
it { expect(subject.is_content_type?('plain')).to be false }
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
shared_examples "Content-Type method" do |method,*content_types|
|
80
|
+
content_types.each do |content_type|
|
81
|
+
context "when Content-Type includes #{content_type}" do
|
82
|
+
let(:content_type) { content_type }
|
83
|
+
|
84
|
+
it { expect(subject.send(method)).to be true }
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
context "when Content-Type does not include #{content_types.join(', ')}" do
|
89
|
+
let(:content_type) { 'unknown/unknown' }
|
90
|
+
|
91
|
+
it { expect(subject.send(method)).to be false }
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
describe "#plain_text?" do
|
96
|
+
include_examples "Content-Type method", :plain_text?, 'text/plain'
|
97
|
+
end
|
98
|
+
|
99
|
+
describe "#directory?" do
|
100
|
+
include_examples "Content-Type method", :directory?, 'text/directory'
|
101
|
+
end
|
102
|
+
|
103
|
+
describe "#directory?" do
|
104
|
+
include_examples "Content-Type method", :html?, 'text/html'
|
105
|
+
end
|
106
|
+
|
107
|
+
describe "#html?" do
|
108
|
+
include_examples "Content-Type method", :html?, 'text/html'
|
109
|
+
end
|
110
|
+
|
111
|
+
describe "#xml?" do
|
112
|
+
include_examples "Content-Type method", :xml?, 'text/xml', 'application/xml'
|
113
|
+
end
|
114
|
+
|
115
|
+
describe "#xsl?" do
|
116
|
+
include_examples "Content-Type method", :xsl?, 'text/xsl'
|
117
|
+
end
|
118
|
+
|
119
|
+
describe "#javascript?" do
|
120
|
+
include_examples "Content-Type method", :javascript?, 'text/javascript', 'application/javascript'
|
121
|
+
end
|
122
|
+
|
123
|
+
describe "#json?" do
|
124
|
+
include_examples "Content-Type method", :json?, 'application/json'
|
125
|
+
end
|
126
|
+
|
127
|
+
describe "#css?" do
|
128
|
+
include_examples "Content-Type method", :css?, 'text/css'
|
129
|
+
end
|
130
|
+
|
131
|
+
describe "#rss?" do
|
132
|
+
include_examples "Content-Type method", :rss?, 'application/rss+xml', 'application/rdf+xml'
|
133
|
+
end
|
134
|
+
|
135
|
+
describe "#atom?" do
|
136
|
+
include_examples "Content-Type method", :atom?, 'application/atom+xml'
|
137
|
+
end
|
138
|
+
|
139
|
+
describe "#ms_word?" do
|
140
|
+
include_examples "Content-Type method", :ms_word?, 'application/msword'
|
141
|
+
end
|
142
|
+
|
143
|
+
describe "#pdf?" do
|
144
|
+
include_examples "Content-Type method", :pdf?, 'application/pdf'
|
145
|
+
end
|
146
|
+
|
147
|
+
describe "#zip?" do
|
148
|
+
include_examples "Content-Type method", :zip?, 'application/zip'
|
149
|
+
end
|
150
|
+
end
|
data/spec/page/cookies_spec.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'example_page'
|
3
|
+
|
4
|
+
require 'spidr/page'
|
5
|
+
|
6
|
+
describe Page do
|
7
|
+
include_context "example Page"
|
8
|
+
|
9
|
+
let(:name) { 'foo' }
|
10
|
+
let(:value) { 'bar' }
|
11
|
+
let(:path) { '/' }
|
12
|
+
let(:cookie) { "#{name}=#{value}; Path=#{path}; Domain=#{host}; Secure; HTTPOnly" }
|
13
|
+
let(:headers) do
|
14
|
+
{'Set-Cookie' => cookie}
|
15
|
+
end
|
16
|
+
|
17
|
+
describe "#cookie" do
|
18
|
+
it "should return the Set-Cookie header as a String" do
|
19
|
+
expect(subject.cookie).to be == cookie
|
20
|
+
end
|
21
|
+
|
22
|
+
context "when Set-Cookie is not set" do
|
23
|
+
let(:headers) { {} }
|
24
|
+
|
25
|
+
it "should return an empty String" do
|
26
|
+
expect(subject.cookie).to be == ''
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe "#cookies" do
|
32
|
+
it "should return the Set-Cookie header as an Array" do
|
33
|
+
expect(subject.cookies).to be == [cookie]
|
34
|
+
end
|
35
|
+
|
36
|
+
context "when Set-Cookie is not set" do
|
37
|
+
let(:headers) { {} }
|
38
|
+
|
39
|
+
it "should return an empty Array" do
|
40
|
+
expect(subject.cookies).to be == []
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
describe "#cookie_params" do
|
46
|
+
it "should parse the cookie params into a Hash" do
|
47
|
+
expect(subject.cookie_params).to be == {name => value}
|
48
|
+
end
|
49
|
+
|
50
|
+
context "when the cookie has no value" do
|
51
|
+
let(:value) { '' }
|
52
|
+
|
53
|
+
it "should default the value to an empty String" do
|
54
|
+
expect(subject.cookie_params[name]).to be == ''
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
data/spec/page/html_spec.rb
ADDED
@@ -0,0 +1,524 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'example_page'
|
3
|
+
|
4
|
+
require 'spidr/page'
|
5
|
+
|
6
|
+
describe Page do
|
7
|
+
include_context "example Page"
|
8
|
+
|
9
|
+
let(:body) { %{<html><head><title>example</title></head><body><p>hello</p></body></html>} }
|
10
|
+
|
11
|
+
describe "#title" do
|
12
|
+
context "when there is a title" do
|
13
|
+
it "should return the title inner_text" do
|
14
|
+
expect(subject.title).to be == 'example'
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
context "when there is no title" do
|
19
|
+
let(:body) { %{<html><head></head><body><p>hello</p></body></html>} }
|
20
|
+
|
21
|
+
it "should return nil" do
|
22
|
+
expect(subject.title).to be nil
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
describe "#each_meta_redirect" do
|
28
|
+
context "when the Content-Type is text/html" do
|
29
|
+
let(:content_type) { 'text/html' }
|
30
|
+
|
31
|
+
context "and the HTML is valid" do
|
32
|
+
let(:link) { '/link' }
|
33
|
+
let(:refresh) { 'refresh' }
|
34
|
+
let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /></head><body>Redirecting...</body></html>} }
|
35
|
+
|
36
|
+
it "should yield each meta http-equiv='refresh' URL" do
|
37
|
+
expect { |b|
|
38
|
+
subject.each_meta_redirect(&b)
|
39
|
+
}.to yield_successive_args(link)
|
40
|
+
end
|
41
|
+
|
42
|
+
context "but when http-equiv is REFRESH" do
|
43
|
+
let(:refresh) { 'REFRESH' }
|
44
|
+
|
45
|
+
it "should ignore the case of refresh" do
|
46
|
+
expect { |b|
|
47
|
+
subject.each_meta_redirect(&b)
|
48
|
+
}.to yield_successive_args(link)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
context "but the http-equiv attribute is missing" do
|
53
|
+
let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta content="4; url=#{link}2" /></head><body>Redirecting...</body></html>} }
|
54
|
+
|
55
|
+
it "should ignore those meta tags" do
|
56
|
+
expect { |b|
|
57
|
+
subject.each_meta_redirect(&b)
|
58
|
+
}.to yield_successive_args(link)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
context "but http-equiv is not refresh" do
|
63
|
+
let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="content-type" content="#{content_type}" /></head><body></body></html>} }
|
64
|
+
|
65
|
+
it "should ignore those meta tags" do
|
66
|
+
expect { |b|
|
67
|
+
subject.each_meta_redirect(&b)
|
68
|
+
}.to yield_successive_args(link)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
context "but the content attribute is missing" do
|
73
|
+
let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="#{refresh}" /></head><body>Redirecting...</body></html>} }
|
74
|
+
|
75
|
+
it "should ignore those meta tags" do
|
76
|
+
expect { |b|
|
77
|
+
subject.each_meta_redirect(&b)
|
78
|
+
}.to yield_successive_args(link)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
context "but the content attribute does not contain url=..." do
|
83
|
+
let(:body) { %{<html><head><meta http-equiv="#{refresh}" content="4; url=#{link}" /><meta http-equiv="#{refresh}" content="0" /></head><body>Redirecting...</body></html>} }
|
84
|
+
|
85
|
+
it "should ignore those meta tags" do
|
86
|
+
expect { |b|
|
87
|
+
subject.each_meta_redirect(&b)
|
88
|
+
}.to yield_successive_args(link)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
context "but the HTML cannot be parsed" do
|
94
|
+
let(:body) { "<html></" }
|
95
|
+
|
96
|
+
it "should yield nothing" do
|
97
|
+
expect { |b| subject.each_meta_redirect(&b) }.not_to yield_control
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
context "when the Content-Type is not text/html" do
|
103
|
+
let(:content_type) { 'text/xml' }
|
104
|
+
|
105
|
+
it "should yield nothing" do
|
106
|
+
expect { |b| subject.each_meta_redirect(&b) }.not_to yield_control
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
context "when not given a block" do
|
111
|
+
it "should return an Enumerator" do
|
112
|
+
expect(subject.each_meta_redirect).to be_kind_of(Enumerator)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
describe "#meta_redirect?" do
|
118
|
+
context "when there are meta refresh redirects" do
|
119
|
+
let(:body) { %{<html><head><meta http-equiv="refresh" content="4; url=/link" /></head><body>Redirecting...</body></html>} }
|
120
|
+
|
121
|
+
it { expect(subject.meta_redirect?).to be true }
|
122
|
+
end
|
123
|
+
|
124
|
+
context "when there are no meta refresh redirects" do
|
125
|
+
let(:body) { %{<html><head><meta http-equiv="content-type" content="text/html" /></head><body>Redirecting...</body></html>} }
|
126
|
+
|
127
|
+
it { expect(subject.meta_redirect?).to be false }
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
describe "#meta_redirects" do
|
132
|
+
context "when there are meta refresh redirects" do
|
133
|
+
let(:link1) { "/link1" }
|
134
|
+
let(:link2) { "/link2" }
|
135
|
+
let(:body) { %{<html><head><meta http-equiv="refresh" content="4; url=#{link1}" /><meta http-equiv="refresh" content="1; url=#{link2}" /></head><body>Redirecting...</body></html>} }
|
136
|
+
|
137
|
+
it "should return each meta refresh redirect URL" do
|
138
|
+
expect(subject.meta_redirects).to be == [link1, link2]
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
context "when there are no meta refresh redirects" do
|
143
|
+
let(:body) { %{<html><head><meta http-equiv="content-type" content="text/html" /></head><body>Redirecting...</body></html>} }
|
144
|
+
|
145
|
+
it { expect(subject.meta_redirects).to be == [] }
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
describe "#each_redirect" do
|
150
|
+
context "when the Location header is set" do
|
151
|
+
let(:link) { "http://#{host}/link" }
|
152
|
+
let(:headers) { {'Location' => link} }
|
153
|
+
|
154
|
+
it "should yield the Location header" do
|
155
|
+
expect { |b|
|
156
|
+
subject.each_redirect(&b)
|
157
|
+
}.to yield_successive_args(link)
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
context "when there are multiple Location headers" do
|
162
|
+
let(:link1) { "http://#{host}/link1" }
|
163
|
+
let(:link2) { "http://#{host}/link2" }
|
164
|
+
let(:headers) { {'Location' => [link1, link2]} }
|
165
|
+
|
166
|
+
it "should yield each Location header value" do
|
167
|
+
expect { |b|
|
168
|
+
subject.each_redirect(&b)
|
169
|
+
}.to yield_successive_args(link1, link2)
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
context "when there is no Location header set" do
|
174
|
+
context "but there are meta refresh redirects" do
|
175
|
+
let(:link1) { "/link1" }
|
176
|
+
let(:link2) { "/link2" }
|
177
|
+
let(:body) { %{<html><head><meta http-equiv="refresh" content="4; url=#{link1}" /><meta http-equiv="refresh" content="1; url=#{link2}" /></head><body>Redirecting...</body></html>} }
|
178
|
+
|
179
|
+
it "should yield each meta refresh redirect URL" do
|
180
|
+
expect { |b|
|
181
|
+
subject.each_redirect(&b)
|
182
|
+
}.to yield_successive_args(link1, link2)
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
context "and there are no meta refresh redirects" do
|
187
|
+
it do
|
188
|
+
expect { |b|
|
189
|
+
subject.each_redirect(&b)
|
190
|
+
}.not_to yield_control
|
191
|
+
end
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
context "when not given a block" do
|
196
|
+
it "should return an Enumerator" do
|
197
|
+
expect(subject.each_redirect).to be_kind_of(Enumerator)
|
198
|
+
end
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
context "#redirects_to" do
|
203
|
+
context "when there are redirects" do
|
204
|
+
let(:link1) { "http://#{host}/link1" }
|
205
|
+
let(:link2) { "http://#{host}/link2" }
|
206
|
+
let(:headers) { {'Location' => [link1, link2]} }
|
207
|
+
|
208
|
+
it "should return the redirects as an Array" do
|
209
|
+
expect(subject.redirects_to).to be == [link1, link2]
|
210
|
+
end
|
211
|
+
end
|
212
|
+
|
213
|
+
context "when there are no redirects" do
|
214
|
+
it { expect(subject.redirects_to).to be == [] }
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
218
|
+
describe "#each_mailto" do
|
219
|
+
context "when the Content-Type is text/html" do
|
220
|
+
let(:content_type) { 'text/html' }
|
221
|
+
|
222
|
+
context "and the HTML is valid" do
|
223
|
+
let(:email1) { "bob@example.com" }
|
224
|
+
let(:email2) { "jim@example.com" }
|
225
|
+
let(:body) { %{<html><body><a href="mailto:#{email1}">email1</a> <a href="/link">link</a> <a href="mailto:#{email2}">email2</a></body></html>} }
|
226
|
+
|
227
|
+
it "should yield each a link where the href starts with 'mailto:'" do
|
228
|
+
expect { |b|
|
229
|
+
subject.each_mailto(&b)
|
230
|
+
}.to yield_successive_args(email1, email2)
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
context "but the HTML is not valid" do
|
235
|
+
let(:body) { "<html" }
|
236
|
+
|
237
|
+
it "should yield nothing" do
|
238
|
+
expect { |b|
|
239
|
+
subject.each_mailto(&b)
|
240
|
+
}.not_to yield_control
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
context "when the Content-Type is not text/html" do
|
246
|
+
let(:content_type) { 'text/plain' }
|
247
|
+
|
248
|
+
it "should yield nothing" do
|
249
|
+
expect { |b|
|
250
|
+
subject.each_mailto(&b)
|
251
|
+
}.not_to yield_control
|
252
|
+
end
|
253
|
+
end
|
254
|
+
end
|
255
|
+
|
256
|
+
describe "#mailtos" do
|
257
|
+
context "when there are 'mailto:' links" do
|
258
|
+
let(:email1) { "bob@example.com" }
|
259
|
+
let(:email2) { "jim@example.com" }
|
260
|
+
let(:body) { %{<html><body><a href="mailto:#{email1}">email1</a> <a href="/link">link</a> <a href="mailto:#{email2}">email2</a></body></html>} }
|
261
|
+
|
262
|
+
it "should return all 'mailto:' links" do
|
263
|
+
expect(subject.mailtos).to be == [email1, email2]
|
264
|
+
end
|
265
|
+
end
|
266
|
+
|
267
|
+
context "when there are no 'mailto:' links" do
|
268
|
+
it { expect(subject.mailtos).to be == [] }
|
269
|
+
end
|
270
|
+
end
|
271
|
+
|
272
|
+
describe "#each_link" do
|
273
|
+
context "when the page contains a links" do
|
274
|
+
let(:link1) { '/link1' }
|
275
|
+
let(:link2) { '/link2' }
|
276
|
+
let(:body) { %{<html><body><a href="#{link1}">link1</a> <a href="#{link2}">link2</a></body></html>} }
|
277
|
+
|
278
|
+
it "should yield each a/@href value" do
|
279
|
+
expect { |b|
|
280
|
+
subject.each_link(&b)
|
281
|
+
}.to yield_successive_args(link1, link2)
|
282
|
+
end
|
283
|
+
end
|
284
|
+
|
285
|
+
context "when the page contains frames" do
|
286
|
+
let(:frame1) { '/frame1' }
|
287
|
+
let(:frame2) { '/frame2' }
|
288
|
+
let(:body) { %{<html><body><frameset><frame src="#{frame1}" /><frame src="#{frame2}" /></frameset></body></html>} }
|
289
|
+
|
290
|
+
it "should yield each frame/@src value" do
|
291
|
+
expect { |b|
|
292
|
+
subject.each_link(&b)
|
293
|
+
}.to yield_successive_args(frame1, frame2)
|
294
|
+
end
|
295
|
+
end
|
296
|
+
|
297
|
+
context "when the page contains iframes" do
|
298
|
+
let(:iframe1) { '/iframe1' }
|
299
|
+
let(:iframe2) { '/iframe2' }
|
300
|
+
let(:body) { %{<html><body><iframe src="#{iframe1}" /><iframe src="#{iframe2}" /></body></html>} }
|
301
|
+
|
302
|
+
it "should yield each iframe/@src value" do
|
303
|
+
expect { |b|
|
304
|
+
subject.each_link(&b)
|
305
|
+
}.to yield_successive_args(iframe1, iframe2)
|
306
|
+
end
|
307
|
+
end
|
308
|
+
|
309
|
+
context "when the page contains remote stylesheets" do
|
310
|
+
let(:stylesheet1) { '/stylesheet1.css' }
|
311
|
+
let(:stylesheet2) { '/stylesheet2.css' }
|
312
|
+
let(:body) { %{<html><head><link rel="stylesheet" type="text/css" href="#{stylesheet1}" /><link rel="stylesheet" type="text/css" href="#{stylesheet2}" /><body><p>hello</p></body></html>} }
|
313
|
+
|
314
|
+
it "should yield each link/@href value" do
|
315
|
+
expect { |b|
|
316
|
+
subject.each_link(&b)
|
317
|
+
}.to yield_successive_args(stylesheet1, stylesheet2)
|
318
|
+
end
|
319
|
+
end
|
320
|
+
|
321
|
+
context "when the page contains remote javascript" do
|
322
|
+
let(:javascript1) { '/script1.js' }
|
323
|
+
let(:javascript2) { '/script2.js' }
|
324
|
+
let(:body) { %{<html><head><script type="text/javascript" src="#{javascript1}"></script><script type="text/javascript" src="#{javascript2}"></script><body><p>hello</p></body></html>} }
|
325
|
+
|
326
|
+
it "should yield each script/@src value" do
|
327
|
+
expect { |b|
|
328
|
+
subject.each_link(&b)
|
329
|
+
}.to yield_successive_args(javascript1, javascript2)
|
330
|
+
end
|
331
|
+
end
|
332
|
+
end
|
333
|
+
|
334
|
+
describe "#links" do
|
335
|
+
context "when the page contains links" do
|
336
|
+
let(:link) { '/link' }
|
337
|
+
let(:frame) { '/frame' }
|
338
|
+
let(:iframe) { '/iframe' }
|
339
|
+
let(:stylesheet) { '/stylesheet.css' }
|
340
|
+
let(:javascript) { '/script.js' }
|
341
|
+
let(:body) do
|
342
|
+
%{<html>} +
|
343
|
+
%{<head>} +
|
344
|
+
%{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
|
345
|
+
%{<script type="text/javascript" src="#{javascript}"></script>} +
|
346
|
+
%{</head>} +
|
347
|
+
%{<body>} +
|
348
|
+
%{<a href="#{link}">link</a>} +
|
349
|
+
%{<frameset><frame src="#{frame}" /></frameset>} +
|
350
|
+
%{<iframe src="#{iframe}" />} +
|
351
|
+
%{</body>} +
|
352
|
+
%{</html>}
|
353
|
+
end
|
354
|
+
|
355
|
+
it "should return an Array of links" do
|
356
|
+
expect(subject.links).to be == [
|
357
|
+
link,
|
358
|
+
frame,
|
359
|
+
iframe,
|
360
|
+
stylesheet,
|
361
|
+
javascript
|
362
|
+
]
|
363
|
+
end
|
364
|
+
end
|
365
|
+
|
366
|
+
context "when the page does not contain any links" do
|
367
|
+
it { expect(subject.links).to be == [] }
|
368
|
+
end
|
369
|
+
end
|
370
|
+
|
371
|
+
describe "#each_url" do
|
372
|
+
context "when the page contains links" do
|
373
|
+
let(:link) { '/link' }
|
374
|
+
let(:frame) { '/frame' }
|
375
|
+
let(:iframe) { '/iframe' }
|
376
|
+
let(:stylesheet) { '/stylesheet.css' }
|
377
|
+
let(:javascript) { '/script.js' }
|
378
|
+
let(:body) do
|
379
|
+
%{<html>} +
|
380
|
+
%{<head>} +
|
381
|
+
%{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
|
382
|
+
%{<script type="text/javascript" src="#{javascript}"></script>} +
|
383
|
+
%{</head>} +
|
384
|
+
%{<body>} +
|
385
|
+
%{<a href="#{link}">link</a>} +
|
386
|
+
%{<frameset><frame src="#{frame}" /></frameset>} +
|
387
|
+
%{<iframe src="#{iframe}" />} +
|
388
|
+
%{</body>} +
|
389
|
+
%{</html>}
|
390
|
+
end
|
391
|
+
|
392
|
+
it "should return an Array of absolute URIs" do
|
393
|
+
expect { |b| subject.each_url(&b) }.to yield_successive_args(
|
394
|
+
URI("http://#{host}#{link}"),
|
395
|
+
URI("http://#{host}#{frame}"),
|
396
|
+
URI("http://#{host}#{iframe}"),
|
397
|
+
URI("http://#{host}#{stylesheet}"),
|
398
|
+
URI("http://#{host}#{javascript}")
|
399
|
+
)
|
400
|
+
end
|
401
|
+
end
|
402
|
+
|
403
|
+
context "when the page contains no links" do
|
404
|
+
it do
|
405
|
+
expect { |b|
|
406
|
+
subject.each_url(&b)
|
407
|
+
}.not_to yield_control
|
408
|
+
end
|
409
|
+
end
|
410
|
+
end
|
411
|
+
|
412
|
+
describe "#urls" do
|
413
|
+
context "when the page contains links" do
|
414
|
+
let(:link) { '/link' }
|
415
|
+
let(:frame) { '/frame' }
|
416
|
+
let(:iframe) { '/iframe' }
|
417
|
+
let(:stylesheet) { '/stylesheet.css' }
|
418
|
+
let(:javascript) { '/script.js' }
|
419
|
+
let(:body) do
|
420
|
+
%{<html>} +
|
421
|
+
%{<head>} +
|
422
|
+
%{<link rel="stylesheet" type="text/css" href="#{stylesheet}" />} +
|
423
|
+
%{<script type="text/javascript" src="#{javascript}"></script>} +
|
424
|
+
%{</head>} +
|
425
|
+
%{<body>} +
|
426
|
+
%{<a href="#{link}">link</a>} +
|
427
|
+
%{<frameset><frame src="#{frame}" /></frameset>} +
|
428
|
+
%{<iframe src="#{iframe}" />} +
|
429
|
+
%{</body>} +
|
430
|
+
%{</html>}
|
431
|
+
end
|
432
|
+
|
433
|
+
it "should return an Array of absolute URIs" do
|
434
|
+
expect(subject.urls).to be == [
|
435
|
+
URI("http://#{host}#{link}"),
|
436
|
+
URI("http://#{host}#{frame}"),
|
437
|
+
URI("http://#{host}#{iframe}"),
|
438
|
+
URI("http://#{host}#{stylesheet}"),
|
439
|
+
URI("http://#{host}#{javascript}")
|
440
|
+
]
|
441
|
+
end
|
442
|
+
end
|
443
|
+
|
444
|
+
context "when the page contains no links" do
|
445
|
+
it { expect(subject.urls).to be == [] }
|
446
|
+
end
|
447
|
+
end
|
448
|
+
|
449
|
+
describe "#to_absolute" do
|
450
|
+
context "when given an relative path" do
|
451
|
+
let(:path) { '/foo/' }
|
452
|
+
let(:url) { URI("http://#{host}#{path}") }
|
453
|
+
|
454
|
+
let(:relative_path) { 'bar' }
|
455
|
+
|
456
|
+
subject { super().to_absolute(relative_path) }
|
457
|
+
|
458
|
+
it "should merge it with the page's URI" do
|
459
|
+
expect(subject).to be == URI("http://#{host}#{path}#{relative_path}")
|
460
|
+
end
|
461
|
+
|
462
|
+
context "when given a relative path with directory traversal" do
|
463
|
+
let(:expanded_path) { '/bar' }
|
464
|
+
let(:relative_path) { "../../.././../#{expanded_path}" }
|
465
|
+
|
466
|
+
it "should expand the relative path before merging it" do
|
467
|
+
expect(subject).to be == URI("http://#{host}#{expanded_path}")
|
468
|
+
end
|
469
|
+
end
|
470
|
+
end
|
471
|
+
|
472
|
+
context "when given an absolute path" do
|
473
|
+
let(:path) { '/foo/' }
|
474
|
+
let(:url) { URI("http://#{host}#{path}") }
|
475
|
+
|
476
|
+
let(:absolute_path) { '/bar/' }
|
477
|
+
|
478
|
+
subject { super().to_absolute(absolute_path) }
|
479
|
+
|
480
|
+
it "should override the page URI's path" do
|
481
|
+
expect(subject).to be == URI("http://#{host}#{absolute_path}")
|
482
|
+
end
|
483
|
+
|
484
|
+
context "when given an absolute path with directory traversal" do
|
485
|
+
let(:expanded_path) { '/bar/' }
|
486
|
+
let(:absolute_path) { "/../../.././../#{expanded_path}" }
|
487
|
+
|
488
|
+
it "should expand the absolute path before merging it" do
|
489
|
+
expect(subject).to be == URI("http://#{host}#{expanded_path}")
|
490
|
+
end
|
491
|
+
end
|
492
|
+
end
|
493
|
+
|
494
|
+
context "when given a remote link" do
|
495
|
+
let(:remote_host) { 'foo.example.com' }
|
496
|
+
let(:remote_path) { '/bar' }
|
497
|
+
let(:link) { "http://#{remote_host}#{remote_path}" }
|
498
|
+
|
499
|
+
subject { super().to_absolute(link) }
|
500
|
+
|
501
|
+
it "should override the page's URI" do
|
502
|
+
expect(subject).to be == URI(link)
|
503
|
+
end
|
504
|
+
|
505
|
+
context "when the remote link contains directory traversal" do
|
506
|
+
let(:expanded_path) { '/bar' }
|
507
|
+
let(:remote_path) { "/../.././../../#{expanded_path}" }
|
508
|
+
|
509
|
+
it "should expand the remote link's path" do
|
510
|
+
expect(subject).to be == URI("http://#{remote_host}#{expanded_path}")
|
511
|
+
end
|
512
|
+
end
|
513
|
+
|
514
|
+
context "when the remote link ftp://" do
|
515
|
+
let(:remote_path) { "/pub" }
|
516
|
+
let(:link) { "ftp://#{remote_host}#{remote_path}" }
|
517
|
+
|
518
|
+
it "should preserve the leading '/' of the path" do
|
519
|
+
expect(subject.path).to be == remote_path
|
520
|
+
end
|
521
|
+
end
|
522
|
+
end
|
523
|
+
end
|
524
|
+
end
|