spidr 0.6.0 → 0.7.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/spec/agent_spec.rb CHANGED
@@ -7,6 +7,250 @@ require 'spidr/agent'
7
7
  describe Agent do
8
8
  it_should_behave_like "includes Spidr::Settings::UserAgent"
9
9
 
10
+ describe ".start_at" do
11
+ module TestAgentStartAt
12
+ class ExampleApp < Sinatra::Base
13
+
14
+ set :host, 'example.com'
15
+ set :port, 80
16
+
17
+ get '/' do
18
+ '<html><body>should not get here</body></html>'
19
+ end
20
+
21
+ get '/entry-point' do
22
+ <<~HTML
23
+ <html>
24
+ <body>
25
+ <a href="/link1">link1</a>
26
+ <a href="http://other.com/offsite-link">offsite link</a>
27
+ <a href="/link2">link2</a>
28
+ </body>
29
+ </html>
30
+ HTML
31
+ end
32
+
33
+ get '/link1' do
34
+ '<html><body>got here</body></html>'
35
+ end
36
+
37
+ get '/link2' do
38
+ '<html><body>got here</body></html>'
39
+ end
40
+ end
41
+
42
+ class OtherApp < Sinatra::Base
43
+
44
+ set :host, 'other.com'
45
+ set :port, 80
46
+
47
+ get '/offsite-link' do
48
+ '<html><body>should not get here</body></html>'
49
+ end
50
+
51
+ end
52
+ end
53
+
54
+ subject { described_class }
55
+
56
+ let(:host) { 'example.com' }
57
+ let(:other_host) { 'other.com' }
58
+ let(:url) { URI("http://#{host}/entry-point") }
59
+
60
+ let(:app) { TestAgentStartAt::ExampleApp }
61
+ let(:other_app) { TestAgentStartAt::OtherApp }
62
+
63
+ before do
64
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
65
+ stub_request(:any, /#{Regexp.escape(other_host)}/).to_rack(other_app)
66
+ end
67
+
68
+ it "must spider the website starting at the given URL" do
69
+ agent = subject.start_at(url)
70
+
71
+ expect(agent.history).to be == Set[
72
+ URI("http://#{host}/entry-point"),
73
+ URI("http://#{host}/link1"),
74
+ URI("http://#{other_host}/offsite-link"),
75
+ URI("http://#{host}/link2")
76
+ ]
77
+ end
78
+ end
79
+
80
+ describe ".site" do
81
+ module TestAgentSite
82
+ class ExampleApp < Sinatra::Base
83
+
84
+ set :host, 'example.com'
85
+ set :port, 80
86
+
87
+ get '/' do
88
+ '<html><body>should not get here</body></html>'
89
+ end
90
+
91
+ get '/entry-point' do
92
+ <<~HTML
93
+ <html>
94
+ <body>
95
+ <a href="/link1">link1</a>
96
+ <a href="http://other.com/offsite-link">offsite link</a>
97
+ <a href="/link2">link2</a>
98
+ </body>
99
+ </html>
100
+ HTML
101
+ end
102
+
103
+ get '/link1' do
104
+ '<html><body>got here</body></html>'
105
+ end
106
+
107
+ get '/link2' do
108
+ '<html><body>got here</body></html>'
109
+ end
110
+
111
+ end
112
+ end
113
+
114
+ subject { described_class }
115
+
116
+ let(:host) { 'example.com' }
117
+ let(:url) { URI("http://#{host}/entry-point") }
118
+
119
+ let(:app) { TestAgentSite::ExampleApp }
120
+
121
+ before do
122
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
123
+ end
124
+
125
+ it "must spider the website starting at the given URL" do
126
+ agent = subject.site(url)
127
+
128
+ expect(agent.history).to be == Set[
129
+ URI("http://#{host}/entry-point"),
130
+ URI("http://#{host}/link1"),
131
+ URI("http://#{host}/link2")
132
+ ]
133
+ end
134
+ end
135
+
136
+ describe ".host" do
137
+ module TestAgentHost
138
+ class ExampleApp < Sinatra::Base
139
+
140
+ set :host, 'example.com'
141
+ set :port, 80
142
+
143
+ get '/' do
144
+ <<~HTML
145
+ <html>
146
+ <body>
147
+ <a href="/link1">link1</a>
148
+ <a href="http://other.com/offsite-link">offsite link</a>
149
+ <a href="/link2">link2</a>
150
+ </body>
151
+ </html>
152
+ HTML
153
+ end
154
+
155
+ get '/link1' do
156
+ '<html><body>got here</body></html>'
157
+ end
158
+
159
+ get '/link2' do
160
+ '<html><body>got here</body></html>'
161
+ end
162
+
163
+ end
164
+ end
165
+
166
+ subject { described_class }
167
+
168
+ let(:host) { 'example.com' }
169
+ let(:app) { TestAgentHost::ExampleApp }
170
+
171
+ before do
172
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
173
+ end
174
+
175
+ it "must spider the website starting at the given URL" do
176
+ agent = subject.host(host)
177
+
178
+ # XXX: for some reason Set#== was returning false, so convert to an Array
179
+ expect(agent.history.to_a).to be == [
180
+ URI("http://#{host}/"),
181
+ URI("http://#{host}/link1"),
182
+ URI("http://#{host}/link2")
183
+ ]
184
+ end
185
+ end
186
+
187
+ describe ".domain" do
188
+ module TestAgentDomain
189
+ class ExampleApp < Sinatra::Base
190
+
191
+ set :host, 'example.com'
192
+ set :port, 80
193
+
194
+ get '/' do
195
+ <<~HTML
196
+ <html>
197
+ <body>
198
+ <a href="/link1">link1</a>
199
+ <a href="http://sub.example.com/subdomain-link">subdomain link</a>
200
+ <a href="/link2">link2</a>
201
+ </body>
202
+ </html>
203
+ HTML
204
+ end
205
+
206
+ get '/link1' do
207
+ '<html><body>got here</body></html>'
208
+ end
209
+
210
+ get '/link2' do
211
+ '<html><body>got here</body></html>'
212
+ end
213
+
214
+ end
215
+
216
+ class SubDomainApp < Sinatra::Base
217
+
218
+ set :host, 'sub.example.com'
219
+ set :port, 80
220
+
221
+ get '/subdomain-link' do
222
+ '<html><body>should get here</body></html>'
223
+ end
224
+
225
+ end
226
+ end
227
+
228
+ subject { described_class }
229
+
230
+ let(:domain) { 'example.com' }
231
+ let(:domain_app) { TestAgentDomain::ExampleApp }
232
+
233
+ let(:subdomain) { 'sub.example.com' }
234
+ let(:subdomain_app) { TestAgentDomain::SubDomainApp }
235
+
236
+ before do
237
+ stub_request(:any, /#{Regexp.escape(subdomain)}/).to_rack(subdomain_app)
238
+ stub_request(:any, /#{Regexp.escape(domain)}/).to_rack(domain_app)
239
+ end
240
+
241
+ it "must spider the domain and subdomains starting at the given domain" do
242
+ agent = subject.domain(domain)
243
+
244
+ # XXX: for some reason Set#== was returning false, so convert to an Array
245
+ expect(agent.history.to_a).to be == [
246
+ URI("http://#{domain}/"),
247
+ URI("http://#{domain}/link1"),
248
+ URI("http://#{subdomain}/subdomain-link"),
249
+ URI("http://#{domain}/link2")
250
+ ]
251
+ end
252
+ end
253
+
10
254
  describe "#initialize" do
11
255
  it "should not be running" do
12
256
  expect(subject).to_not be_running
@@ -32,6 +276,68 @@ describe Agent do
32
276
  expect(subject.sessions).to be_kind_of(SessionCache)
33
277
  end
34
278
 
279
+ context "when the proxy: keyword argument is given" do
280
+ let(:proxy) do
281
+ Spidr::Proxy.new(host: 'example.com')
282
+ end
283
+
284
+ subject { described_class.new(proxy: proxy) }
285
+
286
+ it "must initialize the #proxy of #session_cache" do
287
+ expect(subject.sessions.proxy).to be(proxy)
288
+ end
289
+ end
290
+
291
+ context "when the open_timeout: keyword argument is given" do
292
+ let(:open_timeout) { 5 }
293
+
294
+ subject { described_class.new(open_timeout: open_timeout) }
295
+
296
+ it "must initialize the #open_timeout of #session_cache" do
297
+ expect(subject.sessions.open_timeout).to eq(open_timeout)
298
+ end
299
+ end
300
+
301
+ context "when the ssl_timeout: keyword argument is given" do
302
+ let(:ssl_timeout) { 5 }
303
+
304
+ subject { described_class.new(ssl_timeout: ssl_timeout) }
305
+
306
+ it "must initialize the #ssl_timeout of #session_cache" do
307
+ expect(subject.sessions.ssl_timeout).to eq(ssl_timeout)
308
+ end
309
+ end
310
+
311
+ context "when the read_timeout: keyword argument is given" do
312
+ let(:read_timeout) { 5 }
313
+
314
+ subject { described_class.new(read_timeout: read_timeout) }
315
+
316
+ it "must initialize the #read_timeout of #session_cache" do
317
+ expect(subject.sessions.read_timeout).to eq(read_timeout)
318
+ end
319
+ end
320
+
321
+ context "when the continue_timeout: keyword argument is given" do
322
+ let(:continue_timeout) { 5 }
323
+
324
+ subject { described_class.new(continue_timeout: continue_timeout) }
325
+
326
+ it "must initialize the #continue_timeout of #session_cache" do
327
+ expect(subject.sessions.continue_timeout).to eq(continue_timeout)
328
+ end
329
+ end
330
+
331
+ context "when the keep_alive_timeout: keyword argument is given" do
332
+ let(:keep_alive_timeout) { 5 }
333
+
334
+ subject { described_class.new(keep_alive_timeout: keep_alive_timeout) }
335
+
336
+ it "must initialize the #keep_alive_timeout of #session_cache" do
337
+ expect(subject.sessions.keep_alive_timeout).to eq(keep_alive_timeout)
338
+ end
339
+ end
340
+
35
341
  it "should initialize the #cookie_jar" do
36
342
  expect(subject.cookies).to be_kind_of(CookieJar)
37
343
  end
@@ -386,7 +692,15 @@ describe Agent do
386
692
  context "frames" do
387
693
  app do
388
694
  get '/' do
389
- %{<html><body><frameset><frame src="/frame" /></frameset></body></html>}
695
+ <<~HTML
696
+ <html>
697
+ <body>
698
+ <frameset>
699
+ <frame src="/frame" />
700
+ </frameset>
701
+ </body>
702
+ </html>
703
+ HTML
390
704
  end
391
705
 
392
706
  get '/frame' do
@@ -614,7 +928,14 @@ describe Agent do
614
928
  end
615
929
 
616
930
  get '/redirect' do
617
- %{<html><head><meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" /></head><body>Redirecting...</body></html>}
931
+ <<~HTML
932
+ <html>
933
+ <head>
934
+ <meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" />
935
+ </head>
936
+ <body>Redirecting...</body>
937
+ </html>
938
+ HTML
618
939
  end
619
940
 
620
941
  get '/link' do
@@ -674,7 +995,14 @@ describe Agent do
674
995
 
675
996
  app do
676
997
  get '/' do
677
- %{<html><body><a href="http://google.com/">external link</a> <a href="/link">local link</a></body></html>}
998
+ <<~HTML
999
+ <html>
1000
+ <body>
1001
+ <a href="http://google.com/">external link</a>
1002
+ <a href="/link">local link</a>
1003
+ </body>
1004
+ </html>
1005
+ HTML
678
1006
  end
679
1007
 
680
1008
  get '/link' do
@@ -726,13 +1054,27 @@ describe Agent do
726
1054
 
727
1055
  app do
728
1056
  get '/' do
729
- %{<html><body><a href="/left?d=1">left</a><a href="/right?d=1">right</a></body></html>}
1057
+ <<~HTML
1058
+ <html>
1059
+ <body>
1060
+ <a href="/left?d=1">left</a>
1061
+ <a href="/right?d=1">right</a>
1062
+ </body>
1063
+ </html>
1064
+ HTML
730
1065
  end
731
1066
 
732
- get %r{^/left|/right} do
1067
+ get %r{/left|/right} do
733
1068
  d = Integer(params['d'])
734
1069
 
735
- %{<html><body><a href="/left?d=#{d+1}">left</a><a href="/right?d=#{d+1}">right</a></body></html>}
1070
+ <<~HTML
1071
+ <html>
1072
+ <body>
1073
+ <a href="/left?d=#{d+1}">left</a>
1074
+ <a href="/right?d=#{d+1}">right</a>
1075
+ </body>
1076
+ </html>
1077
+ HTML
736
1078
  end
737
1079
  end
738
1080
 
@@ -774,7 +1116,14 @@ describe Agent do
774
1116
 
775
1117
  app do
776
1118
  get '/' do
777
- %{<html><body><a href="/secret">don't follow this link</a> <a href="/pub">follow this link</a></body></html>}
1119
+ <<~HTML
1120
+ <html>
1121
+ <body>
1122
+ <a href="/secret">don't follow this link</a>
1123
+ <a href="/pub">follow this link</a>
1124
+ </body>
1125
+ </html>
1126
+ HTML
778
1127
  end
779
1128
 
780
1129
  get '/pub' do
@@ -786,14 +1135,12 @@ describe Agent do
786
1135
 
787
1136
  [
788
1137
  "User-agent: *",
789
- 'Disallow: /',
1138
+ 'Disallow: /secret',
790
1139
  ].join($/)
791
1140
  end
792
1141
  end
793
1142
 
794
1143
  it "should not follow links Disallowed by robots.txt" do
795
- pending "https://github.com/bblimke/webmock/issues/642"
796
-
797
1144
  expect(subject.history).to be == Set[
798
1145
  URI("http://#{host}/"),
799
1146
  URI("http://#{host}/pub")
data/spec/example_page.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require 'rspec'
2
+ require 'net/http'
3
+ require 'uri'
2
4
 
3
5
  RSpec.shared_context "example Page" do
4
6
  let(:code) { 200 }
@@ -147,4 +147,26 @@ describe Page do
147
147
  describe "#zip?" do
148
148
  include_examples "Content-Type method", :zip?, 'application/zip'
149
149
  end
150
+
151
+ describe "#png?" do
152
+ include_examples "Content-Type method", :png?, 'image/png'
153
+ end
154
+
155
+ describe "#gif?" do
156
+ include_examples "Content-Type method", :gif?, 'image/gif'
157
+ end
158
+
159
+ describe "#jpeg?" do
160
+ include_examples "Content-Type method", :jpeg?, 'image/jpeg'
161
+ end
162
+
163
+ describe "#ico?" do
164
+ context "when 'Content-Type' is 'image/x-icon'" do
165
+ include_examples "Content-Type method", :ico?, 'image/x-icon'
166
+ end
167
+
168
+ context "when 'Content-Type' is 'image/vnd.microsoft.icon'" do
169
+ include_examples "Content-Type method", :ico?, 'image/vnd.microsoft.icon'
170
+ end
171
+ end
150
172
  end