spidr 0.6.1 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -17,37 +17,39 @@ module Spidr
17
17
  #
18
18
  # Creates a new session cache.
19
19
  #
20
- # @param [Hash] options
21
- # Configuration options.
22
- #
23
- # @option [Hash] :proxy (Spidr.proxy)
20
+ # @param [Hash] proxy
24
21
  # Proxy options.
25
22
  #
26
- # @option [Integer] :open_timeout (Spidr.open_timeout)
27
- # Optional open timeout.
23
+ # @param [Integer] open_timeout
24
+ # Optional open connection timeout.
28
25
  #
29
- # @option [Integer] :ssl_timeout (Spidr.ssl_timeout)
30
- # Optional ssl timeout.
26
+ # @param [Integer] ssl_timeout
27
+ # Optional SSL connection timeout.
31
28
  #
32
- # @option [Integer] :read_timeout (Spidr.read_timeout)
29
+ # @param [Integer] read_timeout
33
30
  # Optional read timeout.
34
31
  #
35
- # @option [Integer] :continue_timeout (Spidr.continue_timeout)
32
+ # @param [Integer] continue_timeout
36
33
  # Optional `Continue` timeout.
37
34
  #
38
- # @option [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
35
+ # @param [Integer] keep_alive_timeout
39
36
  # Optional `Keep-Alive` timeout.
40
37
  #
41
38
  # @since 0.6.0
42
39
  #
43
- def initialize(options={})
44
- @proxy = options.fetch(:proxy,Spidr.proxy)
45
-
46
- @open_timeout = options.fetch(:open_timeout,Spidr.open_timeout)
47
- @ssl_timeout = options.fetch(:ssl_timeout,Spidr.ssl_timeout)
48
- @read_timeout = options.fetch(:read_timeout,Spidr.read_timeout)
49
- @continue_timeout = options.fetch(:continue_timeout,Spidr.continue_timeout)
50
- @keep_alive_timeout = options.fetch(:keep_alive_timeout,Spidr.keep_alive_timeout)
40
+ def initialize(proxy: Spidr.proxy,
41
+ open_timeout: Spidr.open_timeout,
42
+ ssl_timeout: Spidr.ssl_timeout,
43
+ read_timeout: Spidr.read_timeout,
44
+ continue_timeout: Spidr.continue_timeout,
45
+ keep_alive_timeout: Spidr.keep_alive_timeout)
46
+ self.proxy = proxy
47
+
48
+ self.open_timeout = open_timeout
49
+ self.ssl_timeout = ssl_timeout
50
+ self.read_timeout = read_timeout
51
+ self.continue_timeout = continue_timeout
52
+ self.keep_alive_timeout = keep_alive_timeout
51
53
 
52
54
  @sessions = {}
53
55
  end
@@ -1,5 +1,7 @@
1
1
  require 'spidr/proxy'
2
2
 
3
+ require 'uri/http'
4
+
3
5
  module Spidr
4
6
  module Settings
5
7
  #
@@ -21,7 +23,7 @@ module Spidr
21
23
  #
22
24
  # Sets the proxy information used by Agent objects.
23
25
  #
24
- # @param [Spidr::Proxy, Hash, nil] new_proxy
26
+ # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] new_proxy
25
27
  # The new proxy information.
26
28
  #
27
29
  # @option new_proxy [String] :host
@@ -41,11 +43,23 @@ module Spidr
41
43
  #
42
44
  def proxy=(new_proxy)
43
45
  @proxy = case new_proxy
44
- when Spidr::Proxy then new_proxy
45
- when Hash then Spidr::Proxy.new(new_proxy)
46
- when nil then Spidr::Proxy.new
46
+ when Spidr::Proxy
47
+ new_proxy
48
+ when Hash
49
+ Spidr::Proxy.new(**new_proxy)
50
+ when String, URI::HTTP
51
+ proxy_uri = URI(new_proxy)
52
+
53
+ Spidr::Proxy.new(
54
+ host: proxy_uri.host,
55
+ port: proxy_uri.port,
56
+ user: proxy_uri.user,
57
+ password: proxy_uri.password
58
+ )
59
+ when nil
60
+ Spidr::Proxy.new
47
61
  else
48
- raise(TypeError,"#{self.class}#{__method__} only accepts Proxy, Hash or nil")
62
+ raise(TypeError,"#{self.class}#{__method__} only accepts Spidr::Proxy, URI::HTTP, Hash, or nil")
49
63
  end
50
64
  end
51
65
 
data/lib/spidr/spidr.rb CHANGED
@@ -36,22 +36,31 @@ module Spidr
36
36
  #
37
37
  # @see Agent.start_at
38
38
  #
39
- def self.start_at(url,options={},&block)
40
- Agent.start_at(url,options,&block)
39
+ def self.start_at(url,**kwargs,&block)
40
+ Agent.start_at(url,**kwargs,&block)
41
41
  end
42
42
 
43
43
  #
44
44
  # @see Agent.host
45
45
  #
46
- def self.host(name,options={},&block)
47
- Agent.host(name,options,&block)
46
+ def self.host(name,**kwargs,&block)
47
+ Agent.host(name,**kwargs,&block)
48
+ end
49
+
50
+ #
51
+ # @see Agent.domain
52
+ #
53
+ # @since 0.7.0
54
+ #
55
+ def self.domain(name,**kwargs,&block)
56
+ Agent.domain(name,**kwargs,&block)
48
57
  end
49
58
 
50
59
  #
51
60
  # @see Agent.site
52
61
  #
53
- def self.site(url,options={},&block)
54
- Agent.site(url,options,&block)
62
+ def self.site(url,**kwargs,&block)
63
+ Agent.site(url,**kwargs,&block)
55
64
  end
56
65
 
57
66
  #
data/lib/spidr/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.6.1'
3
+ VERSION = '0.7.0'
4
4
  end
data/spec/agent_spec.rb CHANGED
@@ -7,6 +7,250 @@ require 'spidr/agent'
7
7
  describe Agent do
8
8
  it_should_behave_like "includes Spidr::Settings::UserAgent"
9
9
 
10
+ describe ".start_at" do
11
+ module TestAgentStartAt
12
+ class ExampleApp < Sinatra::Base
13
+
14
+ set :host, 'example.com'
15
+ set :port, 80
16
+
17
+ get '/' do
18
+ '<html><body>should not get here</body></html>'
19
+ end
20
+
21
+ get '/entry-point' do
22
+ <<~HTML
23
+ <html>
24
+ <body>
25
+ <a href="/link1">link1</a>
26
+ <a href="http://other.com/offsite-link">offsite link</a>
27
+ <a href="/link2">link2</a>
28
+ </body>
29
+ </html>
30
+ HTML
31
+ end
32
+
33
+ get '/link1' do
34
+ '<html><body>got here</body></html>'
35
+ end
36
+
37
+ get '/link2' do
38
+ '<html><body>got here</body></html>'
39
+ end
40
+ end
41
+
42
+ class OtherApp < Sinatra::Base
43
+
44
+ set :host, 'other.com'
45
+ set :port, 80
46
+
47
+ get '/offsite-link' do
48
+ '<html><body>should not get here</body></html>'
49
+ end
50
+
51
+ end
52
+ end
53
+
54
+ subject { described_class }
55
+
56
+ let(:host) { 'example.com' }
57
+ let(:other_host) { 'other.com' }
58
+ let(:url) { URI("http://#{host}/entry-point") }
59
+
60
+ let(:app) { TestAgentStartAt::ExampleApp }
61
+ let(:other_app) { TestAgentStartAt::OtherApp }
62
+
63
+ before do
64
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
65
+ stub_request(:any, /#{Regexp.escape(other_host)}/).to_rack(other_app)
66
+ end
67
+
68
+ it "must spider the website starting at the given URL" do
69
+ agent = subject.start_at(url)
70
+
71
+ expect(agent.history).to be == Set[
72
+ URI("http://#{host}/entry-point"),
73
+ URI("http://#{host}/link1"),
74
+ URI("http://#{other_host}/offsite-link"),
75
+ URI("http://#{host}/link2")
76
+ ]
77
+ end
78
+ end
79
+
80
+ describe ".site" do
81
+ module TestAgentSite
82
+ class ExampleApp < Sinatra::Base
83
+
84
+ set :host, 'example.com'
85
+ set :port, 80
86
+
87
+ get '/' do
88
+ '<html><body>should not get here</body></html>'
89
+ end
90
+
91
+ get '/entry-point' do
92
+ <<~HTML
93
+ <html>
94
+ <body>
95
+ <a href="/link1">link1</a>
96
+ <a href="http://other.com/offsite-link">offsite link</a>
97
+ <a href="/link2">link2</a>
98
+ </body>
99
+ </html>
100
+ HTML
101
+ end
102
+
103
+ get '/link1' do
104
+ '<html><body>got here</body></html>'
105
+ end
106
+
107
+ get '/link2' do
108
+ '<html><body>got here</body></html>'
109
+ end
110
+
111
+ end
112
+ end
113
+
114
+ subject { described_class }
115
+
116
+ let(:host) { 'example.com' }
117
+ let(:url) { URI("http://#{host}/entry-point") }
118
+
119
+ let(:app) { TestAgentSite::ExampleApp }
120
+
121
+ before do
122
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
123
+ end
124
+
125
+ it "must spider the website starting at the given URL" do
126
+ agent = subject.site(url)
127
+
128
+ expect(agent.history).to be == Set[
129
+ URI("http://#{host}/entry-point"),
130
+ URI("http://#{host}/link1"),
131
+ URI("http://#{host}/link2")
132
+ ]
133
+ end
134
+ end
135
+
136
+ describe ".host" do
137
+ module TestAgentHost
138
+ class ExampleApp < Sinatra::Base
139
+
140
+ set :host, 'example.com'
141
+ set :port, 80
142
+
143
+ get '/' do
144
+ <<~HTML
145
+ <html>
146
+ <body>
147
+ <a href="/link1">link1</a>
148
+ <a href="http://other.com/offsite-link">offsite link</a>
149
+ <a href="/link2">link2</a>
150
+ </body>
151
+ </html>
152
+ HTML
153
+ end
154
+
155
+ get '/link1' do
156
+ '<html><body>got here</body></html>'
157
+ end
158
+
159
+ get '/link2' do
160
+ '<html><body>got here</body></html>'
161
+ end
162
+
163
+ end
164
+ end
165
+
166
+ subject { described_class }
167
+
168
+ let(:host) { 'example.com' }
169
+ let(:app) { TestAgentHost::ExampleApp }
170
+
171
+ before do
172
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
173
+ end
174
+
175
+ it "must spider the website starting at the given URL" do
176
+ agent = subject.host(host)
177
+
178
+ # XXX: for some reason Set#== was returning false, so convert to an Array
179
+ expect(agent.history.to_a).to be == [
180
+ URI("http://#{host}/"),
181
+ URI("http://#{host}/link1"),
182
+ URI("http://#{host}/link2")
183
+ ]
184
+ end
185
+ end
186
+
187
+ describe ".domain" do
188
+ module TestAgentDomain
189
+ class ExampleApp < Sinatra::Base
190
+
191
+ set :host, 'example.com'
192
+ set :port, 80
193
+
194
+ get '/' do
195
+ <<~HTML
196
+ <html>
197
+ <body>
198
+ <a href="/link1">link1</a>
199
+ <a href="http://sub.example.com/subdomain-link">subdomain link</a>
200
+ <a href="/link2">link2</a>
201
+ </body>
202
+ </html>
203
+ HTML
204
+ end
205
+
206
+ get '/link1' do
207
+ '<html><body>got here</body></html>'
208
+ end
209
+
210
+ get '/link2' do
211
+ '<html><body>got here</body></html>'
212
+ end
213
+
214
+ end
215
+
216
+ class SubDomainApp < Sinatra::Base
217
+
218
+ set :host, 'sub.example.com'
219
+ set :port, 80
220
+
221
+ get '/subdomain-link' do
222
+ '<html><body>should get here</body></html>'
223
+ end
224
+
225
+ end
226
+ end
227
+
228
+ subject { described_class }
229
+
230
+ let(:domain) { 'example.com' }
231
+ let(:domain_app) { TestAgentDomain::ExampleApp }
232
+
233
+ let(:subdomain) { 'sub.example.com' }
234
+ let(:subdomain_app) { TestAgentDomain::SubDomainApp }
235
+
236
+ before do
237
+ stub_request(:any, /#{Regexp.escape(subdomain)}/).to_rack(subdomain_app)
238
+ stub_request(:any, /#{Regexp.escape(domain)}/).to_rack(domain_app)
239
+ end
240
+
241
+ it "must spider the domain and subdomains starting at the given domain" do
242
+ agent = subject.domain(domain)
243
+
244
+ # XXX: for some reason Set#== was returning false, so convert to an Array
245
+ expect(agent.history.to_a).to be == [
246
+ URI("http://#{domain}/"),
247
+ URI("http://#{domain}/link1"),
248
+ URI("http://#{subdomain}/subdomain-link"),
249
+ URI("http://#{domain}/link2")
250
+ ]
251
+ end
252
+ end
253
+
10
254
  describe "#initialize" do
11
255
  it "should not be running" do
12
256
  expect(subject).to_not be_running
@@ -32,6 +276,68 @@ describe Agent do
32
276
  expect(subject.sessions).to be_kind_of(SessionCache)
33
277
  end
34
278
 
279
+ context "when the proxy: keyword argument is given" do
280
+ let(:proxy) do
281
+ Spidr::Proxy.new(host: 'example.com')
282
+ end
283
+
284
+ subject { described_class.new(proxy: proxy) }
285
+
286
+ it "must initialize the #proxy of #session_cache" do
287
+ expect(subject.sessions.proxy).to be(proxy)
288
+ end
289
+ end
290
+
291
+ context "when the open_timeout: keyword argument is given" do
292
+ let(:open_timeout) { 5 }
293
+
294
+ subject { described_class.new(open_timeout: open_timeout) }
295
+
296
+ it "must initialize the #open_timeout of #session_cache" do
297
+ expect(subject.sessions.open_timeout).to eq(open_timeout)
298
+ end
299
+ end
300
+
301
+ context "when the ssl_timeout: keyword argument is given" do
302
+ let(:ssl_timeout) { 5 }
303
+
304
+ subject { described_class.new(ssl_timeout: ssl_timeout) }
305
+
306
+ it "must initialize the #ssl_timeout of #session_cache" do
307
+ expect(subject.sessions.ssl_timeout).to eq(ssl_timeout)
308
+ end
309
+ end
310
+
311
+ context "when the read_timeout: keyword argument is given" do
312
+ let(:read_timeout) { 5 }
313
+
314
+ subject { described_class.new(read_timeout: read_timeout) }
315
+
316
+ it "must initialize the #read_timeout of #session_cache" do
317
+ expect(subject.sessions.read_timeout).to eq(read_timeout)
318
+ end
319
+ end
320
+
321
+ context "when the continue_timeout: keyword argument is given" do
322
+ let(:continue_timeout) { 5 }
323
+
324
+ subject { described_class.new(continue_timeout: continue_timeout) }
325
+
326
+ it "must initialize the #continue_timeout of #session_cache" do
327
+ expect(subject.sessions.continue_timeout).to eq(continue_timeout)
328
+ end
329
+ end
330
+
331
+ context "when the keep_alive_timeout: keyword argument is given" do
332
+ let(:keep_alive_timeout) { 5 }
333
+
334
+ subject { described_class.new(keep_alive_timeout: keep_alive_timeout) }
335
+
336
+ it "must initialize the #keep_alive_timeout of #session_cache" do
337
+ expect(subject.sessions.keep_alive_timeout).to eq(keep_alive_timeout)
338
+ end
339
+ end
340
+
35
341
  it "should initialize the #cookie_jar" do
36
342
  expect(subject.cookies).to be_kind_of(CookieJar)
37
343
  end
@@ -386,7 +692,15 @@ describe Agent do
386
692
  context "frames" do
387
693
  app do
388
694
  get '/' do
389
- %{<html><body><frameset><frame src="/frame" /></frameset></body></html>}
695
+ <<~HTML
696
+ <html>
697
+ <body>
698
+ <frameset>
699
+ <frame src="/frame" />
700
+ </frameset>
701
+ </body>
702
+ </html>
703
+ HTML
390
704
  end
391
705
 
392
706
  get '/frame' do
@@ -614,7 +928,14 @@ describe Agent do
614
928
  end
615
929
 
616
930
  get '/redirect' do
617
- %{<html><head><meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" /></head><body>Redirecting...</body></html>}
931
+ <<~HTML
932
+ <html>
933
+ <head>
934
+ <meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" />
935
+ </head>
936
+ <body>Redirecting...</body>
937
+ </html>
938
+ HTML
618
939
  end
619
940
 
620
941
  get '/link' do
@@ -674,7 +995,14 @@ describe Agent do
674
995
 
675
996
  app do
676
997
  get '/' do
677
- %{<html><body><a href="http://google.com/">external link</a> <a href="/link">local link</a></body></html>}
998
+ <<~HTML
999
+ <html>
1000
+ <body>
1001
+ <a href="http://google.com/">external link</a>
1002
+ <a href="/link">local link</a>
1003
+ </body>
1004
+ </html>
1005
+ HTML
678
1006
  end
679
1007
 
680
1008
  get '/link' do
@@ -726,13 +1054,27 @@ describe Agent do
726
1054
 
727
1055
  app do
728
1056
  get '/' do
729
- %{<html><body><a href="/left?d=1">left</a><a href="/right?d=1">right</a></body></html>}
1057
+ <<~HTML
1058
+ <html>
1059
+ <body>
1060
+ <a href="/left?d=1">left</a>
1061
+ <a href="/right?d=1">right</a>
1062
+ </body>
1063
+ </html>
1064
+ HTML
730
1065
  end
731
1066
 
732
- get %r{^/left|/right} do
1067
+ get %r{/left|/right} do
733
1068
  d = Integer(params['d'])
734
1069
 
735
- %{<html><body><a href="/left?d=#{d+1}">left</a><a href="/right?d=#{d+1}">right</a></body></html>}
1070
+ <<~HTML
1071
+ <html>
1072
+ <body>
1073
+ <a href="/left?d=#{d+1}">left</a>
1074
+ <a href="/right?d=#{d+1}">right</a>
1075
+ </body>
1076
+ </html>
1077
+ HTML
736
1078
  end
737
1079
  end
738
1080
 
@@ -774,7 +1116,14 @@ describe Agent do
774
1116
 
775
1117
  app do
776
1118
  get '/' do
777
- %{<html><body><a href="/secret">don't follow this link</a> <a href="/pub">follow this link</a></body></html>}
1119
+ <<~HTML
1120
+ <html>
1121
+ <body>
1122
+ <a href="/secret">don't follow this link</a>
1123
+ <a href="/pub">follow this link</a>
1124
+ </body>
1125
+ </html>
1126
+ HTML
778
1127
  end
779
1128
 
780
1129
  get '/pub' do
data/spec/example_page.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require 'rspec'
2
+ require 'net/http'
3
+ require 'uri'
2
4
 
3
5
  RSpec.shared_context "example Page" do
4
6
  let(:code) { 200 }
@@ -147,4 +147,26 @@ describe Page do
147
147
  describe "#zip?" do
148
148
  include_examples "Content-Type method", :zip?, 'application/zip'
149
149
  end
150
+
151
+ describe "#png?" do
152
+ include_examples "Content-Type method", :png?, 'image/png'
153
+ end
154
+
155
+ describe "#gif?" do
156
+ include_examples "Content-Type method", :gif?, 'image/gif'
157
+ end
158
+
159
+ describe "#jpeg?" do
160
+ include_examples "Content-Type method", :jpeg?, 'image/jpeg'
161
+ end
162
+
163
+ describe "#ico?" do
164
+ context "when 'Content-Type' is 'image/x-icon'" do
165
+ include_examples "Content-Type method", :ico?, 'image/x-icon'
166
+ end
167
+
168
+ context "when 'Content-Type' is 'image/vnd.microsoft.icon'" do
169
+ include_examples "Content-Type method", :ico?, 'image/vnd.microsoft.icon'
170
+ end
171
+ end
150
172
  end