spidr 0.6.1 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +11 -1
- data/Gemfile +7 -4
- data/LICENSE.txt +1 -1
- data/README.md +137 -78
- data/Rakefile +1 -0
- data/gemspec.yml +7 -0
- data/lib/spidr/agent/actions.rb +1 -1
- data/lib/spidr/agent/events.rb +1 -1
- data/lib/spidr/agent/filters.rb +52 -53
- data/lib/spidr/agent/sanitizers.rb +5 -8
- data/lib/spidr/agent.rb +219 -97
- data/lib/spidr/auth_store.rb +1 -1
- data/lib/spidr/page/content_types.rb +51 -0
- data/lib/spidr/page/html.rb +16 -18
- data/lib/spidr/page/status_codes.rb +12 -10
- data/lib/spidr/proxy.rb +6 -14
- data/lib/spidr/rules.rb +5 -8
- data/lib/spidr/session_cache.rb +21 -19
- data/lib/spidr/settings/proxy.rb +19 -5
- data/lib/spidr/spidr.rb +15 -6
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +356 -7
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- metadata +8 -7
- data/.travis.yml +0 -16
data/lib/spidr/agent.rb
CHANGED
@@ -19,12 +19,12 @@ module Spidr
|
|
19
19
|
|
20
20
|
include Settings::UserAgent
|
21
21
|
|
22
|
-
# HTTP Host Header to use
|
22
|
+
# HTTP `Host` Header to use
|
23
23
|
#
|
24
24
|
# @return [String]
|
25
25
|
attr_accessor :host_header
|
26
26
|
|
27
|
-
# HTTP Host Headers to use for specific hosts
|
27
|
+
# HTTP `Host` Headers to use for specific hosts
|
28
28
|
#
|
29
29
|
# @return [Hash{String,Regexp => String}]
|
30
30
|
attr_reader :host_headers
|
@@ -96,70 +96,110 @@ module Spidr
|
|
96
96
|
#
|
97
97
|
# Creates a new Agent object.
|
98
98
|
#
|
99
|
-
# @param [
|
100
|
-
#
|
99
|
+
# @param [String, nil] host_header
|
100
|
+
# The HTTP `Host` header to use with each request.
|
101
101
|
#
|
102
|
-
# @
|
103
|
-
#
|
102
|
+
# @param [Hash{String,Regexp => String}] host_headers
|
103
|
+
# The HTTP `Host` headers to use for specific hosts.
|
104
104
|
#
|
105
|
-
# @
|
105
|
+
# @param [Hash{String => String}] default_headers
|
106
|
+
# Default headers to set for every request.
|
107
|
+
#
|
108
|
+
# @param [String, nil] user_agent
|
109
|
+
# The `User-Agent` string to send with each request.
|
110
|
+
#
|
111
|
+
# @param [String, nil] referer
|
112
|
+
# The `Referer` URL to send with each request.
|
113
|
+
#
|
114
|
+
# @param [Integer, nil] open_timeout
|
115
|
+
# Optional open connection timeout.
|
116
|
+
#
|
117
|
+
# @param [Integer, nil] read_timeout
|
106
118
|
# Optional read timeout.
|
107
119
|
#
|
108
|
-
# @
|
109
|
-
# Optional
|
120
|
+
# @param [Integer, nil] ssl_timeout
|
121
|
+
# Optional SSL connection timeout.
|
110
122
|
#
|
111
|
-
# @
|
123
|
+
# @param [Integer, nil] continue_timeout
|
112
124
|
# Optional continue timeout.
|
113
125
|
#
|
114
|
-
# @
|
115
|
-
# Optional
|
126
|
+
# @param [Integer, nil] keep_alive_timeout
|
127
|
+
# Optional `Keep-Alive` timeout.
|
116
128
|
#
|
117
|
-
# @
|
129
|
+
# @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
|
118
130
|
# The proxy information to use.
|
119
131
|
#
|
120
|
-
# @option
|
132
|
+
# @option proxy [String] :host
|
121
133
|
# The host the proxy is running on.
|
122
134
|
#
|
123
|
-
# @option
|
135
|
+
# @option proxy [Integer] :port (8080)
|
124
136
|
# The port the proxy is running on.
|
125
137
|
#
|
126
|
-
# @option
|
138
|
+
# @option proxy [String, nil] :user
|
127
139
|
# The user to authenticate as with the proxy.
|
128
140
|
#
|
129
|
-
# @option
|
141
|
+
# @option proxy [String, nil] :password
|
130
142
|
# The password to authenticate with.
|
131
143
|
#
|
132
|
-
# @
|
133
|
-
#
|
144
|
+
# @param [Integer] delay
|
145
|
+
# The number of seconds to pause between each request.
|
134
146
|
#
|
135
|
-
# @
|
136
|
-
# The
|
147
|
+
# @param [Integer, nil] limit
|
148
|
+
# The maximum number of pages to visit.
|
137
149
|
#
|
138
|
-
# @
|
139
|
-
# The
|
150
|
+
# @param [Integer, nil] max_depth
|
151
|
+
# The maximum link depth to follow.
|
140
152
|
#
|
141
|
-
# @
|
142
|
-
# The
|
153
|
+
# @param [Set, Array, nil] queue
|
154
|
+
# The initial queue of URLs to visit.
|
143
155
|
#
|
144
|
-
# @
|
145
|
-
# The
|
156
|
+
# @param [Set, Array, nil] history
|
157
|
+
# The initial list of visited URLs.
|
146
158
|
#
|
147
|
-
# @
|
148
|
-
#
|
159
|
+
# @param [Boolean] strip_fragments
|
160
|
+
# Controls whether to strip the fragment components from the URLs.
|
149
161
|
#
|
150
|
-
# @
|
151
|
-
#
|
162
|
+
# @param [Boolean] strip_query
|
163
|
+
# Controls whether to strip the query components from the URLs.
|
152
164
|
#
|
153
|
-
# @
|
154
|
-
# The
|
165
|
+
# @param [Array<String>] schemes
|
166
|
+
# The list of acceptable URI schemes to visit.
|
167
|
+
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
155
168
|
#
|
156
|
-
# @
|
157
|
-
# The
|
169
|
+
# @param [String] host
|
170
|
+
# The host-name to visit.
|
158
171
|
#
|
159
|
-
# @
|
160
|
-
# The
|
172
|
+
# @param [Array<String, Regexp, Proc>] hosts
|
173
|
+
# The patterns which match the host-names to visit.
|
161
174
|
#
|
162
|
-
# @
|
175
|
+
# @param [Array<String, Regexp, Proc>] ignore_hosts
|
176
|
+
# The patterns which match the host-names to not visit.
|
177
|
+
#
|
178
|
+
# @param [Array<Integer, Regexp, Proc>] ports
|
179
|
+
# The patterns which match the ports to visit.
|
180
|
+
#
|
181
|
+
# @param [Array<Integer, Regexp, Proc>] ignore_ports
|
182
|
+
# The patterns which match the ports to not visit.
|
183
|
+
#
|
184
|
+
# @param [Array<String, Regexp, Proc>] links
|
185
|
+
# The patterns which match the links to visit.
|
186
|
+
#
|
187
|
+
# @param [Array<String, Regexp, Proc>] ignore_links
|
188
|
+
# The patterns which match the links to not visit.
|
189
|
+
#
|
190
|
+
# @param [Array<String, Regexp, Proc>] urls
|
191
|
+
# The patterns which match the URLs to visit.
|
192
|
+
#
|
193
|
+
# @param [Array<String, Regexp, Proc>] ignore_urls
|
194
|
+
# The patterns which match the URLs to not visit.
|
195
|
+
#
|
196
|
+
# @param [Array<String, Regexp, Proc>] exts
|
197
|
+
# The patterns which match the URI path extensions to visit.
|
198
|
+
#
|
199
|
+
# @param [Array<String, Regexp, Proc>] ignore_exts
|
200
|
+
# The patterns which match the URI path extensions to not visit.
|
201
|
+
#
|
202
|
+
# @param [Boolean] robots
|
163
203
|
# Specifies whether `robots.txt` should be honored.
|
164
204
|
#
|
165
205
|
# @yield [agent]
|
@@ -169,58 +209,99 @@ module Spidr
|
|
169
209
|
# @yieldparam [Agent] agent
|
170
210
|
# The newly created agent.
|
171
211
|
#
|
172
|
-
#
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
212
|
+
def initialize(# header keyword arguments
|
213
|
+
host_header: nil,
|
214
|
+
host_headers: {},
|
215
|
+
default_headers: {},
|
216
|
+
user_agent: Spidr.user_agent,
|
217
|
+
referer: nil,
|
218
|
+
# session cache keyword arguments
|
219
|
+
proxy: Spidr.proxy,
|
220
|
+
open_timeout: Spidr.open_timeout,
|
221
|
+
ssl_timeout: Spidr.ssl_timeout,
|
222
|
+
read_timeout: Spidr.read_timeout,
|
223
|
+
continue_timeout: Spidr.continue_timeout,
|
224
|
+
keep_alive_timeout: Spidr.keep_alive_timeout,
|
225
|
+
# spidering controls keyword arguments
|
226
|
+
delay: 0,
|
227
|
+
limit: nil,
|
228
|
+
max_depth: nil,
|
229
|
+
# history keyword arguments
|
230
|
+
queue: nil,
|
231
|
+
history: nil,
|
232
|
+
# sanitizer keyword arguments
|
233
|
+
strip_fragments: true,
|
234
|
+
strip_query: false,
|
235
|
+
# filtering keyword arguments
|
236
|
+
schemes: self.class.default_schemes,
|
237
|
+
host: nil,
|
238
|
+
hosts: nil,
|
239
|
+
ignore_hosts: nil,
|
240
|
+
ports: nil,
|
241
|
+
ignore_ports: nil,
|
242
|
+
links: nil,
|
243
|
+
ignore_links: nil,
|
244
|
+
urls: nil,
|
245
|
+
ignore_urls: nil,
|
246
|
+
exts: nil,
|
247
|
+
ignore_exts: nil,
|
248
|
+
# robots keyword arguments
|
249
|
+
robots: Spidr.robots?)
|
250
|
+
@host_header = host_header
|
251
|
+
@host_headers = host_headers
|
252
|
+
|
253
|
+
@default_headers = default_headers
|
254
|
+
|
255
|
+
@user_agent = user_agent
|
256
|
+
@referer = referer
|
257
|
+
|
258
|
+
@sessions = SessionCache.new(
|
259
|
+
proxy: proxy,
|
260
|
+
open_timeout: open_timeout,
|
261
|
+
ssl_timeout: ssl_timeout,
|
262
|
+
read_timeout: read_timeout,
|
263
|
+
continue_timeout: continue_timeout,
|
264
|
+
keep_alive_timeout: keep_alive_timeout
|
265
|
+
)
|
195
266
|
@cookies = CookieJar.new
|
196
267
|
@authorized = AuthStore.new
|
197
268
|
|
198
269
|
@running = false
|
199
|
-
@delay =
|
270
|
+
@delay = delay
|
200
271
|
@history = Set[]
|
201
272
|
@failures = Set[]
|
202
273
|
@queue = []
|
203
274
|
|
204
|
-
@limit =
|
275
|
+
@limit = limit
|
205
276
|
@levels = Hash.new(0)
|
206
|
-
@max_depth =
|
207
|
-
|
208
|
-
if
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
277
|
+
@max_depth = max_depth
|
278
|
+
|
279
|
+
self.queue = queue if queue
|
280
|
+
self.history = history if history
|
281
|
+
|
282
|
+
initialize_sanitizers(
|
283
|
+
strip_fragments: strip_fragments,
|
284
|
+
strip_query: strip_query
|
285
|
+
)
|
286
|
+
|
287
|
+
initialize_filters(
|
288
|
+
schemes: schemes,
|
289
|
+
host: host,
|
290
|
+
hosts: hosts,
|
291
|
+
ignore_hosts: ignore_hosts,
|
292
|
+
ports: ports,
|
293
|
+
ignore_ports: ignore_ports,
|
294
|
+
links: links,
|
295
|
+
ignore_links: ignore_links,
|
296
|
+
urls: urls,
|
297
|
+
ignore_urls: ignore_urls,
|
298
|
+
exts: exts,
|
299
|
+
ignore_exts: ignore_exts
|
300
|
+
)
|
301
|
+
initialize_actions
|
302
|
+
initialize_events
|
303
|
+
|
304
|
+
initialize_robots if robots
|
224
305
|
|
225
306
|
yield self if block_given?
|
226
307
|
end
|
@@ -231,8 +312,8 @@ module Spidr
|
|
231
312
|
# @param [URI::HTTP, String] url
|
232
313
|
# The URL to start spidering at.
|
233
314
|
#
|
234
|
-
# @param [Hash]
|
235
|
-
# Additional
|
315
|
+
# @param [Hash{Symbol => Object}] kwargs
|
316
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
236
317
|
#
|
237
318
|
# @yield [agent]
|
238
319
|
# If a block is given, it will be passed the newly created agent
|
@@ -241,12 +322,16 @@ module Spidr
|
|
241
322
|
# @yieldparam [Agent] agent
|
242
323
|
# The newly created agent.
|
243
324
|
#
|
325
|
+
# @return [Agent]
|
326
|
+
# The created agent object.
|
327
|
+
#
|
244
328
|
# @see #initialize
|
245
329
|
# @see #start_at
|
246
330
|
#
|
247
|
-
def self.start_at(url
|
248
|
-
agent = new(
|
331
|
+
def self.start_at(url,**kwargs,&block)
|
332
|
+
agent = new(**kwargs,&block)
|
249
333
|
agent.start_at(url)
|
334
|
+
return agent
|
250
335
|
end
|
251
336
|
|
252
337
|
#
|
@@ -255,8 +340,8 @@ module Spidr
|
|
255
340
|
# @param [URI::HTTP, String] url
|
256
341
|
# The web-site to spider.
|
257
342
|
#
|
258
|
-
# @param [Hash]
|
259
|
-
# Additional
|
343
|
+
# @param [Hash{Symbol => Object}] kwargs
|
344
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
260
345
|
#
|
261
346
|
# @yield [agent]
|
262
347
|
# If a block is given, it will be passed the newly created agent
|
@@ -265,13 +350,17 @@ module Spidr
|
|
265
350
|
# @yieldparam [Agent] agent
|
266
351
|
# The newly created agent.
|
267
352
|
#
|
353
|
+
# @return [Agent]
|
354
|
+
# The created agent object.
|
355
|
+
#
|
268
356
|
# @see #initialize
|
269
357
|
#
|
270
|
-
def self.site(url
|
358
|
+
def self.site(url,**kwargs,&block)
|
271
359
|
url = URI(url)
|
272
360
|
|
273
|
-
agent = new(
|
361
|
+
agent = new(host: url.host, **kwargs, &block)
|
274
362
|
agent.start_at(url)
|
363
|
+
return agent
|
275
364
|
end
|
276
365
|
|
277
366
|
#
|
@@ -280,8 +369,35 @@ module Spidr
|
|
280
369
|
# @param [String] name
|
281
370
|
# The host-name to spider.
|
282
371
|
#
|
283
|
-
# @param [Hash]
|
284
|
-
# Additional
|
372
|
+
# @param [Hash{Symbol => Object}] kwargs
|
373
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
374
|
+
#
|
375
|
+
# @yield [agent]
|
376
|
+
# If a block is given, it will be passed the newly created agent
|
377
|
+
# before it begins spidering.
|
378
|
+
#
|
379
|
+
# @yieldparam [Agent] agent
|
380
|
+
# The newly created agent.
|
381
|
+
#
|
382
|
+
# @return [Agent]
|
383
|
+
# The created agent object.
|
384
|
+
#
|
385
|
+
# @see #initialize
|
386
|
+
#
|
387
|
+
def self.host(name,**kwargs,&block)
|
388
|
+
agent = new(host: name, **kwargs, &block)
|
389
|
+
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
390
|
+
return agent
|
391
|
+
end
|
392
|
+
|
393
|
+
#
|
394
|
+
# Creates a new agent and spiders the entire domain.
|
395
|
+
#
|
396
|
+
# @param [String] name
|
397
|
+
# The top-level domain to spider.
|
398
|
+
#
|
399
|
+
# @param [Hash{Symbol => Object}] kwargs
|
400
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
285
401
|
#
|
286
402
|
# @yield [agent]
|
287
403
|
# If a block is given, it will be passed the newly created agent
|
@@ -290,11 +406,17 @@ module Spidr
|
|
290
406
|
# @yieldparam [Agent] agent
|
291
407
|
# The newly created agent.
|
292
408
|
#
|
409
|
+
# @return [Agent]
|
410
|
+
# The created agent object.
|
411
|
+
#
|
293
412
|
# @see #initialize
|
294
413
|
#
|
295
|
-
|
296
|
-
|
414
|
+
# @since 0.7.0
|
415
|
+
#
|
416
|
+
def self.domain(name,**kwargs,&block)
|
417
|
+
agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
|
297
418
|
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
419
|
+
return agent
|
298
420
|
end
|
299
421
|
|
300
422
|
#
|
@@ -314,10 +436,10 @@ module Spidr
|
|
314
436
|
#
|
315
437
|
# Sets the proxy information that the agent uses.
|
316
438
|
#
|
317
|
-
# @param [Proxy] new_proxy
|
439
|
+
# @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
|
318
440
|
# The new proxy information.
|
319
441
|
#
|
320
|
-
# @return [
|
442
|
+
# @return [Proxy]
|
321
443
|
# The new proxy information.
|
322
444
|
#
|
323
445
|
# @see SessionCache#proxy=
|
@@ -534,7 +656,7 @@ module Spidr
|
|
534
656
|
def enqueue(url,level=0)
|
535
657
|
url = sanitize_url(url)
|
536
658
|
|
537
|
-
if (!
|
659
|
+
if (!queued?(url) && visit?(url))
|
538
660
|
link = url.to_s
|
539
661
|
|
540
662
|
begin
|
@@ -633,7 +755,7 @@ module Spidr
|
|
633
755
|
end
|
634
756
|
|
635
757
|
#
|
636
|
-
# Visits a given URL, and
|
758
|
+
# Visits a given URL, and enqueues the links recovered from the URL
|
637
759
|
# to be visited later.
|
638
760
|
#
|
639
761
|
# @param [URI::HTTP, String] url
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -221,5 +221,56 @@ module Spidr
|
|
221
221
|
def zip?
|
222
222
|
is_content_type?('application/zip')
|
223
223
|
end
|
224
|
+
|
225
|
+
#
|
226
|
+
# Determines if the page is a PNG image.
|
227
|
+
#
|
228
|
+
# @return [Boolean]
|
229
|
+
# Specifies whether the page is a PNG image.
|
230
|
+
#
|
231
|
+
# @since 0.7.0
|
232
|
+
#
|
233
|
+
def png?
|
234
|
+
is_content_type?('image/png')
|
235
|
+
end
|
236
|
+
|
237
|
+
#
|
238
|
+
# Determines if the page is a GIF image.
|
239
|
+
#
|
240
|
+
# @return [Boolean]
|
241
|
+
# Specifies whether the page is a GIF image.
|
242
|
+
#
|
243
|
+
# @since 0.7.0
|
244
|
+
#
|
245
|
+
def gif?
|
246
|
+
is_content_type?('image/gif')
|
247
|
+
end
|
248
|
+
|
249
|
+
#
|
250
|
+
# Determines if the page is a JPEG image.
|
251
|
+
#
|
252
|
+
# @return [Boolean]
|
253
|
+
# Specifies whether the page is a JPEG image.
|
254
|
+
#
|
255
|
+
# @since 0.7.0
|
256
|
+
#
|
257
|
+
def jpeg?
|
258
|
+
is_content_type?('image/jpeg')
|
259
|
+
end
|
260
|
+
|
261
|
+
#
|
262
|
+
# Determines if the page is an ICO image.
|
263
|
+
#
|
264
|
+
# @return [Boolean]
|
265
|
+
# Specifies whether the page is an ICO image.
|
266
|
+
#
|
267
|
+
# @since 0.7.0
|
268
|
+
#
|
269
|
+
def ico?
|
270
|
+
is_content_type?('image/x-icon') ||
|
271
|
+
is_content_type?('image/vnd.microsoft.icon')
|
272
|
+
end
|
273
|
+
|
274
|
+
alias icon? ico?
|
224
275
|
end
|
225
276
|
end
|
data/lib/spidr/page/html.rb
CHANGED
@@ -105,7 +105,9 @@ module Spidr
|
|
105
105
|
def each_redirect(&block)
|
106
106
|
return enum_for(__method__) unless block
|
107
107
|
|
108
|
-
|
108
|
+
locations = @response.get_fields('Location')
|
109
|
+
|
110
|
+
unless (locations.nil? || locations.empty?)
|
109
111
|
# Location headers override any meta-refresh redirects in the HTML
|
110
112
|
locations.each(&block)
|
111
113
|
else
|
@@ -175,34 +177,30 @@ module Spidr
|
|
175
177
|
#
|
176
178
|
# @since 0.3.0
|
177
179
|
#
|
178
|
-
def each_link
|
180
|
+
def each_link(&block)
|
179
181
|
return enum_for(__method__) unless block_given?
|
180
182
|
|
181
|
-
|
182
|
-
yield url unless (url.nil? || url.empty?)
|
183
|
-
}
|
184
|
-
|
185
|
-
each_redirect(&filter) if is_redirect?
|
183
|
+
each_redirect(&block) if is_redirect?
|
186
184
|
|
187
185
|
if (html? && doc)
|
188
|
-
doc.search('//a[@href]').each do |a|
|
189
|
-
|
186
|
+
doc.search('//a[@href[string()]]').each do |a|
|
187
|
+
yield a.get_attribute('href')
|
190
188
|
end
|
191
189
|
|
192
|
-
doc.search('//frame[@src]').each do |iframe|
|
193
|
-
|
190
|
+
doc.search('//frame[@src[string()]]').each do |iframe|
|
191
|
+
yield iframe.get_attribute('src')
|
194
192
|
end
|
195
193
|
|
196
|
-
doc.search('//iframe[@src]').each do |iframe|
|
197
|
-
|
194
|
+
doc.search('//iframe[@src[string()]]').each do |iframe|
|
195
|
+
yield iframe.get_attribute('src')
|
198
196
|
end
|
199
197
|
|
200
|
-
doc.search('//link[@href]').each do |link|
|
201
|
-
|
198
|
+
doc.search('//link[@href[string()]]').each do |link|
|
199
|
+
yield link.get_attribute('href')
|
202
200
|
end
|
203
201
|
|
204
|
-
doc.search('//script[@src]').each do |script|
|
205
|
-
|
202
|
+
doc.search('//script[@src[string()]]').each do |script|
|
203
|
+
yield script.get_attribute('src')
|
206
204
|
end
|
207
205
|
end
|
208
206
|
end
|
@@ -211,7 +209,7 @@ module Spidr
|
|
211
209
|
# The links from within the page.
|
212
210
|
#
|
213
211
|
# @return [Array<String>]
|
214
|
-
# All links within the HTML page, frame
|
212
|
+
# All links within the HTML page, `frame`/`iframe` source URLs and any
|
215
213
|
# links in the `Location` header.
|
216
214
|
#
|
217
215
|
def links
|
@@ -22,16 +22,6 @@ module Spidr
|
|
22
22
|
|
23
23
|
alias ok? is_ok?
|
24
24
|
|
25
|
-
#
|
26
|
-
# Determines if the response code is `308`.
|
27
|
-
#
|
28
|
-
# @return [Boolean]
|
29
|
-
# Specifies whether the response code is `308`.
|
30
|
-
#
|
31
|
-
def timedout?
|
32
|
-
code == 308
|
33
|
-
end
|
34
|
-
|
35
25
|
#
|
36
26
|
# Determines if the response code is `400`.
|
37
27
|
#
|
@@ -78,6 +68,18 @@ module Spidr
|
|
78
68
|
|
79
69
|
alias missing? is_missing?
|
80
70
|
|
71
|
+
#
|
72
|
+
# Determines if the response code is `408`.
|
73
|
+
#
|
74
|
+
# @return [Boolean]
|
75
|
+
# Specifies whether the response code is `408`.
|
76
|
+
#
|
77
|
+
def is_timedout?
|
78
|
+
code == 408
|
79
|
+
end
|
80
|
+
|
81
|
+
alias timedout? is_timedout?
|
82
|
+
|
81
83
|
#
|
82
84
|
# Determines if the response code is `500`.
|
83
85
|
#
|
data/lib/spidr/proxy.rb
CHANGED
@@ -10,28 +10,20 @@ module Spidr
|
|
10
10
|
#
|
11
11
|
# Initializes the proxy.
|
12
12
|
#
|
13
|
-
# @param [
|
14
|
-
# Attributes for the proxy.
|
15
|
-
#
|
16
|
-
# @option attributes [String] :host
|
13
|
+
# @param [String] host
|
17
14
|
# The host the proxy is running on.
|
18
15
|
#
|
19
|
-
# @
|
16
|
+
# @param [Integer] port
|
20
17
|
# The port the proxy is running on.
|
21
18
|
#
|
22
|
-
# @
|
19
|
+
# @param [String] user
|
23
20
|
# The user to authenticate as with the proxy.
|
24
21
|
#
|
25
|
-
# @
|
22
|
+
# @param [String] password
|
26
23
|
# The password to authenticate with.
|
27
24
|
#
|
28
|
-
def initialize(
|
29
|
-
super(
|
30
|
-
attributes[:host],
|
31
|
-
attributes.fetch(:port,DEFAULT_PORT),
|
32
|
-
attributes[:user],
|
33
|
-
attributes[:password]
|
34
|
-
)
|
25
|
+
def initialize(host: nil, port: DEFAULT_PORT, user: nil, password: nil)
|
26
|
+
super(host,port,user,password)
|
35
27
|
end
|
36
28
|
|
37
29
|
#
|
data/lib/spidr/rules.rb
CHANGED
@@ -14,21 +14,18 @@ module Spidr
|
|
14
14
|
#
|
15
15
|
# Creates a new Rules object.
|
16
16
|
#
|
17
|
-
# @param [
|
18
|
-
# Additional options.
|
19
|
-
#
|
20
|
-
# @option options [Array<String, Regexp, Proc>] :accept
|
17
|
+
# @param [Array<String, Regexp, Proc>, nil] accept
|
21
18
|
# The patterns to accept data with.
|
22
19
|
#
|
23
|
-
# @
|
20
|
+
# @param [Array<String, Regexp, Proc>, nil] reject
|
24
21
|
# The patterns to reject data with.
|
25
22
|
#
|
26
|
-
def initialize(
|
23
|
+
def initialize(accept: nil, reject: nil)
|
27
24
|
@accept = []
|
28
25
|
@reject = []
|
29
26
|
|
30
|
-
@accept +=
|
31
|
-
@reject +=
|
27
|
+
@accept += accept if accept
|
28
|
+
@reject += reject if reject
|
32
29
|
end
|
33
30
|
|
34
31
|
#
|