spidr 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +11 -1
- data/Gemfile +7 -4
- data/LICENSE.txt +1 -1
- data/README.md +137 -78
- data/Rakefile +1 -0
- data/gemspec.yml +7 -0
- data/lib/spidr/agent/actions.rb +1 -1
- data/lib/spidr/agent/events.rb +1 -1
- data/lib/spidr/agent/filters.rb +52 -53
- data/lib/spidr/agent/sanitizers.rb +5 -8
- data/lib/spidr/agent.rb +219 -97
- data/lib/spidr/auth_store.rb +1 -1
- data/lib/spidr/page/content_types.rb +51 -0
- data/lib/spidr/page/html.rb +16 -18
- data/lib/spidr/page/status_codes.rb +12 -10
- data/lib/spidr/proxy.rb +6 -14
- data/lib/spidr/rules.rb +5 -8
- data/lib/spidr/session_cache.rb +21 -19
- data/lib/spidr/settings/proxy.rb +19 -5
- data/lib/spidr/spidr.rb +15 -6
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +356 -7
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- metadata +8 -7
- data/.travis.yml +0 -16
data/lib/spidr/agent.rb
CHANGED
@@ -19,12 +19,12 @@ module Spidr
|
|
19
19
|
|
20
20
|
include Settings::UserAgent
|
21
21
|
|
22
|
-
# HTTP Host Header to use
|
22
|
+
# HTTP `Host` header to use
|
23
23
|
#
|
24
24
|
# @return [String]
|
25
25
|
attr_accessor :host_header
|
26
26
|
|
27
|
-
# HTTP Host Headers to use for specific hosts
|
27
|
+
# HTTP `Host` Headers to use for specific hosts
|
28
28
|
#
|
29
29
|
# @return [Hash{String,Regexp => String}]
|
30
30
|
attr_reader :host_headers
|
@@ -96,70 +96,110 @@ module Spidr
|
|
96
96
|
#
|
97
97
|
# Creates a new Agent object.
|
98
98
|
#
|
99
|
-
# @param [
|
100
|
-
#
|
99
|
+
# @param [String, nil] host_header
|
100
|
+
# The HTTP `Host` header to use with each request.
|
101
101
|
#
|
102
|
-
# @
|
103
|
-
#
|
102
|
+
# @param [Hash{String,Regexp => String}] host_headers
|
103
|
+
# The HTTP `Host` headers to use for specific hosts.
|
104
104
|
#
|
105
|
-
# @
|
105
|
+
# @param [Hash{String => String}] default_headers
|
106
|
+
# Default headers to set for every request.
|
107
|
+
#
|
108
|
+
# @param [String, nil] user_agent
|
109
|
+
# The `User-Agent` string to send with each request.
|
110
|
+
#
|
111
|
+
# @param [String, nil] referer
|
112
|
+
# The `Referer` URL to send with each request.
|
113
|
+
#
|
114
|
+
# @param [Integer, nil] open_timeout
|
115
|
+
# Optional open connection timeout.
|
116
|
+
#
|
117
|
+
# @param [Integer, nil] read_timeout
|
106
118
|
# Optional read timeout.
|
107
119
|
#
|
108
|
-
# @
|
109
|
-
# Optional
|
120
|
+
# @param [Integer, nil] ssl_timeout
|
121
|
+
# Optional SSL connection timeout.
|
110
122
|
#
|
111
|
-
# @
|
123
|
+
# @param [Integer, nil] continue_timeout
|
112
124
|
# Optional continue timeout.
|
113
125
|
#
|
114
|
-
# @
|
115
|
-
# Optional
|
126
|
+
# @param [Integer, nil] keep_alive_timeout
|
127
|
+
# Optional `Keep-Alive` timeout.
|
116
128
|
#
|
117
|
-
# @
|
129
|
+
# @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
|
118
130
|
# The proxy information to use.
|
119
131
|
#
|
120
|
-
# @option
|
132
|
+
# @option proxy [String] :host
|
121
133
|
# The host the proxy is running on.
|
122
134
|
#
|
123
|
-
# @option
|
135
|
+
# @option proxy [Integer] :port (8080)
|
124
136
|
# The port the proxy is running on.
|
125
137
|
#
|
126
|
-
# @option
|
138
|
+
# @option proxy [String, nil] :user
|
127
139
|
# The user to authenticate as with the proxy.
|
128
140
|
#
|
129
|
-
# @option
|
141
|
+
# @option proxy [String, nil] :password
|
130
142
|
# The password to authenticate with.
|
131
143
|
#
|
132
|
-
# @
|
133
|
-
#
|
144
|
+
# @param [Integer] delay
|
145
|
+
# The number of seconds to pause between each request.
|
134
146
|
#
|
135
|
-
# @
|
136
|
-
# The
|
147
|
+
# @param [Integer, nil] limit
|
148
|
+
# The maximum number of pages to visit.
|
137
149
|
#
|
138
|
-
# @
|
139
|
-
# The
|
150
|
+
# @param [Integer, nil] max_depth
|
151
|
+
# The maximum link depth to follow.
|
140
152
|
#
|
141
|
-
# @
|
142
|
-
# The
|
153
|
+
# @param [Set, Array, nil] queue
|
154
|
+
# The initial queue of URLs to visit.
|
143
155
|
#
|
144
|
-
# @
|
145
|
-
# The
|
156
|
+
# @param [Set, Array, nil] history
|
157
|
+
# The initial list of visited URLs.
|
146
158
|
#
|
147
|
-
# @
|
148
|
-
#
|
159
|
+
# @param [Boolean] strip_fragments
|
160
|
+
# Controls whether to strip the fragment components from the URLs.
|
149
161
|
#
|
150
|
-
# @
|
151
|
-
#
|
162
|
+
# @param [Boolean] strip_query
|
163
|
+
# Controls whether to strip the query components from the URLs.
|
152
164
|
#
|
153
|
-
# @
|
154
|
-
# The
|
165
|
+
# @param [Array<String>] schemes
|
166
|
+
# The list of acceptable URI schemes to visit.
|
167
|
+
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
155
168
|
#
|
156
|
-
# @
|
157
|
-
# The
|
169
|
+
# @param [String] host
|
170
|
+
# The host-name to visit.
|
158
171
|
#
|
159
|
-
# @
|
160
|
-
# The
|
172
|
+
# @param [Array<String, Regexp, Proc>] hosts
|
173
|
+
# The patterns which match the host-names to visit.
|
161
174
|
#
|
162
|
-
# @
|
175
|
+
# @param [Array<String, Regexp, Proc>] ignore_hosts
|
176
|
+
# The patterns which match the host-names to not visit.
|
177
|
+
#
|
178
|
+
# @param [Array<Integer, Regexp, Proc>] ports
|
179
|
+
# The patterns which match the ports to visit.
|
180
|
+
#
|
181
|
+
# @param [Array<Integer, Regexp, Proc>] ignore_ports
|
182
|
+
# The patterns which match the ports to not visit.
|
183
|
+
#
|
184
|
+
# @param [Array<String, Regexp, Proc>] links
|
185
|
+
# The patterns which match the links to visit.
|
186
|
+
#
|
187
|
+
# @param [Array<String, Regexp, Proc>] ignore_links
|
188
|
+
# The patterns which match the links to not visit.
|
189
|
+
#
|
190
|
+
# @param [Array<String, Regexp, Proc>] urls
|
191
|
+
# The patterns which match the URLs to visit.
|
192
|
+
#
|
193
|
+
# @param [Array<String, Regexp, Proc>] ignore_urls
|
194
|
+
# The patterns which match the URLs to not visit.
|
195
|
+
#
|
196
|
+
# @param [Array<String, Regexp, Proc>] exts
|
197
|
+
# The patterns which match the URI path extensions to visit.
|
198
|
+
#
|
199
|
+
# @param [Array<String, Regexp, Proc>] ignore_exts
|
200
|
+
# The patterns which match the URI path extensions to not visit.
|
201
|
+
#
|
202
|
+
# @param [Boolean] robots
|
163
203
|
# Specifies whether `robots.txt` should be honored.
|
164
204
|
#
|
165
205
|
# @yield [agent]
|
@@ -169,58 +209,99 @@ module Spidr
|
|
169
209
|
# @yieldparam [Agent] agent
|
170
210
|
# The newly created agent.
|
171
211
|
#
|
172
|
-
#
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
212
|
+
def initialize(# header keyword arguments
|
213
|
+
host_header: nil,
|
214
|
+
host_headers: {},
|
215
|
+
default_headers: {},
|
216
|
+
user_agent: Spidr.user_agent,
|
217
|
+
referer: nil,
|
218
|
+
# session cache keyword arguments
|
219
|
+
proxy: Spidr.proxy,
|
220
|
+
open_timeout: Spidr.open_timeout,
|
221
|
+
ssl_timeout: Spidr.ssl_timeout,
|
222
|
+
read_timeout: Spidr.read_timeout,
|
223
|
+
continue_timeout: Spidr.continue_timeout,
|
224
|
+
keep_alive_timeout: Spidr.keep_alive_timeout,
|
225
|
+
# spidering controls keyword arguments
|
226
|
+
delay: 0,
|
227
|
+
limit: nil,
|
228
|
+
max_depth: nil,
|
229
|
+
# history keyword arguments
|
230
|
+
queue: nil,
|
231
|
+
history: nil,
|
232
|
+
# sanitizer keyword arguments
|
233
|
+
strip_fragments: true,
|
234
|
+
strip_query: false,
|
235
|
+
# filtering keyword arguments
|
236
|
+
schemes: self.class.default_schemes,
|
237
|
+
host: nil,
|
238
|
+
hosts: nil,
|
239
|
+
ignore_hosts: nil,
|
240
|
+
ports: nil,
|
241
|
+
ignore_ports: nil,
|
242
|
+
links: nil,
|
243
|
+
ignore_links: nil,
|
244
|
+
urls: nil,
|
245
|
+
ignore_urls: nil,
|
246
|
+
exts: nil,
|
247
|
+
ignore_exts: nil,
|
248
|
+
# robots keyword arguments
|
249
|
+
robots: Spidr.robots?)
|
250
|
+
@host_header = host_header
|
251
|
+
@host_headers = host_headers
|
252
|
+
|
253
|
+
@default_headers = default_headers
|
254
|
+
|
255
|
+
@user_agent = user_agent
|
256
|
+
@referer = referer
|
257
|
+
|
258
|
+
@sessions = SessionCache.new(
|
259
|
+
proxy: proxy,
|
260
|
+
open_timeout: open_timeout,
|
261
|
+
ssl_timeout: ssl_timeout,
|
262
|
+
read_timeout: read_timeout,
|
263
|
+
continue_timeout: continue_timeout,
|
264
|
+
keep_alive_timeout: keep_alive_timeout
|
265
|
+
)
|
195
266
|
@cookies = CookieJar.new
|
196
267
|
@authorized = AuthStore.new
|
197
268
|
|
198
269
|
@running = false
|
199
|
-
@delay =
|
270
|
+
@delay = delay
|
200
271
|
@history = Set[]
|
201
272
|
@failures = Set[]
|
202
273
|
@queue = []
|
203
274
|
|
204
|
-
@limit =
|
275
|
+
@limit = limit
|
205
276
|
@levels = Hash.new(0)
|
206
|
-
@max_depth =
|
207
|
-
|
208
|
-
if
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
277
|
+
@max_depth = max_depth
|
278
|
+
|
279
|
+
self.queue = queue if queue
|
280
|
+
self.history = history if history
|
281
|
+
|
282
|
+
initialize_sanitizers(
|
283
|
+
strip_fragments: strip_fragments,
|
284
|
+
strip_query: strip_query
|
285
|
+
)
|
286
|
+
|
287
|
+
initialize_filters(
|
288
|
+
schemes: schemes,
|
289
|
+
host: host,
|
290
|
+
hosts: hosts,
|
291
|
+
ignore_hosts: ignore_hosts,
|
292
|
+
ports: ports,
|
293
|
+
ignore_ports: ignore_ports,
|
294
|
+
links: links,
|
295
|
+
ignore_links: ignore_links,
|
296
|
+
urls: urls,
|
297
|
+
ignore_urls: ignore_urls,
|
298
|
+
exts: exts,
|
299
|
+
ignore_exts: ignore_exts
|
300
|
+
)
|
301
|
+
initialize_actions
|
302
|
+
initialize_events
|
303
|
+
|
304
|
+
initialize_robots if robots
|
224
305
|
|
225
306
|
yield self if block_given?
|
226
307
|
end
|
@@ -231,8 +312,8 @@ module Spidr
|
|
231
312
|
# @param [URI::HTTP, String] url
|
232
313
|
# The URL to start spidering at.
|
233
314
|
#
|
234
|
-
# @param [Hash]
|
235
|
-
# Additional
|
315
|
+
# @param [Hash{Symbol => Object}] kwargs
|
316
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
236
317
|
#
|
237
318
|
# @yield [agent]
|
238
319
|
# If a block is given, it will be passed the newly created agent
|
@@ -241,12 +322,16 @@ module Spidr
|
|
241
322
|
# @yieldparam [Agent] agent
|
242
323
|
# The newly created agent.
|
243
324
|
#
|
325
|
+
# @return [Agent]
|
326
|
+
# The created agent object.
|
327
|
+
#
|
244
328
|
# @see #initialize
|
245
329
|
# @see #start_at
|
246
330
|
#
|
247
|
-
def self.start_at(url
|
248
|
-
agent = new(
|
331
|
+
def self.start_at(url,**kwargs,&block)
|
332
|
+
agent = new(**kwargs,&block)
|
249
333
|
agent.start_at(url)
|
334
|
+
return agent
|
250
335
|
end
|
251
336
|
|
252
337
|
#
|
@@ -255,8 +340,8 @@ module Spidr
|
|
255
340
|
# @param [URI::HTTP, String] url
|
256
341
|
# The web-site to spider.
|
257
342
|
#
|
258
|
-
# @param [Hash]
|
259
|
-
# Additional
|
343
|
+
# @param [Hash{Symbol => Object}] kwargs
|
344
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
260
345
|
#
|
261
346
|
# @yield [agent]
|
262
347
|
# If a block is given, it will be passed the newly created agent
|
@@ -265,13 +350,17 @@ module Spidr
|
|
265
350
|
# @yieldparam [Agent] agent
|
266
351
|
# The newly created agent.
|
267
352
|
#
|
353
|
+
# @return [Agent]
|
354
|
+
# The created agent object.
|
355
|
+
#
|
268
356
|
# @see #initialize
|
269
357
|
#
|
270
|
-
def self.site(url
|
358
|
+
def self.site(url,**kwargs,&block)
|
271
359
|
url = URI(url)
|
272
360
|
|
273
|
-
agent = new(
|
361
|
+
agent = new(host: url.host, **kwargs, &block)
|
274
362
|
agent.start_at(url)
|
363
|
+
return agent
|
275
364
|
end
|
276
365
|
|
277
366
|
#
|
@@ -280,8 +369,35 @@ module Spidr
|
|
280
369
|
# @param [String] name
|
281
370
|
# The host-name to spider.
|
282
371
|
#
|
283
|
-
# @param [Hash]
|
284
|
-
# Additional
|
372
|
+
# @param [Hash{Symbol => Object}] kwargs
|
373
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
374
|
+
#
|
375
|
+
# @yield [agent]
|
376
|
+
# If a block is given, it will be passed the newly created agent
|
377
|
+
# before it begins spidering.
|
378
|
+
#
|
379
|
+
# @yieldparam [Agent] agent
|
380
|
+
# The newly created agent.
|
381
|
+
#
|
382
|
+
# @return [Agent]
|
383
|
+
# The created agent object.
|
384
|
+
#
|
385
|
+
# @see #initialize
|
386
|
+
#
|
387
|
+
def self.host(name,**kwargs,&block)
|
388
|
+
agent = new(host: name, **kwargs, &block)
|
389
|
+
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
390
|
+
return agent
|
391
|
+
end
|
392
|
+
|
393
|
+
#
|
394
|
+
# Creates a new agent and spiders the entire domain.
|
395
|
+
#
|
396
|
+
# @param [String] name
|
397
|
+
# The top-level domain to spider.
|
398
|
+
#
|
399
|
+
# @param [Hash{Symbol => Object}] kwargs
|
400
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
285
401
|
#
|
286
402
|
# @yield [agent]
|
287
403
|
# If a block is given, it will be passed the newly created agent
|
@@ -290,11 +406,17 @@ module Spidr
|
|
290
406
|
# @yieldparam [Agent] agent
|
291
407
|
# The newly created agent.
|
292
408
|
#
|
409
|
+
# @return [Agent]
|
410
|
+
# The created agent object.
|
411
|
+
#
|
293
412
|
# @see #initialize
|
294
413
|
#
|
295
|
-
|
296
|
-
|
414
|
+
# @since 0.7.0
|
415
|
+
#
|
416
|
+
def self.domain(name,**kwargs,&block)
|
417
|
+
agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
|
297
418
|
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
419
|
+
return agent
|
298
420
|
end
|
299
421
|
|
300
422
|
#
|
@@ -314,10 +436,10 @@ module Spidr
|
|
314
436
|
#
|
315
437
|
# Sets the proxy information that the agent uses.
|
316
438
|
#
|
317
|
-
# @param [Proxy] new_proxy
|
439
|
+
# @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
|
318
440
|
# The new proxy information.
|
319
441
|
#
|
320
|
-
# @return [
|
442
|
+
# @return [Proxy]
|
321
443
|
# The new proxy information.
|
322
444
|
#
|
323
445
|
# @see SessionCache#proxy=
|
@@ -534,7 +656,7 @@ module Spidr
|
|
534
656
|
def enqueue(url,level=0)
|
535
657
|
url = sanitize_url(url)
|
536
658
|
|
537
|
-
if (!
|
659
|
+
if (!queued?(url) && visit?(url))
|
538
660
|
link = url.to_s
|
539
661
|
|
540
662
|
begin
|
@@ -633,7 +755,7 @@ module Spidr
|
|
633
755
|
end
|
634
756
|
|
635
757
|
#
|
636
|
-
# Visits a given URL, and
|
758
|
+
# Visits a given URL, and enqueues the links recovered from the URL
|
637
759
|
# to be visited later.
|
638
760
|
#
|
639
761
|
# @param [URI::HTTP, String] url
|
data/lib/spidr/auth_store.rb
CHANGED
data/lib/spidr/page/content_types.rb
CHANGED
@@ -221,5 +221,56 @@ module Spidr
|
|
221
221
|
def zip?
|
222
222
|
is_content_type?('application/zip')
|
223
223
|
end
|
224
|
+
|
225
|
+
#
|
226
|
+
# Determines if the page is a PNG image.
|
227
|
+
#
|
228
|
+
# @return [Boolean]
|
229
|
+
# Specifies whether the page is a PNG image.
|
230
|
+
#
|
231
|
+
# @since 0.7.0
|
232
|
+
#
|
233
|
+
def png?
|
234
|
+
is_content_type?('image/png')
|
235
|
+
end
|
236
|
+
|
237
|
+
#
|
238
|
+
# Determines if the page is a GIF image.
|
239
|
+
#
|
240
|
+
# @return [Boolean]
|
241
|
+
# Specifies whether the page is a GIF image.
|
242
|
+
#
|
243
|
+
# @since 0.7.0
|
244
|
+
#
|
245
|
+
def gif?
|
246
|
+
is_content_type?('image/gif')
|
247
|
+
end
|
248
|
+
|
249
|
+
#
|
250
|
+
# Determines if the page is a JPEG image.
|
251
|
+
#
|
252
|
+
# @return [Boolean]
|
253
|
+
# Specifies whether the page is a JPEG image.
|
254
|
+
#
|
255
|
+
# @since 0.7.0
|
256
|
+
#
|
257
|
+
def jpeg?
|
258
|
+
is_content_type?('image/jpeg')
|
259
|
+
end
|
260
|
+
|
261
|
+
#
|
262
|
+
# Determines if the page is an ICO image.
|
263
|
+
#
|
264
|
+
# @return [Boolean]
|
265
|
+
# Specifies whether the page is an ICO image.
|
266
|
+
#
|
267
|
+
# @since 0.7.0
|
268
|
+
#
|
269
|
+
def ico?
|
270
|
+
is_content_type?('image/x-icon') ||
|
271
|
+
is_content_type?('image/vnd.microsoft.icon')
|
272
|
+
end
|
273
|
+
|
274
|
+
alias icon? ico?
|
224
275
|
end
|
225
276
|
end
|
data/lib/spidr/page/html.rb
CHANGED
@@ -105,7 +105,9 @@ module Spidr
|
|
105
105
|
def each_redirect(&block)
|
106
106
|
return enum_for(__method__) unless block
|
107
107
|
|
108
|
-
|
108
|
+
locations = @response.get_fields('Location')
|
109
|
+
|
110
|
+
unless (locations.nil? || locations.empty?)
|
109
111
|
# Location headers override any meta-refresh redirects in the HTML
|
110
112
|
locations.each(&block)
|
111
113
|
else
|
@@ -175,34 +177,30 @@ module Spidr
|
|
175
177
|
#
|
176
178
|
# @since 0.3.0
|
177
179
|
#
|
178
|
-
def each_link
|
180
|
+
def each_link(&block)
|
179
181
|
return enum_for(__method__) unless block_given?
|
180
182
|
|
181
|
-
|
182
|
-
yield url unless (url.nil? || url.empty?)
|
183
|
-
}
|
184
|
-
|
185
|
-
each_redirect(&filter) if is_redirect?
|
183
|
+
each_redirect(&block) if is_redirect?
|
186
184
|
|
187
185
|
if (html? && doc)
|
188
|
-
doc.search('//a[@href]').each do |a|
|
189
|
-
|
186
|
+
doc.search('//a[@href[string()]]').each do |a|
|
187
|
+
yield a.get_attribute('href')
|
190
188
|
end
|
191
189
|
|
192
|
-
doc.search('//frame[@src]').each do |iframe|
|
193
|
-
|
190
|
+
doc.search('//frame[@src[string()]]').each do |iframe|
|
191
|
+
yield iframe.get_attribute('src')
|
194
192
|
end
|
195
193
|
|
196
|
-
doc.search('//iframe[@src]').each do |iframe|
|
197
|
-
|
194
|
+
doc.search('//iframe[@src[string()]]').each do |iframe|
|
195
|
+
yield iframe.get_attribute('src')
|
198
196
|
end
|
199
197
|
|
200
|
-
doc.search('//link[@href]').each do |link|
|
201
|
-
|
198
|
+
doc.search('//link[@href[string()]]').each do |link|
|
199
|
+
yield link.get_attribute('href')
|
202
200
|
end
|
203
201
|
|
204
|
-
doc.search('//script[@src]').each do |script|
|
205
|
-
|
202
|
+
doc.search('//script[@src[string()]]').each do |script|
|
203
|
+
yield script.get_attribute('src')
|
206
204
|
end
|
207
205
|
end
|
208
206
|
end
|
@@ -211,7 +209,7 @@ module Spidr
|
|
211
209
|
# The links from within the page.
|
212
210
|
#
|
213
211
|
# @return [Array<String>]
|
214
|
-
# All links within the HTML page, frame
|
212
|
+
# All links within the HTML page, `frame`/`iframe` source URLs and any
|
215
213
|
# links in the `Location` header.
|
216
214
|
#
|
217
215
|
def links
|
data/lib/spidr/page/status_codes.rb
CHANGED
@@ -22,16 +22,6 @@ module Spidr
|
|
22
22
|
|
23
23
|
alias ok? is_ok?
|
24
24
|
|
25
|
-
#
|
26
|
-
# Determines if the response code is `308`.
|
27
|
-
#
|
28
|
-
# @return [Boolean]
|
29
|
-
# Specifies whether the response code is `308`.
|
30
|
-
#
|
31
|
-
def timedout?
|
32
|
-
code == 308
|
33
|
-
end
|
34
|
-
|
35
25
|
#
|
36
26
|
# Determines if the response code is `400`.
|
37
27
|
#
|
@@ -78,6 +68,18 @@ module Spidr
|
|
78
68
|
|
79
69
|
alias missing? is_missing?
|
80
70
|
|
71
|
+
#
|
72
|
+
# Determines if the response code is `408`.
|
73
|
+
#
|
74
|
+
# @return [Boolean]
|
75
|
+
# Specifies whether the response code is `408`.
|
76
|
+
#
|
77
|
+
def is_timedout?
|
78
|
+
code == 408
|
79
|
+
end
|
80
|
+
|
81
|
+
alias timedout? is_timedout?
|
82
|
+
|
81
83
|
#
|
82
84
|
# Determines if the response code is `500`.
|
83
85
|
#
|
data/lib/spidr/proxy.rb
CHANGED
@@ -10,28 +10,20 @@ module Spidr
|
|
10
10
|
#
|
11
11
|
# Initializes the proxy.
|
12
12
|
#
|
13
|
-
# @param [
|
14
|
-
# Attributes for the proxy.
|
15
|
-
#
|
16
|
-
# @option attributes [String] :host
|
13
|
+
# @param [String] host
|
17
14
|
# The host the proxy is running on.
|
18
15
|
#
|
19
|
-
# @
|
16
|
+
# @param [Integer] port
|
20
17
|
# The port the proxy is running on.
|
21
18
|
#
|
22
|
-
# @
|
19
|
+
# @param [String] user
|
23
20
|
# The user to authenticate as with the proxy.
|
24
21
|
#
|
25
|
-
# @
|
22
|
+
# @param [String] password
|
26
23
|
# The password to authenticate with.
|
27
24
|
#
|
28
|
-
def initialize(
|
29
|
-
super(
|
30
|
-
attributes[:host],
|
31
|
-
attributes.fetch(:port,DEFAULT_PORT),
|
32
|
-
attributes[:user],
|
33
|
-
attributes[:password]
|
34
|
-
)
|
25
|
+
def initialize(host: nil, port: DEFAULT_PORT, user: nil, password: nil)
|
26
|
+
super(host,port,user,password)
|
35
27
|
end
|
36
28
|
|
37
29
|
#
|
data/lib/spidr/rules.rb
CHANGED
@@ -14,21 +14,18 @@ module Spidr
|
|
14
14
|
#
|
15
15
|
# Creates a new Rules object.
|
16
16
|
#
|
17
|
-
# @param [
|
18
|
-
# Additional options.
|
19
|
-
#
|
20
|
-
# @option options [Array<String, Regexp, Proc>] :accept
|
17
|
+
# @param [Array<String, Regexp, Proc>, nil] accept
|
21
18
|
# The patterns to accept data with.
|
22
19
|
#
|
23
|
-
# @
|
20
|
+
# @param [Array<String, Regexp, Proc>, nil] reject
|
24
21
|
# The patterns to reject data with.
|
25
22
|
#
|
26
|
-
def initialize(
|
23
|
+
def initialize(accept: nil, reject: nil)
|
27
24
|
@accept = []
|
28
25
|
@reject = []
|
29
26
|
|
30
|
-
@accept +=
|
31
|
-
@reject +=
|
27
|
+
@accept += accept if accept
|
28
|
+
@reject += reject if reject
|
32
29
|
end
|
33
30
|
|
34
31
|
#
|