spidr 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +17 -0
- data/Gemfile +8 -5
- data/LICENSE.txt +1 -1
- data/README.md +137 -78
- data/Rakefile +1 -0
- data/gemspec.yml +8 -1
- data/lib/spidr/agent/actions.rb +1 -1
- data/lib/spidr/agent/events.rb +1 -1
- data/lib/spidr/agent/filters.rb +55 -56
- data/lib/spidr/agent/sanitizers.rb +6 -9
- data/lib/spidr/agent.rb +230 -120
- data/lib/spidr/auth_store.rb +10 -6
- data/lib/spidr/page/content_types.rb +51 -0
- data/lib/spidr/page/html.rb +17 -19
- data/lib/spidr/page/status_codes.rb +12 -10
- data/lib/spidr/proxy.rb +6 -14
- data/lib/spidr/rules.rb +5 -8
- data/lib/spidr/session_cache.rb +23 -21
- data/lib/spidr/settings/proxy.rb +19 -5
- data/lib/spidr/spidr.rb +16 -6
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +357 -10
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- metadata +19 -19
- data/.travis.yml +0 -14
data/lib/spidr/agent.rb
CHANGED
@@ -19,12 +19,12 @@ module Spidr
|
|
19
19
|
|
20
20
|
include Settings::UserAgent
|
21
21
|
|
22
|
-
# HTTP Host Header to use
|
22
|
+
# HTTP `Host` Header to use
|
23
23
|
#
|
24
24
|
# @return [String]
|
25
25
|
attr_accessor :host_header
|
26
26
|
|
27
|
-
# HTTP Host Headers to use for specific hosts
|
27
|
+
# HTTP `Host` Headers to use for specific hosts
|
28
28
|
#
|
29
29
|
# @return [Hash{String,Regexp => String}]
|
30
30
|
attr_reader :host_headers
|
@@ -96,70 +96,110 @@ module Spidr
|
|
96
96
|
#
|
97
97
|
# Creates a new Agent object.
|
98
98
|
#
|
99
|
-
# @param [
|
100
|
-
#
|
99
|
+
# @param [String, nil] host_header
|
100
|
+
# The HTTP `Host` header to use with each request.
|
101
101
|
#
|
102
|
-
# @
|
103
|
-
#
|
102
|
+
# @param [Hash{String,Regexp => String}] host_headers
|
103
|
+
# The HTTP `Host` headers to use for specific hosts.
|
104
104
|
#
|
105
|
-
# @
|
105
|
+
# @param [Hash{String => String}] default_headers
|
106
|
+
# Default headers to set for every request.
|
107
|
+
#
|
108
|
+
# @param [String, nil] user_agent
|
109
|
+
# The `User-Agent` string to send with each request.
|
110
|
+
#
|
111
|
+
# @param [String, nil] referer
|
112
|
+
# The `Referer` URL to send with each request.
|
113
|
+
#
|
114
|
+
# @param [Integer, nil] open_timeout
|
115
|
+
# Optional open connection timeout.
|
116
|
+
#
|
117
|
+
# @param [Integer, nil] read_timeout
|
106
118
|
# Optional read timeout.
|
107
119
|
#
|
108
|
-
# @
|
109
|
-
# Optional
|
120
|
+
# @param [Integer, nil] ssl_timeout
|
121
|
+
# Optional SSL connection timeout.
|
110
122
|
#
|
111
|
-
# @
|
123
|
+
# @param [Integer, nil] continue_timeout
|
112
124
|
# Optional continue timeout.
|
113
125
|
#
|
114
|
-
# @
|
115
|
-
# Optional
|
126
|
+
# @param [Integer, nil] keep_alive_timeout
|
127
|
+
# Optional `Keep-Alive` timeout.
|
116
128
|
#
|
117
|
-
# @
|
129
|
+
# @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
|
118
130
|
# The proxy information to use.
|
119
131
|
#
|
120
|
-
# @option
|
132
|
+
# @option proxy [String] :host
|
121
133
|
# The host the proxy is running on.
|
122
134
|
#
|
123
|
-
# @option
|
135
|
+
# @option proxy [Integer] :port (8080)
|
124
136
|
# The port the proxy is running on.
|
125
137
|
#
|
126
|
-
# @option
|
138
|
+
# @option proxy [String, nil] :user
|
127
139
|
# The user to authenticate as with the proxy.
|
128
140
|
#
|
129
|
-
# @option
|
141
|
+
# @option proxy [String, nil] :password
|
130
142
|
# The password to authenticate with.
|
131
143
|
#
|
132
|
-
# @
|
133
|
-
#
|
144
|
+
# @param [Integer] delay
|
145
|
+
# The number of seconds to pause between each request.
|
146
|
+
#
|
147
|
+
# @param [Integer, nil] limit
|
148
|
+
# The maximum number of pages to visit.
|
134
149
|
#
|
135
|
-
# @
|
136
|
-
# The
|
150
|
+
# @param [Integer, nil] max_depth
|
151
|
+
# The maximum link depth to follow.
|
137
152
|
#
|
138
|
-
# @
|
139
|
-
# The
|
153
|
+
# @param [Set, Array, nil] queue
|
154
|
+
# The initial queue of URLs to visit.
|
140
155
|
#
|
141
|
-
# @
|
142
|
-
# The
|
156
|
+
# @param [Set, Array, nil] history
|
157
|
+
# The initial list of visited URLs.
|
143
158
|
#
|
144
|
-
# @
|
145
|
-
#
|
159
|
+
# @param [Boolean] strip_fragments
|
160
|
+
# Controls whether to strip the fragment components from the URLs.
|
146
161
|
#
|
147
|
-
# @
|
148
|
-
#
|
162
|
+
# @param [Boolean] strip_query
|
163
|
+
# Controls whether to strip the query components from the URLs.
|
149
164
|
#
|
150
|
-
# @
|
151
|
-
# The
|
165
|
+
# @param [Array<String>] schemes
|
166
|
+
# The list of acceptable URI schemes to visit.
|
167
|
+
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
152
168
|
#
|
153
|
-
# @
|
154
|
-
# The
|
169
|
+
# @param [String] host
|
170
|
+
# The host-name to visit.
|
155
171
|
#
|
156
|
-
# @
|
157
|
-
# The
|
172
|
+
# @param [Array<String, Regexp, Proc>] hosts
|
173
|
+
# The patterns which match the host-names to visit.
|
158
174
|
#
|
159
|
-
# @
|
160
|
-
# The
|
175
|
+
# @param [Array<String, Regexp, Proc>] ignore_hosts
|
176
|
+
# The patterns which match the host-names to not visit.
|
177
|
+
#
|
178
|
+
# @param [Array<Integer, Regexp, Proc>] ports
|
179
|
+
# The patterns which match the ports to visit.
|
161
180
|
#
|
162
|
-
# @
|
181
|
+
# @param [Array<Integer, Regexp, Proc>] ignore_ports
|
182
|
+
# The patterns which match the ports to not visit.
|
183
|
+
#
|
184
|
+
# @param [Array<String, Regexp, Proc>] links
|
185
|
+
# The patterns which match the links to visit.
|
186
|
+
#
|
187
|
+
# @param [Array<String, Regexp, Proc>] ignore_links
|
188
|
+
# The patterns which match the links to not visit.
|
189
|
+
#
|
190
|
+
# @param [Array<String, Regexp, Proc>] urls
|
191
|
+
# The patterns which match the URLs to visit.
|
192
|
+
#
|
193
|
+
# @param [Array<String, Regexp, Proc>] ignore_urls
|
194
|
+
# The patterns which match the URLs to not visit.
|
195
|
+
#
|
196
|
+
# @param [Array<String, Regexp, Proc>] exts
|
197
|
+
# The patterns which match the URI path extensions to visit.
|
198
|
+
#
|
199
|
+
# @param [Array<String, Regexp, Proc>] ignore_exts
|
200
|
+
# The patterns which match the URI path extensions to not visit.
|
201
|
+
#
|
202
|
+
# @param [Boolean] robots
|
163
203
|
# Specifies whether `robots.txt` should be honored.
|
164
204
|
#
|
165
205
|
# @yield [agent]
|
@@ -169,58 +209,99 @@ module Spidr
|
|
169
209
|
# @yieldparam [Agent] agent
|
170
210
|
# The newly created agent.
|
171
211
|
#
|
172
|
-
#
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
212
|
+
def initialize(# header keyword arguments
|
213
|
+
host_header: nil,
|
214
|
+
host_headers: {},
|
215
|
+
default_headers: {},
|
216
|
+
user_agent: Spidr.user_agent,
|
217
|
+
referer: nil,
|
218
|
+
# session cache keyword arguments
|
219
|
+
proxy: Spidr.proxy,
|
220
|
+
open_timeout: Spidr.open_timeout,
|
221
|
+
ssl_timeout: Spidr.ssl_timeout,
|
222
|
+
read_timeout: Spidr.read_timeout,
|
223
|
+
continue_timeout: Spidr.continue_timeout,
|
224
|
+
keep_alive_timeout: Spidr.keep_alive_timeout,
|
225
|
+
# spidering controls keyword arguments
|
226
|
+
delay: 0,
|
227
|
+
limit: nil,
|
228
|
+
max_depth: nil,
|
229
|
+
# history keyword arguments
|
230
|
+
queue: nil,
|
231
|
+
history: nil,
|
232
|
+
# sanitizer keyword arguments
|
233
|
+
strip_fragments: true,
|
234
|
+
strip_query: false,
|
235
|
+
# filtering keyword arguments
|
236
|
+
schemes: self.class.default_schemes,
|
237
|
+
host: nil,
|
238
|
+
hosts: nil,
|
239
|
+
ignore_hosts: nil,
|
240
|
+
ports: nil,
|
241
|
+
ignore_ports: nil,
|
242
|
+
links: nil,
|
243
|
+
ignore_links: nil,
|
244
|
+
urls: nil,
|
245
|
+
ignore_urls: nil,
|
246
|
+
exts: nil,
|
247
|
+
ignore_exts: nil,
|
248
|
+
# robots keyword arguments
|
249
|
+
robots: Spidr.robots?)
|
250
|
+
@host_header = host_header
|
251
|
+
@host_headers = host_headers
|
252
|
+
|
253
|
+
@default_headers = default_headers
|
254
|
+
|
255
|
+
@user_agent = user_agent
|
256
|
+
@referer = referer
|
257
|
+
|
258
|
+
@sessions = SessionCache.new(
|
259
|
+
proxy: proxy,
|
260
|
+
open_timeout: open_timeout,
|
261
|
+
ssl_timeout: ssl_timeout,
|
262
|
+
read_timeout: read_timeout,
|
263
|
+
continue_timeout: continue_timeout,
|
264
|
+
keep_alive_timeout: keep_alive_timeout
|
265
|
+
)
|
195
266
|
@cookies = CookieJar.new
|
196
267
|
@authorized = AuthStore.new
|
197
268
|
|
198
269
|
@running = false
|
199
|
-
@delay =
|
270
|
+
@delay = delay
|
200
271
|
@history = Set[]
|
201
272
|
@failures = Set[]
|
202
273
|
@queue = []
|
203
274
|
|
204
|
-
@limit =
|
275
|
+
@limit = limit
|
205
276
|
@levels = Hash.new(0)
|
206
|
-
@max_depth =
|
207
|
-
|
208
|
-
if
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
277
|
+
@max_depth = max_depth
|
278
|
+
|
279
|
+
self.queue = queue if queue
|
280
|
+
self.history = history if history
|
281
|
+
|
282
|
+
initialize_sanitizers(
|
283
|
+
strip_fragments: strip_fragments,
|
284
|
+
strip_query: strip_query
|
285
|
+
)
|
286
|
+
|
287
|
+
initialize_filters(
|
288
|
+
schemes: schemes,
|
289
|
+
host: host,
|
290
|
+
hosts: hosts,
|
291
|
+
ignore_hosts: ignore_hosts,
|
292
|
+
ports: ports,
|
293
|
+
ignore_ports: ignore_ports,
|
294
|
+
links: links,
|
295
|
+
ignore_links: ignore_links,
|
296
|
+
urls: urls,
|
297
|
+
ignore_urls: ignore_urls,
|
298
|
+
exts: exts,
|
299
|
+
ignore_exts: ignore_exts
|
300
|
+
)
|
301
|
+
initialize_actions
|
302
|
+
initialize_events
|
303
|
+
|
304
|
+
initialize_robots if robots
|
224
305
|
|
225
306
|
yield self if block_given?
|
226
307
|
end
|
@@ -231,8 +312,8 @@ module Spidr
|
|
231
312
|
# @param [URI::HTTP, String] url
|
232
313
|
# The URL to start spidering at.
|
233
314
|
#
|
234
|
-
# @param [Hash]
|
235
|
-
# Additional
|
315
|
+
# @param [Hash{Symbol => Object}] kwargs
|
316
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
236
317
|
#
|
237
318
|
# @yield [agent]
|
238
319
|
# If a block is given, it will be passed the newly created agent
|
@@ -241,12 +322,16 @@ module Spidr
|
|
241
322
|
# @yieldparam [Agent] agent
|
242
323
|
# The newly created agent.
|
243
324
|
#
|
325
|
+
# @return [Agent]
|
326
|
+
# The created agent object.
|
327
|
+
#
|
244
328
|
# @see #initialize
|
245
329
|
# @see #start_at
|
246
330
|
#
|
247
|
-
def self.start_at(url
|
248
|
-
agent = new(
|
331
|
+
def self.start_at(url,**kwargs,&block)
|
332
|
+
agent = new(**kwargs,&block)
|
249
333
|
agent.start_at(url)
|
334
|
+
return agent
|
250
335
|
end
|
251
336
|
|
252
337
|
#
|
@@ -255,8 +340,8 @@ module Spidr
|
|
255
340
|
# @param [URI::HTTP, String] url
|
256
341
|
# The web-site to spider.
|
257
342
|
#
|
258
|
-
# @param [Hash]
|
259
|
-
# Additional
|
343
|
+
# @param [Hash{Symbol => Object}] kwargs
|
344
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
260
345
|
#
|
261
346
|
# @yield [agent]
|
262
347
|
# If a block is given, it will be passed the newly created agent
|
@@ -265,13 +350,17 @@ module Spidr
|
|
265
350
|
# @yieldparam [Agent] agent
|
266
351
|
# The newly created agent.
|
267
352
|
#
|
353
|
+
# @return [Agent]
|
354
|
+
# The created agent object.
|
355
|
+
#
|
268
356
|
# @see #initialize
|
269
357
|
#
|
270
|
-
def self.site(url
|
271
|
-
url = URI(url
|
358
|
+
def self.site(url,**kwargs,&block)
|
359
|
+
url = URI(url)
|
272
360
|
|
273
|
-
agent = new(
|
361
|
+
agent = new(host: url.host, **kwargs, &block)
|
274
362
|
agent.start_at(url)
|
363
|
+
return agent
|
275
364
|
end
|
276
365
|
|
277
366
|
#
|
@@ -280,8 +369,8 @@ module Spidr
|
|
280
369
|
# @param [String] name
|
281
370
|
# The host-name to spider.
|
282
371
|
#
|
283
|
-
# @param [Hash]
|
284
|
-
# Additional
|
372
|
+
# @param [Hash{Symbol => Object}] kwargs
|
373
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
285
374
|
#
|
286
375
|
# @yield [agent]
|
287
376
|
# If a block is given, it will be passed the newly created agent
|
@@ -290,11 +379,44 @@ module Spidr
|
|
290
379
|
# @yieldparam [Agent] agent
|
291
380
|
# The newly created agent.
|
292
381
|
#
|
382
|
+
# @return [Agent]
|
383
|
+
# The created agent object.
|
384
|
+
#
|
293
385
|
# @see #initialize
|
294
386
|
#
|
295
|
-
def self.host(name
|
296
|
-
agent = new(
|
387
|
+
def self.host(name,**kwargs,&block)
|
388
|
+
agent = new(host: name, **kwargs, &block)
|
297
389
|
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
390
|
+
return agent
|
391
|
+
end
|
392
|
+
|
393
|
+
#
|
394
|
+
# Creates a new agent and spiders the entire domain.
|
395
|
+
#
|
396
|
+
# @param [String] name
|
397
|
+
# The domain name to spider (its sub-domains are matched as well).
|
398
|
+
#
|
399
|
+
# @param [Hash{Symbol => Object}] kwargs
|
400
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
401
|
+
#
|
402
|
+
# @yield [agent]
|
403
|
+
# If a block is given, it will be passed the newly created agent
|
404
|
+
# before it begins spidering.
|
405
|
+
#
|
406
|
+
# @yieldparam [Agent] agent
|
407
|
+
# The newly created agent.
|
408
|
+
#
|
409
|
+
# @return [Agent]
|
410
|
+
# The created agent object.
|
411
|
+
#
|
412
|
+
# @see #initialize
|
413
|
+
#
|
414
|
+
# @since 0.7.0
|
415
|
+
#
|
416
|
+
def self.domain(name,**kwargs,&block)
|
417
|
+
agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
|
418
|
+
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
419
|
+
return agent
|
298
420
|
end
|
299
421
|
|
300
422
|
#
|
@@ -314,10 +436,10 @@ module Spidr
|
|
314
436
|
#
|
315
437
|
# Sets the proxy information that the agent uses.
|
316
438
|
#
|
317
|
-
# @param [Proxy] new_proxy
|
439
|
+
# @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
|
318
440
|
# The new proxy information.
|
319
441
|
#
|
320
|
-
# @return [
|
442
|
+
# @return [Proxy]
|
321
443
|
# The new proxy information.
|
322
444
|
#
|
323
445
|
# @see SessionCache#proxy=
|
@@ -408,9 +530,7 @@ module Spidr
|
|
408
530
|
@history.clear
|
409
531
|
|
410
532
|
new_history.each do |url|
|
411
|
-
|
412
|
-
|
413
|
-
@history << url
|
533
|
+
@history << URI(url)
|
414
534
|
end
|
415
535
|
|
416
536
|
return @history
|
@@ -425,7 +545,7 @@ module Spidr
|
|
425
545
|
# The links which have been visited.
|
426
546
|
#
|
427
547
|
def visited_links
|
428
|
-
@history.map
|
548
|
+
@history.map(&:to_s)
|
429
549
|
end
|
430
550
|
|
431
551
|
#
|
@@ -435,7 +555,7 @@ module Spidr
|
|
435
555
|
# The hosts which have been visited.
|
436
556
|
#
|
437
557
|
def visited_hosts
|
438
|
-
visited_urls.map
|
558
|
+
visited_urls.map(&:host).uniq
|
439
559
|
end
|
440
560
|
|
441
561
|
#
|
@@ -448,9 +568,7 @@ module Spidr
|
|
448
568
|
# Specifies whether a URL was visited.
|
449
569
|
#
|
450
570
|
def visited?(url)
|
451
|
-
|
452
|
-
|
453
|
-
return @history.include?(url)
|
571
|
+
@history.include?(URI(url))
|
454
572
|
end
|
455
573
|
|
456
574
|
#
|
@@ -469,9 +587,7 @@ module Spidr
|
|
469
587
|
@failures.clear
|
470
588
|
|
471
589
|
new_failures.each do |url|
|
472
|
-
|
473
|
-
|
474
|
-
@failures << url
|
590
|
+
@failures << URI(url)
|
475
591
|
end
|
476
592
|
|
477
593
|
return @failures
|
@@ -487,9 +603,7 @@ module Spidr
|
|
487
603
|
# Specifies whether the given URL was unable to be visited.
|
488
604
|
#
|
489
605
|
def failed?(url)
|
490
|
-
|
491
|
-
|
492
|
-
return @failures.include?(url)
|
606
|
+
@failures.include?(URI(url))
|
493
607
|
end
|
494
608
|
|
495
609
|
alias pending_urls queue
|
@@ -510,9 +624,7 @@ module Spidr
|
|
510
624
|
@queue.clear
|
511
625
|
|
512
626
|
new_queue.each do |url|
|
513
|
-
|
514
|
-
|
515
|
-
@queue << url
|
627
|
+
@queue << URI(url)
|
516
628
|
end
|
517
629
|
|
518
630
|
return @queue
|
@@ -544,7 +656,7 @@ module Spidr
|
|
544
656
|
def enqueue(url,level=0)
|
545
657
|
url = sanitize_url(url)
|
546
658
|
|
547
|
-
if (!
|
659
|
+
if (!queued?(url) && visit?(url))
|
548
660
|
link = url.to_s
|
549
661
|
|
550
662
|
begin
|
@@ -594,7 +706,7 @@ module Spidr
|
|
594
706
|
# The page for the response, or `nil` if the request failed.
|
595
707
|
#
|
596
708
|
def get_page(url)
|
597
|
-
url = URI(url
|
709
|
+
url = URI(url)
|
598
710
|
|
599
711
|
prepare_request(url) do |session,path,headers|
|
600
712
|
new_page = Page.new(url,session.get(path,headers))
|
@@ -629,7 +741,7 @@ module Spidr
|
|
629
741
|
# @since 0.2.2
|
630
742
|
#
|
631
743
|
def post_page(url,post_data='')
|
632
|
-
url = URI(url
|
744
|
+
url = URI(url)
|
633
745
|
|
634
746
|
prepare_request(url) do |session,path,headers|
|
635
747
|
new_page = Page.new(url,session.post(path,post_data,headers))
|
@@ -643,7 +755,7 @@ module Spidr
|
|
643
755
|
end
|
644
756
|
|
645
757
|
#
|
646
|
-
# Visits a given URL, and
|
758
|
+
# Visits a given URL, and enqueues the links recovered from the URL
|
647
759
|
# to be visited later.
|
648
760
|
#
|
649
761
|
# @param [URI::HTTP, String] url
|
@@ -725,7 +837,7 @@ module Spidr
|
|
725
837
|
|
726
838
|
unless @host_headers.empty?
|
727
839
|
@host_headers.each do |name,header|
|
728
|
-
if host.match(name)
|
840
|
+
if url.host.match(name)
|
729
841
|
headers['Host'] = header
|
730
842
|
break
|
731
843
|
end
|
@@ -769,8 +881,6 @@ module Spidr
|
|
769
881
|
# @since 0.2.2
|
770
882
|
#
|
771
883
|
def prepare_request(url,&block)
|
772
|
-
host = url.host
|
773
|
-
port = url.port
|
774
884
|
path = unless url.path.empty?
|
775
885
|
url.path
|
776
886
|
else
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -34,7 +34,7 @@ module Spidr
|
|
34
34
|
#
|
35
35
|
def [](url)
|
36
36
|
# normalize the url
|
37
|
-
url = URI(url
|
37
|
+
url = URI(url)
|
38
38
|
|
39
39
|
key = [url.scheme, url.host, url.port]
|
40
40
|
paths = @credentials[key]
|
@@ -42,7 +42,7 @@ module Spidr
|
|
42
42
|
return nil unless paths
|
43
43
|
|
44
44
|
# longest path first
|
45
|
-
ordered_paths = paths.keys.sort_by { |
|
45
|
+
ordered_paths = paths.keys.sort_by { |path_key| -path_key.length }
|
46
46
|
|
47
47
|
# directories of the path
|
48
48
|
path_dirs = URI.expand_path(url.path).split('/')
|
@@ -70,7 +70,7 @@ module Spidr
|
|
70
70
|
#
|
71
71
|
def []=(url,auth)
|
72
72
|
# normalize the url
|
73
|
-
url = URI(url
|
73
|
+
url = URI(url)
|
74
74
|
|
75
75
|
# normalize the URL path
|
76
76
|
path = URI.expand_path(url.path)
|
@@ -109,7 +109,7 @@ module Spidr
|
|
109
109
|
# or `nil` if no authorization exists.
|
110
110
|
#
|
111
111
|
# @param [URI] url
|
112
|
-
# The
|
112
|
+
# The URL.
|
113
113
|
#
|
114
114
|
# @return [String, nil]
|
115
115
|
# The base64 encoded authorization string or `nil`.
|
@@ -118,7 +118,7 @@ module Spidr
|
|
118
118
|
#
|
119
119
|
def for_url(url)
|
120
120
|
if (auth = self[url])
|
121
|
-
|
121
|
+
Base64.encode64("#{auth.username}:#{auth.password}")
|
122
122
|
end
|
123
123
|
end
|
124
124
|
|
@@ -144,7 +144,11 @@ module Spidr
|
|
144
144
|
# @since 0.2.2
|
145
145
|
#
|
146
146
|
def size
|
147
|
-
|
147
|
+
total = 0
|
148
|
+
|
149
|
+
@credentials.each_value { |paths| total += paths.length }
|
150
|
+
|
151
|
+
return total
|
148
152
|
end
|
149
153
|
|
150
154
|
#
|
@@ -221,5 +221,56 @@ module Spidr
|
|
221
221
|
def zip?
|
222
222
|
is_content_type?('application/zip')
|
223
223
|
end
|
224
|
+
|
225
|
+
#
|
226
|
+
# Determines if the page is a PNG image.
|
227
|
+
#
|
228
|
+
# @return [Boolean]
|
229
|
+
# Specifies whether the page is a PNG image.
|
230
|
+
#
|
231
|
+
# @since 0.7.0
|
232
|
+
#
|
233
|
+
def png?
|
234
|
+
is_content_type?('image/png')
|
235
|
+
end
|
236
|
+
|
237
|
+
#
|
238
|
+
# Determines if the page is a GIF image.
|
239
|
+
#
|
240
|
+
# @return [Boolean]
|
241
|
+
# Specifies whether the page is a GIF image.
|
242
|
+
#
|
243
|
+
# @since 0.7.0
|
244
|
+
#
|
245
|
+
def gif?
|
246
|
+
is_content_type?('image/gif')
|
247
|
+
end
|
248
|
+
|
249
|
+
#
|
250
|
+
# Determines if the page is a JPEG image.
|
251
|
+
#
|
252
|
+
# @return [Boolean]
|
253
|
+
# Specifies whether the page is a JPEG image.
|
254
|
+
#
|
255
|
+
# @since 0.7.0
|
256
|
+
#
|
257
|
+
def jpeg?
|
258
|
+
is_content_type?('image/jpeg')
|
259
|
+
end
|
260
|
+
|
261
|
+
#
|
262
|
+
# Determines if the page is an ICO image.
|
263
|
+
#
|
264
|
+
# @return [Boolean]
|
265
|
+
# Specifies whether the page is an ICO image.
|
266
|
+
#
|
267
|
+
# @since 0.7.0
|
268
|
+
#
|
269
|
+
def ico?
|
270
|
+
is_content_type?('image/x-icon') ||
|
271
|
+
is_content_type?('image/vnd.microsoft.icon')
|
272
|
+
end
|
273
|
+
|
274
|
+
alias icon? ico?
|
224
275
|
end
|
225
276
|
end
|