spidr 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +17 -0
- data/Gemfile +8 -5
- data/LICENSE.txt +1 -1
- data/README.md +137 -78
- data/Rakefile +1 -0
- data/gemspec.yml +8 -1
- data/lib/spidr/agent/actions.rb +1 -1
- data/lib/spidr/agent/events.rb +1 -1
- data/lib/spidr/agent/filters.rb +55 -56
- data/lib/spidr/agent/sanitizers.rb +6 -9
- data/lib/spidr/agent.rb +230 -120
- data/lib/spidr/auth_store.rb +10 -6
- data/lib/spidr/page/content_types.rb +51 -0
- data/lib/spidr/page/html.rb +17 -19
- data/lib/spidr/page/status_codes.rb +12 -10
- data/lib/spidr/proxy.rb +6 -14
- data/lib/spidr/rules.rb +5 -8
- data/lib/spidr/session_cache.rb +23 -21
- data/lib/spidr/settings/proxy.rb +19 -5
- data/lib/spidr/spidr.rb +16 -6
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +357 -10
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- metadata +19 -19
- data/.travis.yml +0 -14
data/lib/spidr/agent.rb
CHANGED
@@ -19,12 +19,12 @@ module Spidr
|
|
19
19
|
|
20
20
|
include Settings::UserAgent
|
21
21
|
|
22
|
-
# HTTP Host Header to use
|
22
|
+
# HTTP `Host` Header to use
|
23
23
|
#
|
24
24
|
# @return [String]
|
25
25
|
attr_accessor :host_header
|
26
26
|
|
27
|
-
# HTTP Host Headers to use for specific hosts
|
27
|
+
# HTTP `Host` Headers to use for specific hosts
|
28
28
|
#
|
29
29
|
# @return [Hash{String,Regexp => String}]
|
30
30
|
attr_reader :host_headers
|
@@ -96,70 +96,110 @@ module Spidr
|
|
96
96
|
#
|
97
97
|
# Creates a new Agent object.
|
98
98
|
#
|
99
|
-
# @param [
|
100
|
-
#
|
99
|
+
# @param [String, nil] host_header
|
100
|
+
# The HTTP `Host` header to use with each request.
|
101
101
|
#
|
102
|
-
# @
|
103
|
-
#
|
102
|
+
# @param [Hash{String,Regexp => String}] host_headers
|
103
|
+
# The HTTP `Host` headers to use for specific hosts.
|
104
104
|
#
|
105
|
-
# @
|
105
|
+
# @param [Hash{String => String}] default_headers
|
106
|
+
# Default headers to set for every request.
|
107
|
+
#
|
108
|
+
# @param [String, nil] user_agent
|
109
|
+
# The `User-Agent` string to send with each request.
|
110
|
+
#
|
111
|
+
# @param [String, nil] referer
|
112
|
+
# The `Referer` URL to send with each request.
|
113
|
+
#
|
114
|
+
# @param [Integer, nil] open_timeout
|
115
|
+
# Optional open connection timeout.
|
116
|
+
#
|
117
|
+
# @param [Integer, nil] read_timeout
|
106
118
|
# Optional read timeout.
|
107
119
|
#
|
108
|
-
# @
|
109
|
-
# Optional
|
120
|
+
# @param [Integer, nil] ssl_timeout
|
121
|
+
# Optional SSL connection timeout.
|
110
122
|
#
|
111
|
-
# @
|
123
|
+
# @param [Integer, nil] continue_timeout
|
112
124
|
# Optional continue timeout.
|
113
125
|
#
|
114
|
-
# @
|
115
|
-
# Optional
|
126
|
+
# @param [Integer, nil] keep_alive_timeout
|
127
|
+
# Optional `Keep-Alive` timeout.
|
116
128
|
#
|
117
|
-
# @
|
129
|
+
# @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
|
118
130
|
# The proxy information to use.
|
119
131
|
#
|
120
|
-
# @option
|
132
|
+
# @option proxy [String] :host
|
121
133
|
# The host the proxy is running on.
|
122
134
|
#
|
123
|
-
# @option
|
135
|
+
# @option proxy [Integer] :port (8080)
|
124
136
|
# The port the proxy is running on.
|
125
137
|
#
|
126
|
-
# @option
|
138
|
+
# @option proxy [String, nil] :user
|
127
139
|
# The user to authenticate as with the proxy.
|
128
140
|
#
|
129
|
-
# @option
|
141
|
+
# @option proxy [String, nil] :password
|
130
142
|
# The password to authenticate with.
|
131
143
|
#
|
132
|
-
# @
|
133
|
-
#
|
144
|
+
# @param [Integer] delay
|
145
|
+
# The number of seconds to pause between each request.
|
146
|
+
#
|
147
|
+
# @param [Integer, nil] limit
|
148
|
+
# The maximum number of pages to visit.
|
134
149
|
#
|
135
|
-
# @
|
136
|
-
# The
|
150
|
+
# @param [Integer, nil] max_depth
|
151
|
+
# The maximum link depth to follow.
|
137
152
|
#
|
138
|
-
# @
|
139
|
-
# The
|
153
|
+
# @param [Set, Array, nil] queue
|
154
|
+
# The initial queue of URLs to visit.
|
140
155
|
#
|
141
|
-
# @
|
142
|
-
# The
|
156
|
+
# @param [Set, Array, nil] history
|
157
|
+
# The initial list of visited URLs.
|
143
158
|
#
|
144
|
-
# @
|
145
|
-
#
|
159
|
+
# @param [Boolean] strip_fragments
|
160
|
+
# Controls whether to strip the fragment components from the URLs.
|
146
161
|
#
|
147
|
-
# @
|
148
|
-
#
|
162
|
+
# @param [Boolean] strip_query
|
163
|
+
# Controls whether to strip the query components from the URLs.
|
149
164
|
#
|
150
|
-
# @
|
151
|
-
# The
|
165
|
+
# @param [Array<String>] schemes
|
166
|
+
# The list of acceptable URI schemes to visit.
|
167
|
+
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
152
168
|
#
|
153
|
-
# @
|
154
|
-
# The
|
169
|
+
# @param [String] host
|
170
|
+
# The host-name to visit.
|
155
171
|
#
|
156
|
-
# @
|
157
|
-
# The
|
172
|
+
# @param [Array<String, Regexp, Proc>] hosts
|
173
|
+
# The patterns which match the host-names to visit.
|
158
174
|
#
|
159
|
-
# @
|
160
|
-
# The
|
175
|
+
# @param [Array<String, Regexp, Proc>] ignore_hosts
|
176
|
+
# The patterns which match the host-names to not visit.
|
177
|
+
#
|
178
|
+
# @param [Array<Integer, Regexp, Proc>] ports
|
179
|
+
# The patterns which match the ports to visit.
|
161
180
|
#
|
162
|
-
# @
|
181
|
+
# @param [Array<Integer, Regexp, Proc>] ignore_ports
|
182
|
+
# The patterns which match the ports to not visit.
|
183
|
+
#
|
184
|
+
# @param [Array<String, Regexp, Proc>] links
|
185
|
+
# The patterns which match the links to visit.
|
186
|
+
#
|
187
|
+
# @param [Array<String, Regexp, Proc>] ignore_links
|
188
|
+
# The patterns which match the links to not visit.
|
189
|
+
#
|
190
|
+
# @param [Array<String, Regexp, Proc>] urls
|
191
|
+
# The patterns which match the URLs to visit.
|
192
|
+
#
|
193
|
+
# @param [Array<String, Regexp, Proc>] ignore_urls
|
194
|
+
# The patterns which match the URLs to not visit.
|
195
|
+
#
|
196
|
+
# @param [Array<String, Regexp, Proc>] exts
|
197
|
+
# The patterns which match the URI path extensions to visit.
|
198
|
+
#
|
199
|
+
# @param [Array<String, Regexp, Proc>] ignore_exts
|
200
|
+
# The patterns which match the URI path extensions to not visit.
|
201
|
+
#
|
202
|
+
# @param [Boolean] robots
|
163
203
|
# Specifies whether `robots.txt` should be honored.
|
164
204
|
#
|
165
205
|
# @yield [agent]
|
@@ -169,58 +209,99 @@ module Spidr
|
|
169
209
|
# @yieldparam [Agent] agent
|
170
210
|
# The newly created agent.
|
171
211
|
#
|
172
|
-
#
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
212
|
+
def initialize(# header keyword arguments
|
213
|
+
host_header: nil,
|
214
|
+
host_headers: {},
|
215
|
+
default_headers: {},
|
216
|
+
user_agent: Spidr.user_agent,
|
217
|
+
referer: nil,
|
218
|
+
# session cache keyword arguments
|
219
|
+
proxy: Spidr.proxy,
|
220
|
+
open_timeout: Spidr.open_timeout,
|
221
|
+
ssl_timeout: Spidr.ssl_timeout,
|
222
|
+
read_timeout: Spidr.read_timeout,
|
223
|
+
continue_timeout: Spidr.continue_timeout,
|
224
|
+
keep_alive_timeout: Spidr.keep_alive_timeout,
|
225
|
+
# spidering controls keyword arguments
|
226
|
+
delay: 0,
|
227
|
+
limit: nil,
|
228
|
+
max_depth: nil,
|
229
|
+
# history keyword arguments
|
230
|
+
queue: nil,
|
231
|
+
history: nil,
|
232
|
+
# sanitizer keyword arguments
|
233
|
+
strip_fragments: true,
|
234
|
+
strip_query: false,
|
235
|
+
# filtering keyword arguments
|
236
|
+
schemes: self.class.default_schemes,
|
237
|
+
host: nil,
|
238
|
+
hosts: nil,
|
239
|
+
ignore_hosts: nil,
|
240
|
+
ports: nil,
|
241
|
+
ignore_ports: nil,
|
242
|
+
links: nil,
|
243
|
+
ignore_links: nil,
|
244
|
+
urls: nil,
|
245
|
+
ignore_urls: nil,
|
246
|
+
exts: nil,
|
247
|
+
ignore_exts: nil,
|
248
|
+
# robots keyword arguments
|
249
|
+
robots: Spidr.robots?)
|
250
|
+
@host_header = host_header
|
251
|
+
@host_headers = host_headers
|
252
|
+
|
253
|
+
@default_headers = default_headers
|
254
|
+
|
255
|
+
@user_agent = user_agent
|
256
|
+
@referer = referer
|
257
|
+
|
258
|
+
@sessions = SessionCache.new(
|
259
|
+
proxy: proxy,
|
260
|
+
open_timeout: open_timeout,
|
261
|
+
ssl_timeout: ssl_timeout,
|
262
|
+
read_timeout: read_timeout,
|
263
|
+
continue_timeout: continue_timeout,
|
264
|
+
keep_alive_timeout: keep_alive_timeout
|
265
|
+
)
|
195
266
|
@cookies = CookieJar.new
|
196
267
|
@authorized = AuthStore.new
|
197
268
|
|
198
269
|
@running = false
|
199
|
-
@delay =
|
270
|
+
@delay = delay
|
200
271
|
@history = Set[]
|
201
272
|
@failures = Set[]
|
202
273
|
@queue = []
|
203
274
|
|
204
|
-
@limit =
|
275
|
+
@limit = limit
|
205
276
|
@levels = Hash.new(0)
|
206
|
-
@max_depth =
|
207
|
-
|
208
|
-
if
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
277
|
+
@max_depth = max_depth
|
278
|
+
|
279
|
+
self.queue = queue if queue
|
280
|
+
self.history = history if history
|
281
|
+
|
282
|
+
initialize_sanitizers(
|
283
|
+
strip_fragments: strip_fragments,
|
284
|
+
strip_query: strip_query
|
285
|
+
)
|
286
|
+
|
287
|
+
initialize_filters(
|
288
|
+
schemes: schemes,
|
289
|
+
host: host,
|
290
|
+
hosts: hosts,
|
291
|
+
ignore_hosts: ignore_hosts,
|
292
|
+
ports: ports,
|
293
|
+
ignore_ports: ignore_ports,
|
294
|
+
links: links,
|
295
|
+
ignore_links: ignore_links,
|
296
|
+
urls: urls,
|
297
|
+
ignore_urls: ignore_urls,
|
298
|
+
exts: exts,
|
299
|
+
ignore_exts: ignore_exts
|
300
|
+
)
|
301
|
+
initialize_actions
|
302
|
+
initialize_events
|
303
|
+
|
304
|
+
initialize_robots if robots
|
224
305
|
|
225
306
|
yield self if block_given?
|
226
307
|
end
|
@@ -231,8 +312,8 @@ module Spidr
|
|
231
312
|
# @param [URI::HTTP, String] url
|
232
313
|
# The URL to start spidering at.
|
233
314
|
#
|
234
|
-
# @param [Hash]
|
235
|
-
# Additional
|
315
|
+
# @param [Hash{Symbol => Object}] kwargs
|
316
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
236
317
|
#
|
237
318
|
# @yield [agent]
|
238
319
|
# If a block is given, it will be passed the newly created agent
|
@@ -241,12 +322,16 @@ module Spidr
|
|
241
322
|
# @yieldparam [Agent] agent
|
242
323
|
# The newly created agent.
|
243
324
|
#
|
325
|
+
# @return [Agent]
|
326
|
+
# The created agent object.
|
327
|
+
#
|
244
328
|
# @see #initialize
|
245
329
|
# @see #start_at
|
246
330
|
#
|
247
|
-
def self.start_at(url
|
248
|
-
agent = new(
|
331
|
+
def self.start_at(url,**kwargs,&block)
|
332
|
+
agent = new(**kwargs,&block)
|
249
333
|
agent.start_at(url)
|
334
|
+
return agent
|
250
335
|
end
|
251
336
|
|
252
337
|
#
|
@@ -255,8 +340,8 @@ module Spidr
|
|
255
340
|
# @param [URI::HTTP, String] url
|
256
341
|
# The web-site to spider.
|
257
342
|
#
|
258
|
-
# @param [Hash]
|
259
|
-
# Additional
|
343
|
+
# @param [Hash{Symbol => Object}] kwargs
|
344
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
260
345
|
#
|
261
346
|
# @yield [agent]
|
262
347
|
# If a block is given, it will be passed the newly created agent
|
@@ -265,13 +350,17 @@ module Spidr
|
|
265
350
|
# @yieldparam [Agent] agent
|
266
351
|
# The newly created agent.
|
267
352
|
#
|
353
|
+
# @return [Agent]
|
354
|
+
# The created agent object.
|
355
|
+
#
|
268
356
|
# @see #initialize
|
269
357
|
#
|
270
|
-
def self.site(url
|
271
|
-
url = URI(url
|
358
|
+
def self.site(url,**kwargs,&block)
|
359
|
+
url = URI(url)
|
272
360
|
|
273
|
-
agent = new(
|
361
|
+
agent = new(host: url.host, **kwargs, &block)
|
274
362
|
agent.start_at(url)
|
363
|
+
return agent
|
275
364
|
end
|
276
365
|
|
277
366
|
#
|
@@ -280,8 +369,8 @@ module Spidr
|
|
280
369
|
# @param [String] name
|
281
370
|
# The host-name to spider.
|
282
371
|
#
|
283
|
-
# @param [Hash]
|
284
|
-
# Additional
|
372
|
+
# @param [Hash{Symbol => Object}] kwargs
|
373
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
285
374
|
#
|
286
375
|
# @yield [agent]
|
287
376
|
# If a block is given, it will be passed the newly created agent
|
@@ -290,11 +379,44 @@ module Spidr
|
|
290
379
|
# @yieldparam [Agent] agent
|
291
380
|
# The newly created agent.
|
292
381
|
#
|
382
|
+
# @return [Agent]
|
383
|
+
# The created agent object.
|
384
|
+
#
|
293
385
|
# @see #initialize
|
294
386
|
#
|
295
|
-
def self.host(name
|
296
|
-
agent = new(
|
387
|
+
def self.host(name,**kwargs,&block)
|
388
|
+
agent = new(host: name, **kwargs, &block)
|
297
389
|
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
390
|
+
return agent
|
391
|
+
end
|
392
|
+
|
393
|
+
#
|
394
|
+
# Creates a new agent and spiders the entire domain.
|
395
|
+
#
|
396
|
+
# @param [String] name
|
397
|
+
# The top-level domain to spider.
|
398
|
+
#
|
399
|
+
# @param [Hash{Symbol => Object}] kwargs
|
400
|
+
# Additional keyword arguments. See {Agent#initialize}.
|
401
|
+
#
|
402
|
+
# @yield [agent]
|
403
|
+
# If a block is given, it will be passed the newly created agent
|
404
|
+
# before it begins spidering.
|
405
|
+
#
|
406
|
+
# @yieldparam [Agent] agent
|
407
|
+
# The newly created agent.
|
408
|
+
#
|
409
|
+
# @return [Agent]
|
410
|
+
# The created agent object.
|
411
|
+
#
|
412
|
+
# @see #initialize
|
413
|
+
#
|
414
|
+
# @since 0.7.0
|
415
|
+
#
|
416
|
+
def self.domain(name,**kwargs,&block)
|
417
|
+
agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
|
418
|
+
agent.start_at(URI::HTTP.build(host: name, path: '/'))
|
419
|
+
return agent
|
298
420
|
end
|
299
421
|
|
300
422
|
#
|
@@ -314,10 +436,10 @@ module Spidr
|
|
314
436
|
#
|
315
437
|
# Sets the proxy information that the agent uses.
|
316
438
|
#
|
317
|
-
# @param [Proxy] new_proxy
|
439
|
+
# @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
|
318
440
|
# The new proxy information.
|
319
441
|
#
|
320
|
-
# @return [
|
442
|
+
# @return [Proxy]
|
321
443
|
# The new proxy information.
|
322
444
|
#
|
323
445
|
# @see SessionCache#proxy=
|
@@ -408,9 +530,7 @@ module Spidr
|
|
408
530
|
@history.clear
|
409
531
|
|
410
532
|
new_history.each do |url|
|
411
|
-
|
412
|
-
|
413
|
-
@history << url
|
533
|
+
@history << URI(url)
|
414
534
|
end
|
415
535
|
|
416
536
|
return @history
|
@@ -425,7 +545,7 @@ module Spidr
|
|
425
545
|
# The links which have been visited.
|
426
546
|
#
|
427
547
|
def visited_links
|
428
|
-
@history.map
|
548
|
+
@history.map(&:to_s)
|
429
549
|
end
|
430
550
|
|
431
551
|
#
|
@@ -435,7 +555,7 @@ module Spidr
|
|
435
555
|
# The hosts which have been visited.
|
436
556
|
#
|
437
557
|
def visited_hosts
|
438
|
-
visited_urls.map
|
558
|
+
visited_urls.map(&:host).uniq
|
439
559
|
end
|
440
560
|
|
441
561
|
#
|
@@ -448,9 +568,7 @@ module Spidr
|
|
448
568
|
# Specifies whether a URL was visited.
|
449
569
|
#
|
450
570
|
def visited?(url)
|
451
|
-
|
452
|
-
|
453
|
-
return @history.include?(url)
|
571
|
+
@history.include?(URI(url))
|
454
572
|
end
|
455
573
|
|
456
574
|
#
|
@@ -469,9 +587,7 @@ module Spidr
|
|
469
587
|
@failures.clear
|
470
588
|
|
471
589
|
new_failures.each do |url|
|
472
|
-
|
473
|
-
|
474
|
-
@failures << url
|
590
|
+
@failures << URI(url)
|
475
591
|
end
|
476
592
|
|
477
593
|
return @failures
|
@@ -487,9 +603,7 @@ module Spidr
|
|
487
603
|
# Specifies whether the given URL was unable to be visited.
|
488
604
|
#
|
489
605
|
def failed?(url)
|
490
|
-
|
491
|
-
|
492
|
-
return @failures.include?(url)
|
606
|
+
@failures.include?(URI(url))
|
493
607
|
end
|
494
608
|
|
495
609
|
alias pending_urls queue
|
@@ -510,9 +624,7 @@ module Spidr
|
|
510
624
|
@queue.clear
|
511
625
|
|
512
626
|
new_queue.each do |url|
|
513
|
-
|
514
|
-
|
515
|
-
@queue << url
|
627
|
+
@queue << URI(url)
|
516
628
|
end
|
517
629
|
|
518
630
|
return @queue
|
@@ -544,7 +656,7 @@ module Spidr
|
|
544
656
|
def enqueue(url,level=0)
|
545
657
|
url = sanitize_url(url)
|
546
658
|
|
547
|
-
if (!
|
659
|
+
if (!queued?(url) && visit?(url))
|
548
660
|
link = url.to_s
|
549
661
|
|
550
662
|
begin
|
@@ -594,7 +706,7 @@ module Spidr
|
|
594
706
|
# The page for the response, or `nil` if the request failed.
|
595
707
|
#
|
596
708
|
def get_page(url)
|
597
|
-
url = URI(url
|
709
|
+
url = URI(url)
|
598
710
|
|
599
711
|
prepare_request(url) do |session,path,headers|
|
600
712
|
new_page = Page.new(url,session.get(path,headers))
|
@@ -629,7 +741,7 @@ module Spidr
|
|
629
741
|
# @since 0.2.2
|
630
742
|
#
|
631
743
|
def post_page(url,post_data='')
|
632
|
-
url = URI(url
|
744
|
+
url = URI(url)
|
633
745
|
|
634
746
|
prepare_request(url) do |session,path,headers|
|
635
747
|
new_page = Page.new(url,session.post(path,post_data,headers))
|
@@ -643,7 +755,7 @@ module Spidr
|
|
643
755
|
end
|
644
756
|
|
645
757
|
#
|
646
|
-
# Visits a given URL, and
|
758
|
+
# Visits a given URL, and enqueues the links recovered from the URL
|
647
759
|
# to be visited later.
|
648
760
|
#
|
649
761
|
# @param [URI::HTTP, String] url
|
@@ -725,7 +837,7 @@ module Spidr
|
|
725
837
|
|
726
838
|
unless @host_headers.empty?
|
727
839
|
@host_headers.each do |name,header|
|
728
|
-
if host.match(name)
|
840
|
+
if url.host.match(name)
|
729
841
|
headers['Host'] = header
|
730
842
|
break
|
731
843
|
end
|
@@ -769,8 +881,6 @@ module Spidr
|
|
769
881
|
# @since 0.2.2
|
770
882
|
#
|
771
883
|
def prepare_request(url,&block)
|
772
|
-
host = url.host
|
773
|
-
port = url.port
|
774
884
|
path = unless url.path.empty?
|
775
885
|
url.path
|
776
886
|
else
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -34,7 +34,7 @@ module Spidr
|
|
34
34
|
#
|
35
35
|
def [](url)
|
36
36
|
# normalize the url
|
37
|
-
url = URI(url
|
37
|
+
url = URI(url)
|
38
38
|
|
39
39
|
key = [url.scheme, url.host, url.port]
|
40
40
|
paths = @credentials[key]
|
@@ -42,7 +42,7 @@ module Spidr
|
|
42
42
|
return nil unless paths
|
43
43
|
|
44
44
|
# longest path first
|
45
|
-
ordered_paths = paths.keys.sort_by { |
|
45
|
+
ordered_paths = paths.keys.sort_by { |path_key| -path_key.length }
|
46
46
|
|
47
47
|
# directories of the path
|
48
48
|
path_dirs = URI.expand_path(url.path).split('/')
|
@@ -70,7 +70,7 @@ module Spidr
|
|
70
70
|
#
|
71
71
|
def []=(url,auth)
|
72
72
|
# normalize the url
|
73
|
-
url = URI(url
|
73
|
+
url = URI(url)
|
74
74
|
|
75
75
|
# normalize the URL path
|
76
76
|
path = URI.expand_path(url.path)
|
@@ -109,7 +109,7 @@ module Spidr
|
|
109
109
|
# or `nil` if no authorization exists.
|
110
110
|
#
|
111
111
|
# @param [URI] url
|
112
|
-
# The
|
112
|
+
# The URL.
|
113
113
|
#
|
114
114
|
# @return [String, nil]
|
115
115
|
# The base64 encoded authorization string or `nil`.
|
@@ -118,7 +118,7 @@ module Spidr
|
|
118
118
|
#
|
119
119
|
def for_url(url)
|
120
120
|
if (auth = self[url])
|
121
|
-
|
121
|
+
Base64.encode64("#{auth.username}:#{auth.password}")
|
122
122
|
end
|
123
123
|
end
|
124
124
|
|
@@ -144,7 +144,11 @@ module Spidr
|
|
144
144
|
# @since 0.2.2
|
145
145
|
#
|
146
146
|
def size
|
147
|
-
|
147
|
+
total = 0
|
148
|
+
|
149
|
+
@credentials.each_value { |paths| total += paths.length }
|
150
|
+
|
151
|
+
return total
|
148
152
|
end
|
149
153
|
|
150
154
|
#
|
@@ -221,5 +221,56 @@ module Spidr
|
|
221
221
|
def zip?
|
222
222
|
is_content_type?('application/zip')
|
223
223
|
end
|
224
|
+
|
225
|
+
#
|
226
|
+
# Determines if the page is a PNG image.
|
227
|
+
#
|
228
|
+
# @return [Boolean]
|
229
|
+
# Specifies whether the page is a PNG image.
|
230
|
+
#
|
231
|
+
# @since 0.7.0
|
232
|
+
#
|
233
|
+
def png?
|
234
|
+
is_content_type?('image/png')
|
235
|
+
end
|
236
|
+
|
237
|
+
#
|
238
|
+
# Determines if the page is a GIF image.
|
239
|
+
#
|
240
|
+
# @return [Boolean]
|
241
|
+
# Specifies whether the page is a GIF image.
|
242
|
+
#
|
243
|
+
# @since 0.7.0
|
244
|
+
#
|
245
|
+
def gif?
|
246
|
+
is_content_type?('image/gif')
|
247
|
+
end
|
248
|
+
|
249
|
+
#
|
250
|
+
# Determines if the page is a JPEG image.
|
251
|
+
#
|
252
|
+
# @return [Boolean]
|
253
|
+
# Specifies whether the page is a JPEG image.
|
254
|
+
#
|
255
|
+
# @since 0.7.0
|
256
|
+
#
|
257
|
+
def jpeg?
|
258
|
+
is_content_type?('image/jpeg')
|
259
|
+
end
|
260
|
+
|
261
|
+
#
|
262
|
+
# Determines if the page is an ICO image.
|
263
|
+
#
|
264
|
+
# @return [Boolean]
|
265
|
+
# Specifies whether the page is an ICO image.
|
266
|
+
#
|
267
|
+
# @since 0.7.0
|
268
|
+
#
|
269
|
+
def ico?
|
270
|
+
is_content_type?('image/x-icon') ||
|
271
|
+
is_content_type?('image/vnd.microsoft.icon')
|
272
|
+
end
|
273
|
+
|
274
|
+
alias icon? ico?
|
224
275
|
end
|
225
276
|
end
|