spidr 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/spidr/agent.rb CHANGED
@@ -19,12 +19,12 @@ module Spidr
19
19
 
20
20
  include Settings::UserAgent
21
21
 
22
- # HTTP Host Header to use
22
+ # HTTP `Host` Header to use
23
23
  #
24
24
  # @return [String]
25
25
  attr_accessor :host_header
26
26
 
27
- # HTTP Host Headers to use for specific hosts
27
+ # HTTP `Host` Headers to use for specific hosts
28
28
  #
29
29
  # @return [Hash{String,Regexp => String}]
30
30
  attr_reader :host_headers
@@ -96,70 +96,110 @@ module Spidr
96
96
  #
97
97
  # Creates a new Agent object.
98
98
  #
99
- # @param [Hash] options
100
- # Additional options
99
+ # @param [String, nil] host_header
100
+ # The HTTP `Host` header to use with each request.
101
101
  #
102
- # @option options [Integer] :open_timeout (Spidr.open_timeout)
103
- # Optional open timeout.
102
+ # @param [Hash{String,Regexp => String}] host_headers
103
+ # The HTTP `Host` headers to use for specific hosts.
104
104
  #
105
- # @option options [Integer] :read_timeout (Spidr.read_timeout)
105
+ # @param [Hash{String => String}] default_headers
106
+ # Default headers to set for every request.
107
+ #
108
+ # @param [String, nil] user_agent
109
+ # The `User-Agent` string to send with each request.
110
+ #
111
+ # @param [String, nil] referer
112
+ # The `Referer` URL to send with each request.
113
+ #
114
+ # @param [Integer, nil] open_timeout
115
+ # Optional open connection timeout.
116
+ #
117
+ # @param [Integer, nil] read_timeout
106
118
  # Optional read timeout.
107
119
  #
108
- # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
109
- # Optional ssl timeout.
120
+ # @param [Integer, nil] ssl_timeout
121
+ # Optional SSL connection timeout.
110
122
  #
111
- # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
123
+ # @param [Integer, nil] continue_timeout
112
124
  # Optional continue timeout.
113
125
  #
114
- # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
115
- # Optional keep_alive timeout.
126
+ # @param [Integer, nil] keep_alive_timeout
127
+ # Optional `Keep-Alive` timeout.
116
128
  #
117
- # @option options [Hash] :proxy (Spidr.proxy)
129
+ # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
118
130
  # The proxy information to use.
119
131
  #
120
- # @option :proxy [String] :host
132
+ # @option proxy [String] :host
121
133
  # The host the proxy is running on.
122
134
  #
123
- # @option :proxy [Integer] :port
135
+ # @option proxy [Integer] :port (8080)
124
136
  # The port the proxy is running on.
125
137
  #
126
- # @option :proxy [String] :user
138
+ # @option proxy [String, nil] :user
127
139
  # The user to authenticate as with the proxy.
128
140
  #
129
- # @option :proxy [String] :password
141
+ # @option proxy [String, nil] :password
130
142
  # The password to authenticate with.
131
143
  #
132
- # @option options [Hash{String => String}] :default_headers
133
- # Default headers to set for every request.
144
+ # @param [Integer] delay
145
+ # The number of seconds to pause between each request.
146
+ #
147
+ # @param [Integer, nil] limit
148
+ # The maximum number of pages to visit.
134
149
  #
135
- # @option options [String] :host_header
136
- # The HTTP Host header to use with each request.
150
+ # @param [Integer, nil] max_depth
151
+ # The maximum link depth to follow.
137
152
  #
138
- # @option options [Hash{String,Regexp => String}] :host_headers
139
- # The HTTP Host headers to use for specific hosts.
153
+ # @param [Set, Array, nil] queue
154
+ # The initial queue of URLs to visit.
140
155
  #
141
- # @option options [String] :user_agent (Spidr.user_agent)
142
- # The User-Agent string to send with each requests.
156
+ # @param [Set, Array, nil] history
157
+ # The initial list of visited URLs.
143
158
  #
144
- # @option options [String] :referer
145
- # The Referer URL to send with each request.
159
+ # @param [Boolean] strip_fragments
160
+ # Controls whether to strip the fragment components from the URLs.
146
161
  #
147
- # @option options [Integer] :delay (0)
148
- # The number of seconds to pause between each request.
162
+ # @param [Boolean] strip_query
163
+ # Controls whether to strip the query components from the URLs.
149
164
  #
150
- # @option options [Set, Array] :queue
151
- # The initial queue of URLs to visit.
165
+ # @param [Array<String>] schemes
166
+ # The list of acceptable URI schemes to visit.
167
+ # The `https` scheme will be ignored if `net/https` cannot be loaded.
152
168
  #
153
- # @option options [Set, Array] :history
154
- # The initial list of visited URLs.
169
+ # @param [String] host
170
+ # The host-name to visit.
155
171
  #
156
- # @option options [Integer] :limit
157
- # The maximum number of pages to visit.
172
+ # @param [Array<String, Regexp, Proc>] hosts
173
+ # The patterns which match the host-names to visit.
158
174
  #
159
- # @option options [Integer] :max_depth
160
- # The maximum link depth to follow.
175
+ # @param [Array<String, Regexp, Proc>] ignore_hosts
176
+ # The patterns which match the host-names to not visit.
177
+ #
178
+ # @param [Array<Integer, Regexp, Proc>] ports
179
+ # The patterns which match the ports to visit.
161
180
  #
162
- # @option options [Boolean] :robots (Spidr.robots?)
181
+ # @param [Array<Integer, Regexp, Proc>] ignore_ports
182
+ # The patterns which match the ports to not visit.
183
+ #
184
+ # @param [Array<String, Regexp, Proc>] links
185
+ # The patterns which match the links to visit.
186
+ #
187
+ # @param [Array<String, Regexp, Proc>] ignore_links
188
+ # The patterns which match the links to not visit.
189
+ #
190
+ # @param [Array<String, Regexp, Proc>] urls
191
+ # The patterns which match the URLs to visit.
192
+ #
193
+ # @param [Array<String, Regexp, Proc>] ignore_urls
194
+ # The patterns which match the URLs to not visit.
195
+ #
196
+ # @param [Array<String, Regexp, Proc>] exts
197
+ # The patterns which match the URI path extensions to visit.
198
+ #
199
+ # @param [Array<String, Regexp, Proc>] ignore_exts
200
+ # The patterns which match the URI path extensions to not visit.
201
+ #
202
+ # @param [Boolean] robots
163
203
  # Specifies whether `robots.txt` should be honored.
164
204
  #
165
205
  # @yield [agent]
@@ -169,58 +209,99 @@ module Spidr
169
209
  # @yieldparam [Agent] agent
170
210
  # The newly created agent.
171
211
  #
172
- # @see #initialize_sanitizers
173
- # @see #initialize_filters
174
- # @see #initialize_actions
175
- # @see #initialize_events
176
- #
177
- def initialize(options={})
178
- @host_header = options[:host_header]
179
- @host_headers = {}
180
-
181
- if options[:host_headers]
182
- @host_headers.merge!(options[:host_headers])
183
- end
184
-
185
- @default_headers = {}
186
-
187
- if options[:default_headers]
188
- @default_headers.merge!(options[:default_headers])
189
- end
190
-
191
- @user_agent = options.fetch(:user_agent,Spidr.user_agent)
192
- @referer = options[:referer]
193
-
194
- @sessions = SessionCache.new(options)
212
+ def initialize(# header keyword arguments
213
+ host_header: nil,
214
+ host_headers: {},
215
+ default_headers: {},
216
+ user_agent: Spidr.user_agent,
217
+ referer: nil,
218
+ # session cache keyword arguments
219
+ proxy: Spidr.proxy,
220
+ open_timeout: Spidr.open_timeout,
221
+ ssl_timeout: Spidr.ssl_timeout,
222
+ read_timeout: Spidr.read_timeout,
223
+ continue_timeout: Spidr.continue_timeout,
224
+ keep_alive_timeout: Spidr.keep_alive_timeout,
225
+ # spidering controls keyword arguments
226
+ delay: 0,
227
+ limit: nil,
228
+ max_depth: nil,
229
+ # history keyword arguments
230
+ queue: nil,
231
+ history: nil,
232
+ # sanitizer keyword arguments
233
+ strip_fragments: true,
234
+ strip_query: false,
235
+ # filtering keyword arguments
236
+ schemes: self.class.default_schemes,
237
+ host: nil,
238
+ hosts: nil,
239
+ ignore_hosts: nil,
240
+ ports: nil,
241
+ ignore_ports: nil,
242
+ links: nil,
243
+ ignore_links: nil,
244
+ urls: nil,
245
+ ignore_urls: nil,
246
+ exts: nil,
247
+ ignore_exts: nil,
248
+ # robots keyword arguments
249
+ robots: Spidr.robots?)
250
+ @host_header = host_header
251
+ @host_headers = host_headers
252
+
253
+ @default_headers = default_headers
254
+
255
+ @user_agent = user_agent
256
+ @referer = referer
257
+
258
+ @sessions = SessionCache.new(
259
+ proxy: proxy,
260
+ open_timeout: open_timeout,
261
+ ssl_timeout: ssl_timeout,
262
+ read_timeout: read_timeout,
263
+ continue_timeout: continue_timeout,
264
+ keep_alive_timeout: keep_alive_timeout
265
+ )
195
266
  @cookies = CookieJar.new
196
267
  @authorized = AuthStore.new
197
268
 
198
269
  @running = false
199
- @delay = options.fetch(:delay,0)
270
+ @delay = delay
200
271
  @history = Set[]
201
272
  @failures = Set[]
202
273
  @queue = []
203
274
 
204
- @limit = options[:limit]
275
+ @limit = limit
205
276
  @levels = Hash.new(0)
206
- @max_depth = options[:max_depth]
207
-
208
- if options[:queue]
209
- self.queue = options[:queue]
210
- end
211
-
212
- if options[:history]
213
- self.history = options[:history]
214
- end
215
-
216
- initialize_sanitizers(options)
217
- initialize_filters(options)
218
- initialize_actions(options)
219
- initialize_events(options)
220
-
221
- if options.fetch(:robots,Spidr.robots?)
222
- initialize_robots
223
- end
277
+ @max_depth = max_depth
278
+
279
+ self.queue = queue if queue
280
+ self.history = history if history
281
+
282
+ initialize_sanitizers(
283
+ strip_fragments: strip_fragments,
284
+ strip_query: strip_query
285
+ )
286
+
287
+ initialize_filters(
288
+ schemes: schemes,
289
+ host: host,
290
+ hosts: hosts,
291
+ ignore_hosts: ignore_hosts,
292
+ ports: ports,
293
+ ignore_ports: ignore_ports,
294
+ links: links,
295
+ ignore_links: ignore_links,
296
+ urls: urls,
297
+ ignore_urls: ignore_urls,
298
+ exts: exts,
299
+ ignore_exts: ignore_exts
300
+ )
301
+ initialize_actions
302
+ initialize_events
303
+
304
+ initialize_robots if robots
224
305
 
225
306
  yield self if block_given?
226
307
  end
@@ -231,8 +312,8 @@ module Spidr
231
312
  # @param [URI::HTTP, String] url
232
313
  # The URL to start spidering at.
233
314
  #
234
- # @param [Hash] options
235
- # Additional options. See {Agent#initialize}.
315
+ # @param [Hash{Symbol => Object}] kwargs
316
+ # Additional keyword arguments. See {Agent#initialize}.
236
317
  #
237
318
  # @yield [agent]
238
319
  # If a block is given, it will be passed the newly created agent
@@ -241,12 +322,16 @@ module Spidr
241
322
  # @yieldparam [Agent] agent
242
323
  # The newly created agent.
243
324
  #
325
+ # @return [Agent]
326
+ # The created agent object.
327
+ #
244
328
  # @see #initialize
245
329
  # @see #start_at
246
330
  #
247
- def self.start_at(url,options={},&block)
248
- agent = new(options,&block)
331
+ def self.start_at(url,**kwargs,&block)
332
+ agent = new(**kwargs,&block)
249
333
  agent.start_at(url)
334
+ return agent
250
335
  end
251
336
 
252
337
  #
@@ -255,8 +340,8 @@ module Spidr
255
340
  # @param [URI::HTTP, String] url
256
341
  # The web-site to spider.
257
342
  #
258
- # @param [Hash] options
259
- # Additional options. See {Agent#initialize}.
343
+ # @param [Hash{Symbol => Object}] kwargs
344
+ # Additional keyword arguments. See {Agent#initialize}.
260
345
  #
261
346
  # @yield [agent]
262
347
  # If a block is given, it will be passed the newly created agent
@@ -265,13 +350,17 @@ module Spidr
265
350
  # @yieldparam [Agent] agent
266
351
  # The newly created agent.
267
352
  #
353
+ # @return [Agent]
354
+ # The created agent object.
355
+ #
268
356
  # @see #initialize
269
357
  #
270
- def self.site(url,options={},&block)
271
- url = URI(url.to_s) unless url.kind_of?(URI)
358
+ def self.site(url,**kwargs,&block)
359
+ url = URI(url)
272
360
 
273
- agent = new(options.merge(host: url.host),&block)
361
+ agent = new(host: url.host, **kwargs, &block)
274
362
  agent.start_at(url)
363
+ return agent
275
364
  end
276
365
 
277
366
  #
@@ -280,8 +369,8 @@ module Spidr
280
369
  # @param [String] name
281
370
  # The host-name to spider.
282
371
  #
283
- # @param [Hash] options
284
- # Additional options. See {Agent#initialize}.
372
+ # @param [Hash{Symbol => Object}] kwargs
373
+ # Additional keyword arguments. See {Agent#initialize}.
285
374
  #
286
375
  # @yield [agent]
287
376
  # If a block is given, it will be passed the newly created agent
@@ -290,11 +379,44 @@ module Spidr
290
379
  # @yieldparam [Agent] agent
291
380
  # The newly created agent.
292
381
  #
382
+ # @return [Agent]
383
+ # The created agent object.
384
+ #
293
385
  # @see #initialize
294
386
  #
295
- def self.host(name,options={},&block)
296
- agent = new(options.merge(host: name),&block)
387
+ def self.host(name,**kwargs,&block)
388
+ agent = new(host: name, **kwargs, &block)
297
389
  agent.start_at(URI::HTTP.build(host: name, path: '/'))
390
+ return agent
391
+ end
392
+
393
+ #
394
+ # Creates a new agent and spiders the entire domain.
395
+ #
396
+ # @param [String] name
397
+ # The top-level domain to spider.
398
+ #
399
+ # @param [Hash{Symbol => Object}] kwargs
400
+ # Additional keyword arguments. See {Agent#initialize}.
401
+ #
402
+ # @yield [agent]
403
+ # If a block is given, it will be passed the newly created agent
404
+ # before it begins spidering.
405
+ #
406
+ # @yieldparam [Agent] agent
407
+ # The newly created agent.
408
+ #
409
+ # @return [Agent]
410
+ # The created agent object.
411
+ #
412
+ # @see #initialize
413
+ #
414
+ # @since 0.7.0
415
+ #
416
+ def self.domain(name,**kwargs,&block)
417
+ agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
418
+ agent.start_at(URI::HTTP.build(host: name, path: '/'))
419
+ return agent
298
420
  end
299
421
 
300
422
  #
@@ -314,10 +436,10 @@ module Spidr
314
436
  #
315
437
  # Sets the proxy information that the agent uses.
316
438
  #
317
- # @param [Proxy] new_proxy
439
+ # @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
318
440
  # The new proxy information.
319
441
  #
320
- # @return [Hash]
442
+ # @return [Proxy]
321
443
  # The new proxy information.
322
444
  #
323
445
  # @see SessionCache#proxy=
@@ -408,9 +530,7 @@ module Spidr
408
530
  @history.clear
409
531
 
410
532
  new_history.each do |url|
411
- url = URI(url.to_s) unless url.kind_of?(URI)
412
-
413
- @history << url
533
+ @history << URI(url)
414
534
  end
415
535
 
416
536
  return @history
@@ -425,7 +545,7 @@ module Spidr
425
545
  # The links which have been visited.
426
546
  #
427
547
  def visited_links
428
- @history.map { |url| url.to_s }
548
+ @history.map(&:to_s)
429
549
  end
430
550
 
431
551
  #
@@ -435,7 +555,7 @@ module Spidr
435
555
  # The hosts which have been visited.
436
556
  #
437
557
  def visited_hosts
438
- visited_urls.map { |uri| uri.host }.uniq
558
+ visited_urls.map(&:host).uniq
439
559
  end
440
560
 
441
561
  #
@@ -448,9 +568,7 @@ module Spidr
448
568
  # Specifies whether a URL was visited.
449
569
  #
450
570
  def visited?(url)
451
- url = URI(url.to_s) unless url.kind_of?(URI)
452
-
453
- return @history.include?(url)
571
+ @history.include?(URI(url))
454
572
  end
455
573
 
456
574
  #
@@ -469,9 +587,7 @@ module Spidr
469
587
  @failures.clear
470
588
 
471
589
  new_failures.each do |url|
472
- url = URI(url.to_s) unless url.kind_of?(URI)
473
-
474
- @failures << url
590
+ @failures << URI(url)
475
591
  end
476
592
 
477
593
  return @failures
@@ -487,9 +603,7 @@ module Spidr
487
603
  # Specifies whether the given URL was unable to be visited.
488
604
  #
489
605
  def failed?(url)
490
- url = URI(url.to_s) unless url.kind_of?(URI)
491
-
492
- return @failures.include?(url)
606
+ @failures.include?(URI(url))
493
607
  end
494
608
 
495
609
  alias pending_urls queue
@@ -510,9 +624,7 @@ module Spidr
510
624
  @queue.clear
511
625
 
512
626
  new_queue.each do |url|
513
- url = URI(url.to_s) unless url.kind_of?(URI)
514
-
515
- @queue << url
627
+ @queue << URI(url)
516
628
  end
517
629
 
518
630
  return @queue
@@ -544,7 +656,7 @@ module Spidr
544
656
  def enqueue(url,level=0)
545
657
  url = sanitize_url(url)
546
658
 
547
- if (!(queued?(url)) && visit?(url))
659
+ if (!queued?(url) && visit?(url))
548
660
  link = url.to_s
549
661
 
550
662
  begin
@@ -594,7 +706,7 @@ module Spidr
594
706
  # The page for the response, or `nil` if the request failed.
595
707
  #
596
708
  def get_page(url)
597
- url = URI(url.to_s)
709
+ url = URI(url)
598
710
 
599
711
  prepare_request(url) do |session,path,headers|
600
712
  new_page = Page.new(url,session.get(path,headers))
@@ -629,7 +741,7 @@ module Spidr
629
741
  # @since 0.2.2
630
742
  #
631
743
  def post_page(url,post_data='')
632
- url = URI(url.to_s) unless url.kind_of?(URI)
744
+ url = URI(url)
633
745
 
634
746
  prepare_request(url) do |session,path,headers|
635
747
  new_page = Page.new(url,session.post(path,post_data,headers))
@@ -643,7 +755,7 @@ module Spidr
643
755
  end
644
756
 
645
757
  #
646
- # Visits a given URL, and enqueus the links recovered from the URL
758
+ # Visits a given URL, and enqueues the links recovered from the URL
647
759
  # to be visited later.
648
760
  #
649
761
  # @param [URI::HTTP, String] url
@@ -725,7 +837,7 @@ module Spidr
725
837
 
726
838
  unless @host_headers.empty?
727
839
  @host_headers.each do |name,header|
728
- if host.match(name)
840
+ if url.host.match(name)
729
841
  headers['Host'] = header
730
842
  break
731
843
  end
@@ -769,8 +881,6 @@ module Spidr
769
881
  # @since 0.2.2
770
882
  #
771
883
  def prepare_request(url,&block)
772
- host = url.host
773
- port = url.port
774
884
  path = unless url.path.empty?
775
885
  url.path
776
886
  else
@@ -34,7 +34,7 @@ module Spidr
34
34
  #
35
35
  def [](url)
36
36
  # normalize the url
37
- url = URI(url.to_s) unless url.kind_of?(URI)
37
+ url = URI(url)
38
38
 
39
39
  key = [url.scheme, url.host, url.port]
40
40
  paths = @credentials[key]
@@ -42,7 +42,7 @@ module Spidr
42
42
  return nil unless paths
43
43
 
44
44
  # longest path first
45
- ordered_paths = paths.keys.sort_by { |key| key.length }.reverse
45
+ ordered_paths = paths.keys.sort_by { |path_key| -path_key.length }
46
46
 
47
47
  # directories of the path
48
48
  path_dirs = URI.expand_path(url.path).split('/')
@@ -70,7 +70,7 @@ module Spidr
70
70
  #
71
71
  def []=(url,auth)
72
72
  # normalize the url
73
- url = URI(url.to_s) unless url.kind_of?(URI)
73
+ url = URI(url)
74
74
 
75
75
  # normalize the URL path
76
76
  path = URI.expand_path(url.path)
@@ -109,7 +109,7 @@ module Spidr
109
109
  # or `nil` if no authorization exists.
110
110
  #
111
111
  # @param [URI] url
112
- # The url.
112
+ # The URL.
113
113
  #
114
114
  # @return [String, nil]
115
115
  # The base64 encoded authorization string or `nil`.
@@ -118,7 +118,7 @@ module Spidr
118
118
  #
119
119
  def for_url(url)
120
120
  if (auth = self[url])
121
- return Base64.encode64("#{auth.username}:#{auth.password}")
121
+ Base64.encode64("#{auth.username}:#{auth.password}")
122
122
  end
123
123
  end
124
124
 
@@ -144,7 +144,11 @@ module Spidr
144
144
  # @since 0.2.2
145
145
  #
146
146
  def size
147
- @credentials.inject(0) { |res, arr| res + arr[1].length }
147
+ total = 0
148
+
149
+ @credentials.each_value { |paths| total += paths.length }
150
+
151
+ return total
148
152
  end
149
153
 
150
154
  #
@@ -221,5 +221,56 @@ module Spidr
221
221
  def zip?
222
222
  is_content_type?('application/zip')
223
223
  end
224
+
225
+ #
226
+ # Determines if the page is a PNG image.
227
+ #
228
+ # @return [Boolean]
229
+ # Specifies whether the page is a PNG image.
230
+ #
231
+ # @since 0.7.0
232
+ #
233
+ def png?
234
+ is_content_type?('image/png')
235
+ end
236
+
237
+ #
238
+ # Determines if the page is a GIF image.
239
+ #
240
+ # @return [Boolean]
241
+ # Specifies whether the page is a GIF image.
242
+ #
243
+ # @since 0.7.0
244
+ #
245
+ def gif?
246
+ is_content_type?('image/gif')
247
+ end
248
+
249
+ #
250
+ # Determines if the page is a JPEG image.
251
+ #
252
+ # @return [Boolean]
253
+ # Specifies whether the page is a JPEG image.
254
+ #
255
+ # @since 0.7.0
256
+ #
257
+ def jpeg?
258
+ is_content_type?('image/jpeg')
259
+ end
260
+
261
+ #
262
+ # Determines if the page is an ICO image.
263
+ #
264
+ # @return [Boolean]
265
+ # Specifies whether the page is an ICO image.
266
+ #
267
+ # @since 0.7.0
268
+ #
269
+ def ico?
270
+ is_content_type?('image/x-icon') ||
271
+ is_content_type?('image/vnd.microsoft.icon')
272
+ end
273
+
274
+ alias icon? ico?
224
275
  end
225
276
  end