spidr 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/spidr/agent.rb CHANGED
@@ -19,12 +19,12 @@ module Spidr
19
19
 
20
20
  include Settings::UserAgent
21
21
 
22
- # HTTP Host Header to use
22
+ # HTTP Host `Header` to use
23
23
  #
24
24
  # @return [String]
25
25
  attr_accessor :host_header
26
26
 
27
- # HTTP Host Headers to use for specific hosts
27
+ # HTTP `Host` Headers to use for specific hosts
28
28
  #
29
29
  # @return [Hash{String,Regexp => String}]
30
30
  attr_reader :host_headers
@@ -96,70 +96,110 @@ module Spidr
96
96
  #
97
97
  # Creates a new Agent object.
98
98
  #
99
- # @param [Hash] options
100
- # Additional options
99
+ # @param [String, nil] host_header
100
+ # The HTTP `Host` header to use with each request.
101
101
  #
102
- # @option options [Integer] :open_timeout (Spidr.open_timeout)
103
- # Optional open timeout.
102
+ # @param [Hash{String,Regexp => String}] host_headers
103
+ # The HTTP `Host` headers to use for specific hosts.
104
104
  #
105
- # @option options [Integer] :read_timeout (Spidr.read_timeout)
105
+ # @param [Hash{String => String}] default_headers
106
+ # Default headers to set for every request.
107
+ #
108
+ # @param [String, nil] user_agent
109
+ # The `User-Agent` string to send with each request.
110
+ #
111
+ # @param [String, nil] referer
112
+ # The `Referer` URL to send with each request.
113
+ #
114
+ # @param [Integer, nil] open_timeout
115
+ # Optional open connection timeout.
116
+ #
117
+ # @param [Integer, nil] read_timeout
106
118
  # Optional read timeout.
107
119
  #
108
- # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
109
- # Optional ssl timeout.
120
+ # @param [Integer, nil] ssl_timeout
121
+ # Optional SSL connection timeout.
110
122
  #
111
- # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
123
+ # @param [Integer, nil] continue_timeout
112
124
  # Optional continue timeout.
113
125
  #
114
- # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
115
- # Optional keep_alive timeout.
126
+ # @param [Integer, nil] keep_alive_timeout
127
+ # Optional `Keep-Alive` timeout.
116
128
  #
117
- # @option options [Hash] :proxy (Spidr.proxy)
129
+ # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
118
130
  # The proxy information to use.
119
131
  #
120
- # @option :proxy [String] :host
132
+ # @option proxy [String] :host
121
133
  # The host the proxy is running on.
122
134
  #
123
- # @option :proxy [Integer] :port
135
+ # @option proxy [Integer] :port (8080)
124
136
  # The port the proxy is running on.
125
137
  #
126
- # @option :proxy [String] :user
138
+ # @option proxy [String, nil] :user
127
139
  # The user to authenticate as with the proxy.
128
140
  #
129
- # @option :proxy [String] :password
141
+ # @option proxy [String, nil] :password
130
142
  # The password to authenticate with.
131
143
  #
132
- # @option options [Hash{String => String}] :default_headers
133
- # Default headers to set for every request.
144
+ # @param [Integer] delay
145
+ # The number of seconds to pause between each request.
146
+ #
147
+ # @param [Integer, nil] limit
148
+ # The maximum number of pages to visit.
134
149
  #
135
- # @option options [String] :host_header
136
- # The HTTP Host header to use with each request.
150
+ # @param [Integer, nil] max_depth
151
+ # The maximum link depth to follow.
137
152
  #
138
- # @option options [Hash{String,Regexp => String}] :host_headers
139
- # The HTTP Host headers to use for specific hosts.
153
+ # @param [Set, Array, nil] queue
154
+ # The initial queue of URLs to visit.
140
155
  #
141
- # @option options [String] :user_agent (Spidr.user_agent)
142
- # The User-Agent string to send with each requests.
156
+ # @param [Set, Array, nil] history
157
+ # The initial list of visited URLs.
143
158
  #
144
- # @option options [String] :referer
145
- # The Referer URL to send with each request.
159
+ # @param [Boolean] strip_fragments
160
+ # Controls whether to strip the fragment components from the URLs.
146
161
  #
147
- # @option options [Integer] :delay (0)
148
- # The number of seconds to pause between each request.
162
+ # @param [Boolean] strip_query
163
+ # Controls whether to strip the query components from the URLs.
149
164
  #
150
- # @option options [Set, Array] :queue
151
- # The initial queue of URLs to visit.
165
+ # @param [Array<String>] schemes
166
+ # The list of acceptable URI schemes to visit.
167
+ # The `https` scheme will be ignored if `net/https` cannot be loaded.
152
168
  #
153
- # @option options [Set, Array] :history
154
- # The initial list of visited URLs.
169
+ # @param [String] host
170
+ # The host-name to visit.
155
171
  #
156
- # @option options [Integer] :limit
157
- # The maximum number of pages to visit.
172
+ # @param [Array<String, Regexp, Proc>] hosts
173
+ # The patterns which match the host-names to visit.
158
174
  #
159
- # @option options [Integer] :max_depth
160
- # The maximum link depth to follow.
175
+ # @param [Array<String, Regexp, Proc>] ignore_hosts
176
+ # The patterns which match the host-names to not visit.
177
+ #
178
+ # @param [Array<Integer, Regexp, Proc>] ports
179
+ # The patterns which match the ports to visit.
161
180
  #
162
- # @option options [Boolean] :robots (Spidr.robots?)
181
+ # @param [Array<Integer, Regexp, Proc>] ignore_ports
182
+ # The patterns which match the ports to not visit.
183
+ #
184
+ # @param [Array<String, Regexp, Proc>] links
185
+ # The patterns which match the links to visit.
186
+ #
187
+ # @param [Array<String, Regexp, Proc>] ignore_links
188
+ # The patterns which match the links to not visit.
189
+ #
190
+ # @param [Array<String, Regexp, Proc>] urls
191
+ # The patterns which match the URLs to visit.
192
+ #
193
+ # @param [Array<String, Regexp, Proc>] ignore_urls
194
+ # The patterns which match the URLs to not visit.
195
+ #
196
+ # @param [Array<String, Regexp, Proc>] exts
197
+ # The patterns which match the URI path extensions to visit.
198
+ #
199
+ # @param [Array<String, Regexp, Proc>] ignore_exts
200
+ # The patterns which match the URI path extensions to not visit.
201
+ #
202
+ # @param [Boolean] robots
163
203
  # Specifies whether `robots.txt` should be honored.
164
204
  #
165
205
  # @yield [agent]
@@ -169,58 +209,99 @@ module Spidr
169
209
  # @yieldparam [Agent] agent
170
210
  # The newly created agent.
171
211
  #
172
- # @see #initialize_sanitizers
173
- # @see #initialize_filters
174
- # @see #initialize_actions
175
- # @see #initialize_events
176
- #
177
- def initialize(options={})
178
- @host_header = options[:host_header]
179
- @host_headers = {}
180
-
181
- if options[:host_headers]
182
- @host_headers.merge!(options[:host_headers])
183
- end
184
-
185
- @default_headers = {}
186
-
187
- if options[:default_headers]
188
- @default_headers.merge!(options[:default_headers])
189
- end
190
-
191
- @user_agent = options.fetch(:user_agent,Spidr.user_agent)
192
- @referer = options[:referer]
193
-
194
- @sessions = SessionCache.new(options)
212
+ def initialize(# header keyword arguments
213
+ host_header: nil,
214
+ host_headers: {},
215
+ default_headers: {},
216
+ user_agent: Spidr.user_agent,
217
+ referer: nil,
218
+ # session cache keyword arguments
219
+ proxy: Spidr.proxy,
220
+ open_timeout: Spidr.open_timeout,
221
+ ssl_timeout: Spidr.ssl_timeout,
222
+ read_timeout: Spidr.read_timeout,
223
+ continue_timeout: Spidr.continue_timeout,
224
+ keep_alive_timeout: Spidr.keep_alive_timeout,
225
+ # spidering controls keyword arguments
226
+ delay: 0,
227
+ limit: nil,
228
+ max_depth: nil,
229
+ # history keyword arguments
230
+ queue: nil,
231
+ history: nil,
232
+ # sanitizer keyword arguments
233
+ strip_fragments: true,
234
+ strip_query: false,
235
+ # filtering keyword arguments
236
+ schemes: self.class.default_schemes,
237
+ host: nil,
238
+ hosts: nil,
239
+ ignore_hosts: nil,
240
+ ports: nil,
241
+ ignore_ports: nil,
242
+ links: nil,
243
+ ignore_links: nil,
244
+ urls: nil,
245
+ ignore_urls: nil,
246
+ exts: nil,
247
+ ignore_exts: nil,
248
+ # robots keyword arguments
249
+ robots: Spidr.robots?)
250
+ @host_header = host_header
251
+ @host_headers = host_headers
252
+
253
+ @default_headers = default_headers
254
+
255
+ @user_agent = user_agent
256
+ @referer = referer
257
+
258
+ @sessions = SessionCache.new(
259
+ proxy: proxy,
260
+ open_timeout: open_timeout,
261
+ ssl_timeout: ssl_timeout,
262
+ read_timeout: read_timeout,
263
+ continue_timeout: continue_timeout,
264
+ keep_alive_timeout: keep_alive_timeout
265
+ )
195
266
  @cookies = CookieJar.new
196
267
  @authorized = AuthStore.new
197
268
 
198
269
  @running = false
199
- @delay = options.fetch(:delay,0)
270
+ @delay = delay
200
271
  @history = Set[]
201
272
  @failures = Set[]
202
273
  @queue = []
203
274
 
204
- @limit = options[:limit]
275
+ @limit = limit
205
276
  @levels = Hash.new(0)
206
- @max_depth = options[:max_depth]
207
-
208
- if options[:queue]
209
- self.queue = options[:queue]
210
- end
211
-
212
- if options[:history]
213
- self.history = options[:history]
214
- end
215
-
216
- initialize_sanitizers(options)
217
- initialize_filters(options)
218
- initialize_actions(options)
219
- initialize_events(options)
220
-
221
- if options.fetch(:robots,Spidr.robots?)
222
- initialize_robots
223
- end
277
+ @max_depth = max_depth
278
+
279
+ self.queue = queue if queue
280
+ self.history = history if history
281
+
282
+ initialize_sanitizers(
283
+ strip_fragments: strip_fragments,
284
+ strip_query: strip_query
285
+ )
286
+
287
+ initialize_filters(
288
+ schemes: schemes,
289
+ host: host,
290
+ hosts: hosts,
291
+ ignore_hosts: ignore_hosts,
292
+ ports: ports,
293
+ ignore_ports: ignore_ports,
294
+ links: links,
295
+ ignore_links: ignore_links,
296
+ urls: urls,
297
+ ignore_urls: ignore_urls,
298
+ exts: exts,
299
+ ignore_exts: ignore_exts
300
+ )
301
+ initialize_actions
302
+ initialize_events
303
+
304
+ initialize_robots if robots
224
305
 
225
306
  yield self if block_given?
226
307
  end
@@ -231,8 +312,8 @@ module Spidr
231
312
  # @param [URI::HTTP, String] url
232
313
  # The URL to start spidering at.
233
314
  #
234
- # @param [Hash] options
235
- # Additional options. See {Agent#initialize}.
315
+ # @param [Hash{Symbol => Object}] kwargs
316
+ # Additional keyword arguments. See {Agent#initialize}.
236
317
  #
237
318
  # @yield [agent]
238
319
  # If a block is given, it will be passed the newly created agent
@@ -241,12 +322,16 @@ module Spidr
241
322
  # @yieldparam [Agent] agent
242
323
  # The newly created agent.
243
324
  #
325
+ # @return [Agent]
326
+ # The created agent object.
327
+ #
244
328
  # @see #initialize
245
329
  # @see #start_at
246
330
  #
247
- def self.start_at(url,options={},&block)
248
- agent = new(options,&block)
331
+ def self.start_at(url,**kwargs,&block)
332
+ agent = new(**kwargs,&block)
249
333
  agent.start_at(url)
334
+ return agent
250
335
  end
251
336
 
252
337
  #
@@ -255,8 +340,8 @@ module Spidr
255
340
  # @param [URI::HTTP, String] url
256
341
  # The web-site to spider.
257
342
  #
258
- # @param [Hash] options
259
- # Additional options. See {Agent#initialize}.
343
+ # @param [Hash{Symbol => Object}] kwargs
344
+ # Additional keyword arguments. See {Agent#initialize}.
260
345
  #
261
346
  # @yield [agent]
262
347
  # If a block is given, it will be passed the newly created agent
@@ -265,13 +350,17 @@ module Spidr
265
350
  # @yieldparam [Agent] agent
266
351
  # The newly created agent.
267
352
  #
353
+ # @return [Agent]
354
+ # The created agent object.
355
+ #
268
356
  # @see #initialize
269
357
  #
270
- def self.site(url,options={},&block)
271
- url = URI(url.to_s) unless url.kind_of?(URI)
358
+ def self.site(url,**kwargs,&block)
359
+ url = URI(url)
272
360
 
273
- agent = new(options.merge(host: url.host),&block)
361
+ agent = new(host: url.host, **kwargs, &block)
274
362
  agent.start_at(url)
363
+ return agent
275
364
  end
276
365
 
277
366
  #
@@ -280,8 +369,8 @@ module Spidr
280
369
  # @param [String] name
281
370
  # The host-name to spider.
282
371
  #
283
- # @param [Hash] options
284
- # Additional options. See {Agent#initialize}.
372
+ # @param [Hash{Symbol => Object}] kwargs
373
+ # Additional keyword arguments. See {Agent#initialize}.
285
374
  #
286
375
  # @yield [agent]
287
376
  # If a block is given, it will be passed the newly created agent
@@ -290,11 +379,44 @@ module Spidr
290
379
  # @yieldparam [Agent] agent
291
380
  # The newly created agent.
292
381
  #
382
+ # @return [Agent]
383
+ # The created agent object.
384
+ #
293
385
  # @see #initialize
294
386
  #
295
- def self.host(name,options={},&block)
296
- agent = new(options.merge(host: name),&block)
387
+ def self.host(name,**kwargs,&block)
388
+ agent = new(host: name, **kwargs, &block)
297
389
  agent.start_at(URI::HTTP.build(host: name, path: '/'))
390
+ return agent
391
+ end
392
+
393
+ #
394
+ # Creates a new agent and spiders the entire domain.
395
+ #
396
+ # @param [String] name
397
+ # The top-level domain to spider.
398
+ #
399
+ # @param [Hash{Symbol => Object}] kwargs
400
+ # Additional keyword arguments. See {Agent#initialize}.
401
+ #
402
+ # @yield [agent]
403
+ # If a block is given, it will be passed the newly created agent
404
+ # before it begins spidering.
405
+ #
406
+ # @yieldparam [Agent] agent
407
+ # The newly created agent.
408
+ #
409
+ # @return [Agent]
410
+ # The created agent object.
411
+ #
412
+ # @see #initialize
413
+ #
414
+ # @since 0.7.0
415
+ #
416
+ def self.domain(name,**kwargs,&block)
417
+ agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
418
+ agent.start_at(URI::HTTP.build(host: name, path: '/'))
419
+ return agent
298
420
  end
299
421
 
300
422
  #
@@ -314,10 +436,10 @@ module Spidr
314
436
  #
315
437
  # Sets the proxy information that the agent uses.
316
438
  #
317
- # @param [Proxy] new_proxy
439
+ # @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
318
440
  # The new proxy information.
319
441
  #
320
- # @return [Hash]
442
+ # @return [Proxy]
321
443
  # The new proxy information.
322
444
  #
323
445
  # @see SessionCache#proxy=
@@ -408,9 +530,7 @@ module Spidr
408
530
  @history.clear
409
531
 
410
532
  new_history.each do |url|
411
- url = URI(url.to_s) unless url.kind_of?(URI)
412
-
413
- @history << url
533
+ @history << URI(url)
414
534
  end
415
535
 
416
536
  return @history
@@ -425,7 +545,7 @@ module Spidr
425
545
  # The links which have been visited.
426
546
  #
427
547
  def visited_links
428
- @history.map { |url| url.to_s }
548
+ @history.map(&:to_s)
429
549
  end
430
550
 
431
551
  #
@@ -435,7 +555,7 @@ module Spidr
435
555
  # The hosts which have been visited.
436
556
  #
437
557
  def visited_hosts
438
- visited_urls.map { |uri| uri.host }.uniq
558
+ visited_urls.map(&:host).uniq
439
559
  end
440
560
 
441
561
  #
@@ -448,9 +568,7 @@ module Spidr
448
568
  # Specifies whether a URL was visited.
449
569
  #
450
570
  def visited?(url)
451
- url = URI(url.to_s) unless url.kind_of?(URI)
452
-
453
- return @history.include?(url)
571
+ @history.include?(URI(url))
454
572
  end
455
573
 
456
574
  #
@@ -469,9 +587,7 @@ module Spidr
469
587
  @failures.clear
470
588
 
471
589
  new_failures.each do |url|
472
- url = URI(url.to_s) unless url.kind_of?(URI)
473
-
474
- @failures << url
590
+ @failures << URI(url)
475
591
  end
476
592
 
477
593
  return @failures
@@ -487,9 +603,7 @@ module Spidr
487
603
  # Specifies whether the given URL was unable to be visited.
488
604
  #
489
605
  def failed?(url)
490
- url = URI(url.to_s) unless url.kind_of?(URI)
491
-
492
- return @failures.include?(url)
606
+ @failures.include?(URI(url))
493
607
  end
494
608
 
495
609
  alias pending_urls queue
@@ -510,9 +624,7 @@ module Spidr
510
624
  @queue.clear
511
625
 
512
626
  new_queue.each do |url|
513
- url = URI(url.to_s) unless url.kind_of?(URI)
514
-
515
- @queue << url
627
+ @queue << URI(url)
516
628
  end
517
629
 
518
630
  return @queue
@@ -544,7 +656,7 @@ module Spidr
544
656
  def enqueue(url,level=0)
545
657
  url = sanitize_url(url)
546
658
 
547
- if (!(queued?(url)) && visit?(url))
659
+ if (!queued?(url) && visit?(url))
548
660
  link = url.to_s
549
661
 
550
662
  begin
@@ -594,7 +706,7 @@ module Spidr
594
706
  # The page for the response, or `nil` if the request failed.
595
707
  #
596
708
  def get_page(url)
597
- url = URI(url.to_s)
709
+ url = URI(url)
598
710
 
599
711
  prepare_request(url) do |session,path,headers|
600
712
  new_page = Page.new(url,session.get(path,headers))
@@ -629,7 +741,7 @@ module Spidr
629
741
  # @since 0.2.2
630
742
  #
631
743
  def post_page(url,post_data='')
632
- url = URI(url.to_s) unless url.kind_of?(URI)
744
+ url = URI(url)
633
745
 
634
746
  prepare_request(url) do |session,path,headers|
635
747
  new_page = Page.new(url,session.post(path,post_data,headers))
@@ -643,7 +755,7 @@ module Spidr
643
755
  end
644
756
 
645
757
  #
646
- # Visits a given URL, and enqueus the links recovered from the URL
758
+ # Visits a given URL, and enqueues the links recovered from the URL
647
759
  # to be visited later.
648
760
  #
649
761
  # @param [URI::HTTP, String] url
@@ -725,7 +837,7 @@ module Spidr
725
837
 
726
838
  unless @host_headers.empty?
727
839
  @host_headers.each do |name,header|
728
- if host.match(name)
840
+ if url.host.match(name)
729
841
  headers['Host'] = header
730
842
  break
731
843
  end
@@ -769,8 +881,6 @@ module Spidr
769
881
  # @since 0.2.2
770
882
  #
771
883
  def prepare_request(url,&block)
772
- host = url.host
773
- port = url.port
774
884
  path = unless url.path.empty?
775
885
  url.path
776
886
  else
@@ -34,7 +34,7 @@ module Spidr
34
34
  #
35
35
  def [](url)
36
36
  # normalize the url
37
- url = URI(url.to_s) unless url.kind_of?(URI)
37
+ url = URI(url)
38
38
 
39
39
  key = [url.scheme, url.host, url.port]
40
40
  paths = @credentials[key]
@@ -42,7 +42,7 @@ module Spidr
42
42
  return nil unless paths
43
43
 
44
44
  # longest path first
45
- ordered_paths = paths.keys.sort_by { |key| key.length }.reverse
45
+ ordered_paths = paths.keys.sort_by { |path_key| -path_key.length }
46
46
 
47
47
  # directories of the path
48
48
  path_dirs = URI.expand_path(url.path).split('/')
@@ -70,7 +70,7 @@ module Spidr
70
70
  #
71
71
  def []=(url,auth)
72
72
  # normalize the url
73
- url = URI(url.to_s) unless url.kind_of?(URI)
73
+ url = URI(url)
74
74
 
75
75
  # normalize the URL path
76
76
  path = URI.expand_path(url.path)
@@ -109,7 +109,7 @@ module Spidr
109
109
  # or `nil` if no authorization exists.
110
110
  #
111
111
  # @param [URI] url
112
- # The url.
112
+ # The URL.
113
113
  #
114
114
  # @return [String, nil]
115
115
  # The base64 encoded authorization string or `nil`.
@@ -118,7 +118,7 @@ module Spidr
118
118
  #
119
119
  def for_url(url)
120
120
  if (auth = self[url])
121
- return Base64.encode64("#{auth.username}:#{auth.password}")
121
+ Base64.encode64("#{auth.username}:#{auth.password}")
122
122
  end
123
123
  end
124
124
 
@@ -144,7 +144,11 @@ module Spidr
144
144
  # @since 0.2.2
145
145
  #
146
146
  def size
147
- @credentials.inject(0) { |res, arr| res + arr[1].length }
147
+ total = 0
148
+
149
+ @credentials.each_value { |paths| total += paths.length }
150
+
151
+ return total
148
152
  end
149
153
 
150
154
  #
@@ -221,5 +221,56 @@ module Spidr
221
221
  def zip?
222
222
  is_content_type?('application/zip')
223
223
  end
224
+
225
+ #
226
+ # Determines if the page is a PNG image.
227
+ #
228
+ # @return [Boolean]
229
+ # Specifies whether the page is a PNG image.
230
+ #
231
+ # @since 0.7.0
232
+ #
233
+ def png?
234
+ is_content_type?('image/png')
235
+ end
236
+
237
+ #
238
+ # Determines if the page is a GIF image.
239
+ #
240
+ # @return [Boolean]
241
+ # Specifies whether the page is a GIF image.
242
+ #
243
+ # @since 0.7.0
244
+ #
245
+ def gif?
246
+ is_content_type?('image/gif')
247
+ end
248
+
249
+ #
250
+ # Determines if the page is a JPEG image.
251
+ #
252
+ # @return [Boolean]
253
+ # Specifies whether the page is a JPEG image.
254
+ #
255
+ # @since 0.7.0
256
+ #
257
+ def jpeg?
258
+ is_content_type?('image/jpeg')
259
+ end
260
+
261
+ #
262
+ # Determines if the page is an ICO image.
263
+ #
264
+ # @return [Boolean]
265
+ # Specifies whether the page is an ICO image.
266
+ #
267
+ # @since 0.7.0
268
+ #
269
+ def ico?
270
+ is_content_type?('image/x-icon') ||
271
+ is_content_type?('image/vnd.microsoft.icon')
272
+ end
273
+
274
+ alias icon? ico?
224
275
  end
225
276
  end