spidr 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,6 +3,10 @@ require 'spidr/actions/exceptions/skip_link'
3
3
  require 'spidr/actions/exceptions/skip_page'
4
4
 
5
5
  module Spidr
6
+ #
7
+ # The {Actions} module adds methods to {Agent} for controlling the
8
+ # spidering of links.
9
+ #
6
10
  module Actions
7
11
  def initialize(options={})
8
12
  @paused = false
@@ -1,5 +1,8 @@
1
1
  module Spidr
2
2
  module Actions
3
+ #
4
+ # The base {Actions} exception class.
5
+ #
3
6
  class Action < RuntimeError
4
7
  end
5
8
  end
@@ -2,6 +2,9 @@ require 'spidr/actions/exceptions/action'
2
2
 
3
3
  module Spidr
4
4
  module Actions
5
+ #
6
+ # An {Actions} exception class used to pause a running {Agent}.
7
+ #
5
8
  class Paused < Action
6
9
  end
7
10
  end
@@ -2,6 +2,10 @@ require 'spidr/actions/exceptions/action'
2
2
 
3
3
  module Spidr
4
4
  module Actions
5
+ #
6
+ # An {Actions} exception class which causes a running {Agent} to
7
+ # skip a link.
8
+ #
5
9
  class SkipLink < Action
6
10
  end
7
11
  end
@@ -2,6 +2,10 @@ require 'spidr/actions/exceptions/action'
2
2
 
3
3
  module Spidr
4
4
  module Actions
5
+ #
6
+ # An {Actions} exception class which causes a running {Agent} to
7
+ # skip a {Page}, and all links within that page.
8
+ #
5
9
  class SkipPage < Action
6
10
  end
7
11
  end
data/lib/spidr/agent.rb CHANGED
@@ -19,6 +19,12 @@ module Spidr
19
19
  include Events
20
20
  include Actions
21
21
 
22
+ # HTTP Host Header to use
23
+ attr_accessor :host_header
24
+
25
+ # HTTP Host Headers to use for specific hosts
26
+ attr_reader :host_headers
27
+
22
28
  # User-Agent to use
23
29
  attr_accessor :user_agent
24
30
 
@@ -64,6 +70,12 @@ module Spidr
64
70
  # @option :proxy [String] :password
65
71
  # The password to authenticate with.
66
72
  #
73
+ # @option options [String] :host_header
74
+ # The HTTP Host header to use with each request.
75
+ #
76
+ # @option options [Hash{String,Regexp => String}] :host_headers
77
+ # The HTTP Host headers to use for specific hosts.
78
+ #
67
79
  # @option options [String] :user_agent (Spidr.user_agent)
68
80
  # The User-Agent string to send with each request.
69
81
  #
@@ -87,6 +99,13 @@ module Spidr
87
99
  # The newly created agent.
88
100
  #
89
101
  def initialize(options={},&block)
102
+ @host_header = options[:host_header]
103
+ @host_headers = {}
104
+
105
+ if options[:host_headers]
106
+ @host_headers.merge!(options[:host_headers])
107
+ end
108
+
90
109
  @user_agent = (options[:user_agent] || Spidr.user_agent)
91
110
  @referer = options[:referer]
92
111
 
@@ -473,7 +492,7 @@ module Spidr
473
492
  # The page for the response.
474
493
  #
475
494
  # @return [Page, nil]
476
- # The page for the response, or +nil+ if the request failed.
495
+ # The page for the response, or `nil` if the request failed.
477
496
  #
478
497
  def get_page(url,&block)
479
498
  url = URI(url.to_s)
@@ -506,7 +525,7 @@ module Spidr
506
525
  # The page for the response.
507
526
  #
508
527
  # @return [Page, nil]
509
- # The page for the response, or +nil+ if the request failed.
528
+ # The page for the response, or `nil` if the request failed.
510
529
  #
511
530
  # @since 0.2.2
512
531
  #
@@ -538,7 +557,7 @@ module Spidr
538
557
  # The page which was visited.
539
558
  #
540
559
  # @return [Page, nil]
541
- # The page that was visited. If +nil+ is returned, either the request
560
+ # The page that was visited. If `nil` is returned, either the request
542
561
  # for the page failed, or the page was skipped.
543
562
  #
544
563
  def visit_page(url,&block)
@@ -558,7 +577,20 @@ module Spidr
558
577
  rescue Actions::Action
559
578
  end
560
579
 
561
- page.urls.each { |next_url| enqueue(next_url) }
580
+ page.urls.each do |next_url|
581
+ begin
582
+ @every_link_blocks.each do |link_block|
583
+ link_block.call(page.url,next_url)
584
+ end
585
+ rescue Actions::Paused => action
586
+ raise(action)
587
+ rescue Actions::SkipLink
588
+ next
589
+ rescue Actions::Action
590
+ end
591
+
592
+ enqueue(next_url)
593
+ end
562
594
  end
563
595
  end
564
596
 
@@ -566,8 +598,8 @@ module Spidr
566
598
  # Converts the agent into a Hash.
567
599
  #
568
600
  # @return [Hash]
569
- # The agent represented as a Hash containing the +history+ and
570
- # the +queue+ of the agent.
601
+ # The agent represented as a Hash containing the `history` and
602
+ # the `queue` of the agent.
571
603
  #
572
604
  def to_hash
573
605
  {:history => @history, :queue => @queue}
@@ -609,20 +641,32 @@ module Spidr
609
641
  # append the URL query to the path
610
642
  path += "?#{url.query}" if url.query
611
643
 
612
- begin
613
- sleep(@delay) if @delay > 0
614
-
615
- headers = {}
616
- headers['User-Agent'] = @user_agent if @user_agent
617
- headers['Referer'] = @referer if @referer
644
+ # set any additional HTTP headers
645
+ headers = {}
618
646
 
619
- if (authorization = @authorized.for_url(url))
620
- headers['Authorization'] = "Basic #{authorization}"
647
+ unless @host_headers.empty?
648
+ @host_headers.each do |name,header|
649
+ if host.match(name)
650
+ headers['Host'] = header
651
+ break
652
+ end
621
653
  end
654
+ end
622
655
 
623
- if (header_cookies = @cookies.for_host(url.host))
624
- headers['Cookie'] = header_cookies
625
- end
656
+ headers['Host'] ||= @host_header if @host_header
657
+ headers['User-Agent'] = @user_agent if @user_agent
658
+ headers['Referer'] = @referer if @referer
659
+
660
+ if (authorization = @authorized.for_url(url))
661
+ headers['Authorization'] = "Basic #{authorization}"
662
+ end
663
+
664
+ if (header_cookies = @cookies.for_host(url.host))
665
+ headers['Cookie'] = header_cookies
666
+ end
667
+
668
+ begin
669
+ sleep(@delay) if @delay > 0
626
670
 
627
671
  block.call(@sessions[url],path,headers)
628
672
  rescue SystemCallError,
@@ -1,4 +1,7 @@
1
1
  module Spidr
2
+ #
3
+ # Represents HTTP Authentication credentials for a website.
4
+ #
2
5
  class AuthCredential
3
6
 
4
7
  # The username
@@ -5,6 +5,10 @@ require 'spidr/page'
5
5
  require 'base64'
6
6
 
7
7
  module Spidr
8
+ #
9
+ # Stores {AuthCredential} objects organized by a website's scheme,
10
+ # host-name and sub-directory.
11
+ #
8
12
  class AuthStore
9
13
 
10
14
  #
@@ -24,13 +28,13 @@ module Spidr
24
28
  #
25
29
  # @return [AuthCredential, nil]
26
30
  # Closest matching {AuthCredential} values for the URL,
27
- # or +nil+ if nothing matches.
31
+ # or `nil` if nothing matches.
28
32
  #
29
33
  # @since 0.2.2
30
34
  #
31
35
  def [](url)
32
36
  # normalize the url
33
- url = URI(url) unless url.kind_of?(URI)
37
+ url = URI(url.to_s) unless url.kind_of?(URI)
34
38
 
35
39
  key = [url.scheme, url.host, url.port]
36
40
  paths = @credentials[key]
@@ -64,9 +68,9 @@ module Spidr
64
68
  #
65
69
  # @since 0.2.2
66
70
  #
67
- def []=(url, auth)
71
+ def []=(url,auth)
68
72
  # normalize the url
69
- url = URI(url) unless url.kind_of?(URI)
73
+ url = URI(url.to_s) unless url.kind_of?(URI)
70
74
 
71
75
  # normalize the URL path
72
76
  path = URI.expand_path(url.path)
@@ -96,19 +100,19 @@ module Spidr
96
100
  #
97
101
  # @since 0.2.2
98
102
  #
99
- def add(url, username, password)
100
- self[url] = AuthCredential.new(username, password)
103
+ def add(url,username,password)
104
+ self[url] = AuthCredential.new(username,password)
101
105
  end
102
106
 
103
107
  #
104
108
  # Returns the base64 encoded authorization string for the URL
105
- # or +nil+ if no authorization exists.
109
+ # or `nil` if no authorization exists.
106
110
  #
107
111
  # @param [URI] url
108
112
  # The url.
109
113
  #
110
114
  # @return [String, nil]
111
- # The base64 encoded authorizatio string or +nil+.
115
+ # The base64 encoded authorization string or `nil`.
112
116
  #
113
117
  # @since 0.2.2
114
118
  #
@@ -3,6 +3,9 @@ require 'spidr/page'
3
3
  require 'set'
4
4
 
5
5
  module Spidr
6
+ #
7
+ # Stores HTTP Cookies organized by host-name.
8
+ #
6
9
  class CookieJar
7
10
 
8
11
  include Enumerable
@@ -47,7 +50,7 @@ module Spidr
47
50
  # Host or domain name for cookies.
48
51
  #
49
52
  # @return [String, nil]
50
- # The cookie values or +nil+ if the host does not have a cookie in the
53
+ # The cookie values or `nil` if the host does not have a cookie in the
51
54
  # jar.
52
55
  #
53
56
  # @since 0.2.2
data/lib/spidr/events.rb CHANGED
@@ -1,4 +1,9 @@
1
1
  module Spidr
2
+ #
3
+ # The {Events} module adds methods to {Agent} for registering
4
+ # callbacks which will receive URLs, links, headers and pages, when
5
+ # they are visited.
6
+ #
2
7
  module Events
3
8
  def initialize(options={})
4
9
  super(options)
@@ -8,6 +13,7 @@ module Spidr
8
13
  @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
9
14
 
10
15
  @every_page_blocks = []
16
+ @every_link_blocks = []
11
17
  end
12
18
 
13
19
  #
@@ -499,5 +505,24 @@ module Spidr
499
505
  block.call(page) if (block && page.zip?)
500
506
  end
501
507
  end
508
+
509
+ #
510
+ # Passes every origin and destination URI of each link to a given
511
+ # block.
512
+ #
513
+ # @yield [origin,dest]
514
+ # The block will be passed every origin and destination URI of
515
+ # each link.
516
+ #
517
+ # @yieldparam [URI::HTTP] origin
518
+ # The URI that a link originated from.
519
+ #
520
+ # @yieldparam [URI::HTTP] dest
521
+ # The destination URI of a link.
522
+ #
523
+ def every_link(&block)
524
+ @every_link_blocks << block
525
+ return self
526
+ end
502
527
  end
503
528
  end
data/lib/spidr/filters.rb CHANGED
@@ -1,6 +1,10 @@
1
1
  require 'spidr/rules'
2
2
 
3
3
  module Spidr
4
+ #
5
+ # The {Filters} module adds methods to {Agent} for controlling which
6
+ # URLs the agent will visit.
7
+ #
4
8
  module Filters
5
9
  def self.included(base)
6
10
  base.module_eval do
@@ -17,7 +21,7 @@ module Spidr
17
21
  #
18
22
  # @option options [Array] :schemes (['http', 'https'])
19
23
  # The list of acceptable URI schemes to visit.
20
- # The +https+ scheme will be ignored if +net/https+ cannot be loaded.
24
+ # The `https` scheme will be ignored if `net/https` cannot be loaded.
21
25
  #
22
26
  # @option options [String] :host
23
27
  # The host-name to visit.
data/lib/spidr/page.rb CHANGED
@@ -5,6 +5,9 @@ require 'uri'
5
5
  require 'nokogiri'
6
6
 
7
7
  module Spidr
8
+ #
9
+ # Represents a requested page from a website.
10
+ #
8
11
  class Page
9
12
 
10
13
  # Reserved names used within Cookie strings
@@ -46,10 +49,10 @@ module Spidr
46
49
  end
47
50
 
48
51
  #
49
- # Determines if the response code is +200+.
52
+ # Determines if the response code is `200`.
50
53
  #
51
54
  # @return [Boolean]
52
- # Specifies whether the response code is +200+.
55
+ # Specifies whether the response code is `200`.
53
56
  #
54
57
  def is_ok?
55
58
  code == 200
@@ -58,10 +61,10 @@ module Spidr
58
61
  alias ok? is_ok?
59
62
 
60
63
  #
61
- # Determines if the response code is +301+ or +307+.
64
+ # Determines if the response code is `301` or `307`.
62
65
  #
63
66
  # @return [Boolean]
64
- # Specifies whether the response code is +301+ or +307+.
67
+ # Specifies whether the response code is `301` or `307`.
65
68
  #
66
69
  def is_redirect?
67
70
  (code == 301 || code == 307)
@@ -70,30 +73,30 @@ module Spidr
70
73
  alias redirect? is_redirect?
71
74
 
72
75
  #
73
- # Determines if the response code is +308+.
76
+ # Determines if the response code is `308`.
74
77
  #
75
78
  # @return [Boolean]
76
- # Specifies whether the response code is +308+.
79
+ # Specifies whether the response code is `308`.
77
80
  #
78
81
  def timedout?
79
82
  code == 308
80
83
  end
81
84
 
82
85
  #
83
- # Determines if the response code is +400+.
86
+ # Determines if the response code is `400`.
84
87
  #
85
88
  # @return [Boolean]
86
- # Specifies whether the response code is +400+.
89
+ # Specifies whether the response code is `400`.
87
90
  #
88
91
  def bad_request?
89
92
  code == 400
90
93
  end
91
94
 
92
95
  #
93
- # Determines if the response code is +401+.
96
+ # Determines if the response code is `401`.
94
97
  #
95
98
  # @return [Boolean]
96
- # Specifies whether the response code is +401+.
99
+ # Specifies whether the response code is `401`.
97
100
  #
98
101
  def is_unauthorized?
99
102
  code == 401
@@ -102,10 +105,10 @@ module Spidr
102
105
  alias unauthorized? is_unauthorized?
103
106
 
104
107
  #
105
- # Determines if the response code is +403+.
108
+ # Determines if the response code is `403`.
106
109
  #
107
110
  # @return [Boolean]
108
- # Specifies whether the response code is +403+.
111
+ # Specifies whether the response code is `403`.
109
112
  #
110
113
  def is_forbidden?
111
114
  code == 403
@@ -114,10 +117,10 @@ module Spidr
114
117
  alias forbidden? is_forbidden?
115
118
 
116
119
  #
117
- # Determines if the response code is +404+.
120
+ # Determines if the response code is `404`.
118
121
  #
119
122
  # @return [Boolean]
120
- # Specifies whether the response code is +404+.
123
+ # Specifies whether the response code is `404`.
121
124
  #
122
125
  def is_missing?
123
126
  code == 404
@@ -126,10 +129,10 @@ module Spidr
126
129
  alias missing? is_missing?
127
130
 
128
131
  #
129
- # Determines if the response code is +500+.
132
+ # Determines if the response code is `500`.
130
133
  #
131
134
  # @return [Boolean]
132
- # Specifies whether the response code is +500+.
135
+ # Specifies whether the response code is `500`.
133
136
  #
134
137
  def had_internal_server_error?
135
138
  code == 500
@@ -306,12 +309,14 @@ module Spidr
306
309
  def cookie_params
307
310
  params = {}
308
311
 
309
- cookies.each do |key_value|
310
- key, value = key_value.split('=',2)
312
+ cookies.each do |cookie|
313
+ cookie.split('; ').each do |key_value|
314
+ key, value = key_value.split('=',2)
311
315
 
312
- next if RESERVED_COOKIE_NAMES.include?(key)
316
+ next if RESERVED_COOKIE_NAMES.include?(key)
313
317
 
314
- params[key] = (value || '')
318
+ params[key] = (value || '')
319
+ end
315
320
  end
316
321
 
317
322
  return params
@@ -332,7 +337,7 @@ module Spidr
332
337
  #
333
338
  # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
334
339
  # The document that represents HTML or XML pages.
335
- # Returns +nil+ if the page is neither HTML, XML, RSS, Atom or if
340
+ # Returns `nil` if the page is neither HTML, XML, RSS, Atom or if
336
341
  # the page could not be parsed properly.
337
342
  #
338
343
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
@@ -380,7 +385,7 @@ module Spidr
380
385
  # Searches for the first occurrence of an XPath or CSS Path expression.
381
386
  #
382
387
  # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
383
- # The first matched node. Returns +nil+ if no nodes could be matched,
388
+ # The first matched node. Returns `nil` if no nodes could be matched,
384
389
  # or if the page is not a HTML or XML document.
385
390
  #
386
391
  # @example
@@ -416,7 +421,7 @@ module Spidr
416
421
  #
417
422
  # @return [Array<String>]
418
423
  # All links within the HTML page, frame/iframe source URLs and any
419
- # links in the +Location+ header.
424
+ # links in the `Location` header.
420
425
  #
421
426
  def links
422
427
  urls = []
@@ -502,7 +507,7 @@ module Spidr
502
507
  protected
503
508
 
504
509
  #
505
- # Provides transparent access to the values in +headers+.
510
+ # Provides transparent access to the values in `headers`.
506
511
  #
507
512
  def method_missing(sym,*args,&block)
508
513
  if (args.empty? && block.nil?)