spidr 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,10 @@ require 'spidr/actions/exceptions/skip_link'
3
3
  require 'spidr/actions/exceptions/skip_page'
4
4
 
5
5
  module Spidr
6
+ #
7
+ # The {Actions} module adds methods to {Agent} for controlling the
8
+ # spidering of links.
9
+ #
6
10
  module Actions
7
11
  def initialize(options={})
8
12
  @paused = false
@@ -1,5 +1,8 @@
1
1
  module Spidr
2
2
  module Actions
3
+ #
4
+ # The base {Actions} exception class.
5
+ #
3
6
  class Action < RuntimeError
4
7
  end
5
8
  end
@@ -2,6 +2,9 @@ require 'spidr/actions/exceptions/action'
2
2
 
3
3
  module Spidr
4
4
  module Actions
5
+ #
6
+ # An {Actions} exception class used to pause a running {Agent}.
7
+ #
5
8
  class Paused < Action
6
9
  end
7
10
  end
@@ -2,6 +2,10 @@ require 'spidr/actions/exceptions/action'
2
2
 
3
3
  module Spidr
4
4
  module Actions
5
+ #
6
+ # An {Actions} exception class which causes a running {Agent} to
7
+ # skip a link.
8
+ #
5
9
  class SkipLink < Action
6
10
  end
7
11
  end
@@ -2,6 +2,10 @@ require 'spidr/actions/exceptions/action'
2
2
 
3
3
  module Spidr
4
4
  module Actions
5
+ #
6
+ # An {Actions} exception class which causes a running {Agent} to
7
+ # skip a {Page}, and all links within that page.
8
+ #
5
9
  class SkipPage < Action
6
10
  end
7
11
  end
data/lib/spidr/agent.rb CHANGED
@@ -19,6 +19,12 @@ module Spidr
19
19
  include Events
20
20
  include Actions
21
21
 
22
+ # HTTP Host Header to use
23
+ attr_accessor :host_header
24
+
25
+ # HTTP Host Headers to use for specific hosts
26
+ attr_reader :host_headers
27
+
22
28
  # User-Agent to use
23
29
  attr_accessor :user_agent
24
30
 
@@ -64,6 +70,12 @@ module Spidr
64
70
  # @option :proxy [String] :password
65
71
  # The password to authenticate with.
66
72
  #
73
+ # @option options [String] :host_header
74
+ # The HTTP Host header to use with each request.
75
+ #
76
+ # @option options [Hash{String,Regexp => String}] :host_headers
77
+ # The HTTP Host headers to use for specific hosts.
78
+ #
67
79
  # @option options [String] :user_agent (Spidr.user_agent)
68
80
  # The User-Agent string to send with each requests.
69
81
  #
@@ -87,6 +99,13 @@ module Spidr
87
99
  # The newly created agent.
88
100
  #
89
101
  def initialize(options={},&block)
102
+ @host_header = options[:host_header]
103
+ @host_headers = {}
104
+
105
+ if options[:host_headers]
106
+ @host_headers.merge!(options[:host_headers])
107
+ end
108
+
90
109
  @user_agent = (options[:user_agent] || Spidr.user_agent)
91
110
  @referer = options[:referer]
92
111
 
@@ -473,7 +492,7 @@ module Spidr
473
492
  # The page for the response.
474
493
  #
475
494
  # @return [Page, nil]
476
- # The page for the response, or +nil+ if the request failed.
495
+ # The page for the response, or `nil` if the request failed.
477
496
  #
478
497
  def get_page(url,&block)
479
498
  url = URI(url.to_s)
@@ -506,7 +525,7 @@ module Spidr
506
525
  # The page for the response.
507
526
  #
508
527
  # @return [Page, nil]
509
- # The page for the response, or +nil+ if the request failed.
528
+ # The page for the response, or `nil` if the request failed.
510
529
  #
511
530
  # @since 0.2.2
512
531
  #
@@ -538,7 +557,7 @@ module Spidr
538
557
  # The page which was visited.
539
558
  #
540
559
  # @return [Page, nil]
541
- # The page that was visited. If +nil+ is returned, either the request
560
+ # The page that was visited. If `nil` is returned, either the request
542
561
  # for the page failed, or the page was skipped.
543
562
  #
544
563
  def visit_page(url,&block)
@@ -558,7 +577,20 @@ module Spidr
558
577
  rescue Actions::Action
559
578
  end
560
579
 
561
- page.urls.each { |next_url| enqueue(next_url) }
580
+ page.urls.each do |next_url|
581
+ begin
582
+ @every_link_blocks.each do |link_block|
583
+ link_block.call(page.url,next_url)
584
+ end
585
+ rescue Actions::Paused => action
586
+ raise(action)
587
+ rescue Actions::SkipLink
588
+ next
589
+ rescue Actions::Action
590
+ end
591
+
592
+ enqueue(next_url)
593
+ end
562
594
  end
563
595
  end
564
596
 
@@ -566,8 +598,8 @@ module Spidr
566
598
  # Converts the agent into a Hash.
567
599
  #
568
600
  # @return [Hash]
569
- # The agent represented as a Hash containing the +history+ and
570
- # the +queue+ of the agent.
601
+ # The agent represented as a Hash containing the `history` and
602
+ # the `queue` of the agent.
571
603
  #
572
604
  def to_hash
573
605
  {:history => @history, :queue => @queue}
@@ -609,20 +641,32 @@ module Spidr
609
641
  # append the URL query to the path
610
642
  path += "?#{url.query}" if url.query
611
643
 
612
- begin
613
- sleep(@delay) if @delay > 0
614
-
615
- headers = {}
616
- headers['User-Agent'] = @user_agent if @user_agent
617
- headers['Referer'] = @referer if @referer
644
+ # set any additional HTTP headers
645
+ headers = {}
618
646
 
619
- if (authorization = @authorized.for_url(url))
620
- headers['Authorization'] = "Basic #{authorization}"
647
+ unless @host_headers.empty?
648
+ @host_headers.each do |name,header|
649
+ if host.match(name)
650
+ headers['Host'] = header
651
+ break
652
+ end
621
653
  end
654
+ end
622
655
 
623
- if (header_cookies = @cookies.for_host(url.host))
624
- headers['Cookie'] = header_cookies
625
- end
656
+ headers['Host'] ||= @host_header if @host_header
657
+ headers['User-Agent'] = @user_agent if @user_agent
658
+ headers['Referer'] = @referer if @referer
659
+
660
+ if (authorization = @authorized.for_url(url))
661
+ headers['Authorization'] = "Basic #{authorization}"
662
+ end
663
+
664
+ if (header_cookies = @cookies.for_host(url.host))
665
+ headers['Cookie'] = header_cookies
666
+ end
667
+
668
+ begin
669
+ sleep(@delay) if @delay > 0
626
670
 
627
671
  block.call(@sessions[url],path,headers)
628
672
  rescue SystemCallError,
@@ -1,4 +1,7 @@
1
1
  module Spidr
2
+ #
3
+ # Represents HTTP Authentication credentials for a website.
4
+ #
2
5
  class AuthCredential
3
6
 
4
7
  # The username
@@ -5,6 +5,10 @@ require 'spidr/page'
5
5
  require 'base64'
6
6
 
7
7
  module Spidr
8
+ #
9
+ # Stores {AuthCredential} objects organized by a website's scheme,
10
+ # host-name and sub-directory.
11
+ #
8
12
  class AuthStore
9
13
 
10
14
  #
@@ -24,13 +28,13 @@ module Spidr
24
28
  #
25
29
  # @return [AuthCredential, nil]
26
30
  # Closest matching {AuthCredential} values for the URL,
27
- # or +nil+ if nothing matches.
31
+ # or `nil` if nothing matches.
28
32
  #
29
33
  # @since 0.2.2
30
34
  #
31
35
  def [](url)
32
36
  # normalize the url
33
- url = URI(url) unless url.kind_of?(URI)
37
+ url = URI(url.to_s) unless url.kind_of?(URI)
34
38
 
35
39
  key = [url.scheme, url.host, url.port]
36
40
  paths = @credentials[key]
@@ -64,9 +68,9 @@ module Spidr
64
68
  #
65
69
  # @since 0.2.2
66
70
  #
67
- def []=(url, auth)
71
+ def []=(url,auth)
68
72
  # normalize the url
69
- url = URI(url) unless url.kind_of?(URI)
73
+ url = URI(url.to_s) unless url.kind_of?(URI)
70
74
 
71
75
  # normalize the URL path
72
76
  path = URI.expand_path(url.path)
@@ -96,19 +100,19 @@ module Spidr
96
100
  #
97
101
  # @since 0.2.2
98
102
  #
99
- def add(url, username, password)
100
- self[url] = AuthCredential.new(username, password)
103
+ def add(url,username,password)
104
+ self[url] = AuthCredential.new(username,password)
101
105
  end
102
106
 
103
107
  #
104
108
  # Returns the base64 encoded authorization string for the URL
105
- # or +nil+ if no authorization exists.
109
+ # or `nil` if no authorization exists.
106
110
  #
107
111
  # @param [URI] url
108
112
  # The url.
109
113
  #
110
114
  # @return [String, nil]
111
- # The base64 encoded authorizatio string or +nil+.
115
+ # The base64 encoded authorization string or `nil`.
112
116
  #
113
117
  # @since 0.2.2
114
118
  #
@@ -3,6 +3,9 @@ require 'spidr/page'
3
3
  require 'set'
4
4
 
5
5
  module Spidr
6
+ #
7
+ # Stores HTTP Cookies organized by host-name.
8
+ #
6
9
  class CookieJar
7
10
 
8
11
  include Enumerable
@@ -47,7 +50,7 @@ module Spidr
47
50
  # Host or domain name for cookies.
48
51
  #
49
52
  # @return [String, nil]
50
- # The cookie values or +nil+ if the host does not have a cookie in the
53
+ # The cookie values or `nil` if the host does not have a cookie in the
51
54
  # jar.
52
55
  #
53
56
  # @since 0.2.2
data/lib/spidr/events.rb CHANGED
@@ -1,4 +1,9 @@
1
1
  module Spidr
2
+ #
3
+ # The {Events} module adds methods to {Agent} for registering
4
+ # callbacks which will receive URLs, links, headers and pages, when
5
+ # they are visited.
6
+ #
2
7
  module Events
3
8
  def initialize(options={})
4
9
  super(options)
@@ -8,6 +13,7 @@ module Spidr
8
13
  @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
9
14
 
10
15
  @every_page_blocks = []
16
+ @every_link_blocks = []
11
17
  end
12
18
 
13
19
  #
@@ -499,5 +505,24 @@ module Spidr
499
505
  block.call(page) if (block && page.zip?)
500
506
  end
501
507
  end
508
+
509
+ #
510
+ # Passes every origin and destination URI of each link to a given
511
+ # block.
512
+ #
513
+ # @yield [origin,dest]
514
+ # The block will be passed every origin and destination URI of
515
+ # each link.
516
+ #
517
+ # @yieldparam [URI::HTTP] origin
518
+ # The URI that a link originated from.
519
+ #
520
+ # @yieldparam [URI::HTTP] dest
521
+ # The destination URI of a link.
522
+ #
523
+ def every_link(&block)
524
+ @every_link_blocks << block
525
+ return self
526
+ end
502
527
  end
503
528
  end
data/lib/spidr/filters.rb CHANGED
@@ -1,6 +1,10 @@
1
1
  require 'spidr/rules'
2
2
 
3
3
  module Spidr
4
+ #
5
+ # The {Filters} module adds methods to {Agent} for controlling which
6
+ # URLs the agent will visit.
7
+ #
4
8
  module Filters
5
9
  def self.included(base)
6
10
  base.module_eval do
@@ -17,7 +21,7 @@ module Spidr
17
21
  #
18
22
  # @option options [Array] :schemes (['http', 'https'])
19
23
  # The list of acceptable URI schemes to visit.
20
- # The +https+ scheme will be ignored if +net/https+ cannot be loaded.
24
+ # The `https` scheme will be ignored if `net/https` cannot be loaded.
21
25
  #
22
26
  # @option options [String] :host
23
27
  # The host-name to visit.
data/lib/spidr/page.rb CHANGED
@@ -5,6 +5,9 @@ require 'uri'
5
5
  require 'nokogiri'
6
6
 
7
7
  module Spidr
8
+ #
9
+ # Represents a requested page from a website.
10
+ #
8
11
  class Page
9
12
 
10
13
  # Reserved names used within Cookie strings
@@ -46,10 +49,10 @@ module Spidr
46
49
  end
47
50
 
48
51
  #
49
- # Determines if the response code is +200+.
52
+ # Determines if the response code is `200`.
50
53
  #
51
54
  # @return [Boolean]
52
- # Specifies whether the response code is +200+.
55
+ # Specifies whether the response code is `200`.
53
56
  #
54
57
  def is_ok?
55
58
  code == 200
@@ -58,10 +61,10 @@ module Spidr
58
61
  alias ok? is_ok?
59
62
 
60
63
  #
61
- # Determines if the response code is +301+ or +307+.
64
+ # Determines if the response code is `301` or `307`.
62
65
  #
63
66
  # @return [Boolean]
64
- # Specifies whether the response code is +301+ or +307+.
67
+ # Specifies whether the response code is `301` or `307`.
65
68
  #
66
69
  def is_redirect?
67
70
  (code == 301 || code == 307)
@@ -70,30 +73,30 @@ module Spidr
70
73
  alias redirect? is_redirect?
71
74
 
72
75
  #
73
- # Determines if the response code is +308+.
76
+ # Determines if the response code is `308`.
74
77
  #
75
78
  # @return [Boolean]
76
- # Specifies whether the response code is +308+.
79
+ # Specifies whether the response code is `308`.
77
80
  #
78
81
  def timedout?
79
82
  code == 308
80
83
  end
81
84
 
82
85
  #
83
- # Determines if the response code is +400+.
86
+ # Determines if the response code is `400`.
84
87
  #
85
88
  # @return [Boolean]
86
- # Specifies whether the response code is +400+.
89
+ # Specifies whether the response code is `400`.
87
90
  #
88
91
  def bad_request?
89
92
  code == 400
90
93
  end
91
94
 
92
95
  #
93
- # Determines if the response code is +401+.
96
+ # Determines if the response code is `401`.
94
97
  #
95
98
  # @return [Boolean]
96
- # Specifies whether the response code is +401+.
99
+ # Specifies whether the response code is `401`.
97
100
  #
98
101
  def is_unauthorized?
99
102
  code == 401
@@ -102,10 +105,10 @@ module Spidr
102
105
  alias unauthorized? is_unauthorized?
103
106
 
104
107
  #
105
- # Determines if the response code is +403+.
108
+ # Determines if the response code is `403`.
106
109
  #
107
110
  # @return [Boolean]
108
- # Specifies whether the response code is +403+.
111
+ # Specifies whether the response code is `403`.
109
112
  #
110
113
  def is_forbidden?
111
114
  code == 403
@@ -114,10 +117,10 @@ module Spidr
114
117
  alias forbidden? is_forbidden?
115
118
 
116
119
  #
117
- # Determines if the response code is +404+.
120
+ # Determines if the response code is `404`.
118
121
  #
119
122
  # @return [Boolean]
120
- # Specifies whether the response code is +404+.
123
+ # Specifies whether the response code is `404`.
121
124
  #
122
125
  def is_missing?
123
126
  code == 404
@@ -126,10 +129,10 @@ module Spidr
126
129
  alias missing? is_missing?
127
130
 
128
131
  #
129
- # Determines if the response code is +500+.
132
+ # Determines if the response code is `500`.
130
133
  #
131
134
  # @return [Boolean]
132
- # Specifies whether the response code is +500+.
135
+ # Specifies whether the response code is `500`.
133
136
  #
134
137
  def had_internal_server_error?
135
138
  code == 500
@@ -306,12 +309,14 @@ module Spidr
306
309
  def cookie_params
307
310
  params = {}
308
311
 
309
- cookies.each do |key_value|
310
- key, value = key_value.split('=',2)
312
+ cookies.each do |cookie|
313
+ cookie.split('; ').each do |key_value|
314
+ key, value = key_value.split('=',2)
311
315
 
312
- next if RESERVED_COOKIE_NAMES.include?(key)
316
+ next if RESERVED_COOKIE_NAMES.include?(key)
313
317
 
314
- params[key] = (value || '')
318
+ params[key] = (value || '')
319
+ end
315
320
  end
316
321
 
317
322
  return params
@@ -332,7 +337,7 @@ module Spidr
332
337
  #
333
338
  # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
334
339
  # The document that represents HTML or XML pages.
335
- # Returns +nil+ if the page is neither HTML, XML, RSS, Atom or if
340
+ # Returns `nil` if the page is neither HTML, XML, RSS, Atom or if
336
341
  # the page could not be parsed properly.
337
342
  #
338
343
  # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
@@ -380,7 +385,7 @@ module Spidr
380
385
  # Searches for the first occurrence an XPath or CSS Path expression.
381
386
  #
382
387
  # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
383
- # The first matched node. Returns +nil+ if no nodes could be matched,
388
+ # The first matched node. Returns `nil` if no nodes could be matched,
384
389
  # or if the page is not a HTML or XML document.
385
390
  #
386
391
  # @example
@@ -416,7 +421,7 @@ module Spidr
416
421
  #
417
422
  # @return [Array<String>]
418
423
  # All links within the HTML page, frame/iframe source URLs and any
419
- # links in the +Location+ header.
424
+ # links in the `Location` header.
420
425
  #
421
426
  def links
422
427
  urls = []
@@ -502,7 +507,7 @@ module Spidr
502
507
  protected
503
508
 
504
509
  #
505
- # Provides transparent access to the values in +headers+.
510
+ # Provides transparent access to the values in `headers`.
506
511
  #
507
512
  def method_missing(sym,*args,&block)
508
513
  if (args.empty? && block.nil?)