spidr 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +8 -0
- data/.specopts +1 -0
- data/.yardopts +1 -0
- data/{History.rdoc → ChangeLog.md} +47 -39
- data/LICENSE.txt +21 -0
- data/{README.rdoc → README.md} +57 -49
- data/Rakefile +36 -22
- data/lib/spidr/actions/actions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +3 -0
- data/lib/spidr/actions/exceptions/paused.rb +3 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +4 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +4 -0
- data/lib/spidr/agent.rb +61 -17
- data/lib/spidr/auth_credential.rb +3 -0
- data/lib/spidr/auth_store.rb +12 -8
- data/lib/spidr/cookie_jar.rb +4 -1
- data/lib/spidr/events.rb +25 -0
- data/lib/spidr/filters.rb +5 -1
- data/lib/spidr/page.rb +29 -24
- data/lib/spidr/rules.rb +4 -0
- data/lib/spidr/sanitizers.rb +4 -0
- data/lib/spidr/session_cache.rb +26 -1
- data/lib/spidr/version.rb +1 -1
- data/spec/auth_store_spec.rb +85 -0
- data/spec/cookie_jar_spec.rb +108 -0
- data/spec/page_spec.rb +0 -1
- data/spec/session_cache.rb +58 -0
- data/spidr.gemspec +115 -0
- metadata +99 -90
- data.tar.gz.sig +0 -2
- data/Manifest.txt +0 -41
- data/tasks/spec.rb +0 -10
- data/tasks/yard.rb +0 -12
- metadata.gz.sig +0 -0
@@ -3,6 +3,10 @@ require 'spidr/actions/exceptions/skip_link'
|
|
3
3
|
require 'spidr/actions/exceptions/skip_page'
|
4
4
|
|
5
5
|
module Spidr
|
6
|
+
#
|
7
|
+
# The {Actions} module adds methods to {Agent} for controlling the
|
8
|
+
# spidering of links.
|
9
|
+
#
|
6
10
|
module Actions
|
7
11
|
def initialize(options={})
|
8
12
|
@paused = false
|
data/lib/spidr/agent.rb
CHANGED
@@ -19,6 +19,12 @@ module Spidr
|
|
19
19
|
include Events
|
20
20
|
include Actions
|
21
21
|
|
22
|
+
# HTTP Host Header to use
|
23
|
+
attr_accessor :host_header
|
24
|
+
|
25
|
+
# HTTP Host Headers to use for specific hosts
|
26
|
+
attr_reader :host_headers
|
27
|
+
|
22
28
|
# User-Agent to use
|
23
29
|
attr_accessor :user_agent
|
24
30
|
|
@@ -64,6 +70,12 @@ module Spidr
|
|
64
70
|
# @option :proxy [String] :password
|
65
71
|
# The password to authenticate with.
|
66
72
|
#
|
73
|
+
# @option options [String] :host_header
|
74
|
+
# The HTTP Host header to use with each request.
|
75
|
+
#
|
76
|
+
# @option options [Hash{String,Regexp => String}] :host_headers
|
77
|
+
# The HTTP Host headers to use for specific hosts.
|
78
|
+
#
|
67
79
|
# @option options [String] :user_agent (Spidr.user_agent)
|
68
80
|
# The User-Agent string to send with each request.
|
69
81
|
#
|
@@ -87,6 +99,13 @@ module Spidr
|
|
87
99
|
# The newly created agent.
|
88
100
|
#
|
89
101
|
def initialize(options={},&block)
|
102
|
+
@host_header = options[:host_header]
|
103
|
+
@host_headers = {}
|
104
|
+
|
105
|
+
if options[:host_headers]
|
106
|
+
@host_headers.merge!(options[:host_headers])
|
107
|
+
end
|
108
|
+
|
90
109
|
@user_agent = (options[:user_agent] || Spidr.user_agent)
|
91
110
|
@referer = options[:referer]
|
92
111
|
|
@@ -473,7 +492,7 @@ module Spidr
|
|
473
492
|
# The page for the response.
|
474
493
|
#
|
475
494
|
# @return [Page, nil]
|
476
|
-
# The page for the response, or
|
495
|
+
# The page for the response, or `nil` if the request failed.
|
477
496
|
#
|
478
497
|
def get_page(url,&block)
|
479
498
|
url = URI(url.to_s)
|
@@ -506,7 +525,7 @@ module Spidr
|
|
506
525
|
# The page for the response.
|
507
526
|
#
|
508
527
|
# @return [Page, nil]
|
509
|
-
# The page for the response, or
|
528
|
+
# The page for the response, or `nil` if the request failed.
|
510
529
|
#
|
511
530
|
# @since 0.2.2
|
512
531
|
#
|
@@ -538,7 +557,7 @@ module Spidr
|
|
538
557
|
# The page which was visited.
|
539
558
|
#
|
540
559
|
# @return [Page, nil]
|
541
|
-
# The page that was visited. If
|
560
|
+
# The page that was visited. If `nil` is returned, either the request
|
542
561
|
# for the page failed, or the page was skipped.
|
543
562
|
#
|
544
563
|
def visit_page(url,&block)
|
@@ -558,7 +577,20 @@ module Spidr
|
|
558
577
|
rescue Actions::Action
|
559
578
|
end
|
560
579
|
|
561
|
-
page.urls.each
|
580
|
+
page.urls.each do |next_url|
|
581
|
+
begin
|
582
|
+
@every_link_blocks.each do |link_block|
|
583
|
+
link_block.call(page.url,next_url)
|
584
|
+
end
|
585
|
+
rescue Actions::Paused => action
|
586
|
+
raise(action)
|
587
|
+
rescue Actions::SkipLink
|
588
|
+
next
|
589
|
+
rescue Actions::Action
|
590
|
+
end
|
591
|
+
|
592
|
+
enqueue(next_url)
|
593
|
+
end
|
562
594
|
end
|
563
595
|
end
|
564
596
|
|
@@ -566,8 +598,8 @@ module Spidr
|
|
566
598
|
# Converts the agent into a Hash.
|
567
599
|
#
|
568
600
|
# @return [Hash]
|
569
|
-
# The agent represented as a Hash containing the
|
570
|
-
# the
|
601
|
+
# The agent represented as a Hash containing the `history` and
|
602
|
+
# the `queue` of the agent.
|
571
603
|
#
|
572
604
|
def to_hash
|
573
605
|
{:history => @history, :queue => @queue}
|
@@ -609,20 +641,32 @@ module Spidr
|
|
609
641
|
# append the URL query to the path
|
610
642
|
path += "?#{url.query}" if url.query
|
611
643
|
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
headers = {}
|
616
|
-
headers['User-Agent'] = @user_agent if @user_agent
|
617
|
-
headers['Referer'] = @referer if @referer
|
644
|
+
# set any additional HTTP headers
|
645
|
+
headers = {}
|
618
646
|
|
619
|
-
|
620
|
-
|
647
|
+
unless @host_headers.empty?
|
648
|
+
@host_headers.each do |name,header|
|
649
|
+
if host.match(name)
|
650
|
+
headers['Host'] = header
|
651
|
+
break
|
652
|
+
end
|
621
653
|
end
|
654
|
+
end
|
622
655
|
|
623
|
-
|
624
|
-
|
625
|
-
|
656
|
+
headers['Host'] ||= @host_header if @host_header
|
657
|
+
headers['User-Agent'] = @user_agent if @user_agent
|
658
|
+
headers['Referer'] = @referer if @referer
|
659
|
+
|
660
|
+
if (authorization = @authorized.for_url(url))
|
661
|
+
headers['Authorization'] = "Basic #{authorization}"
|
662
|
+
end
|
663
|
+
|
664
|
+
if (header_cookies = @cookies.for_host(url.host))
|
665
|
+
headers['Cookie'] = header_cookies
|
666
|
+
end
|
667
|
+
|
668
|
+
begin
|
669
|
+
sleep(@delay) if @delay > 0
|
626
670
|
|
627
671
|
block.call(@sessions[url],path,headers)
|
628
672
|
rescue SystemCallError,
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -5,6 +5,10 @@ require 'spidr/page'
|
|
5
5
|
require 'base64'
|
6
6
|
|
7
7
|
module Spidr
|
8
|
+
#
|
9
|
+
# Stores {AuthCredential} objects organized by a website's scheme,
|
10
|
+
# host-name and sub-directory.
|
11
|
+
#
|
8
12
|
class AuthStore
|
9
13
|
|
10
14
|
#
|
@@ -24,13 +28,13 @@ module Spidr
|
|
24
28
|
#
|
25
29
|
# @return [AuthCredential, nil]
|
26
30
|
# Closest matching {AuthCredential} values for the URL,
|
27
|
-
# or
|
31
|
+
# or `nil` if nothing matches.
|
28
32
|
#
|
29
33
|
# @since 0.2.2
|
30
34
|
#
|
31
35
|
def [](url)
|
32
36
|
# normalize the url
|
33
|
-
url = URI(url) unless url.kind_of?(URI)
|
37
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
34
38
|
|
35
39
|
key = [url.scheme, url.host, url.port]
|
36
40
|
paths = @credentials[key]
|
@@ -64,9 +68,9 @@ module Spidr
|
|
64
68
|
#
|
65
69
|
# @since 0.2.2
|
66
70
|
#
|
67
|
-
def []=(url,
|
71
|
+
def []=(url,auth)
|
68
72
|
# normalize the url
|
69
|
-
url = URI(url) unless url.kind_of?(URI)
|
73
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
70
74
|
|
71
75
|
# normalize the URL path
|
72
76
|
path = URI.expand_path(url.path)
|
@@ -96,19 +100,19 @@ module Spidr
|
|
96
100
|
#
|
97
101
|
# @since 0.2.2
|
98
102
|
#
|
99
|
-
def add(url,
|
100
|
-
self[url] = AuthCredential.new(username,
|
103
|
+
def add(url,username,password)
|
104
|
+
self[url] = AuthCredential.new(username,password)
|
101
105
|
end
|
102
106
|
|
103
107
|
#
|
104
108
|
# Returns the base64 encoded authorization string for the URL
|
105
|
-
# or
|
109
|
+
# or `nil` if no authorization exists.
|
106
110
|
#
|
107
111
|
# @param [URI] url
|
108
112
|
# The url.
|
109
113
|
#
|
110
114
|
# @return [String, nil]
|
111
|
-
# The base64 encoded authorizatio string or
|
115
|
+
# The base64 encoded authorization string or `nil`.
|
112
116
|
#
|
113
117
|
# @since 0.2.2
|
114
118
|
#
|
data/lib/spidr/cookie_jar.rb
CHANGED
@@ -3,6 +3,9 @@ require 'spidr/page'
|
|
3
3
|
require 'set'
|
4
4
|
|
5
5
|
module Spidr
|
6
|
+
#
|
7
|
+
# Stores HTTP Cookies organized by host-name.
|
8
|
+
#
|
6
9
|
class CookieJar
|
7
10
|
|
8
11
|
include Enumerable
|
@@ -47,7 +50,7 @@ module Spidr
|
|
47
50
|
# Host or domain name for cookies.
|
48
51
|
#
|
49
52
|
# @return [String, nil]
|
50
|
-
# The cookie values or
|
53
|
+
# The cookie values or `nil` if the host does not have a cookie in the
|
51
54
|
# jar.
|
52
55
|
#
|
53
56
|
# @since 0.2.2
|
data/lib/spidr/events.rb
CHANGED
@@ -1,4 +1,9 @@
|
|
1
1
|
module Spidr
|
2
|
+
#
|
3
|
+
# The {Events} module adds methods to {Agent} for registering
|
4
|
+
# callbacks which will receive URLs, links, headers and pages, when
|
5
|
+
# they are visited.
|
6
|
+
#
|
2
7
|
module Events
|
3
8
|
def initialize(options={})
|
4
9
|
super(options)
|
@@ -8,6 +13,7 @@ module Spidr
|
|
8
13
|
@urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
9
14
|
|
10
15
|
@every_page_blocks = []
|
16
|
+
@every_link_blocks = []
|
11
17
|
end
|
12
18
|
|
13
19
|
#
|
@@ -499,5 +505,24 @@ module Spidr
|
|
499
505
|
block.call(page) if (block && page.zip?)
|
500
506
|
end
|
501
507
|
end
|
508
|
+
|
509
|
+
#
|
510
|
+
# Passes every origin and destination URI of each link to a given
|
511
|
+
# block.
|
512
|
+
#
|
513
|
+
# @yield [origin,dest]
|
514
|
+
# The block will be passed every origin and destination URI of
|
515
|
+
# each link.
|
516
|
+
#
|
517
|
+
# @yieldparam [URI::HTTP] origin
|
518
|
+
# The URI that a link originated from.
|
519
|
+
#
|
520
|
+
# @yieldparam [URI::HTTP] dest
|
521
|
+
# The destination URI of a link.
|
522
|
+
#
|
523
|
+
def every_link(&block)
|
524
|
+
@every_link_blocks << block
|
525
|
+
return self
|
526
|
+
end
|
502
527
|
end
|
503
528
|
end
|
data/lib/spidr/filters.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
require 'spidr/rules'
|
2
2
|
|
3
3
|
module Spidr
|
4
|
+
#
|
5
|
+
# The {Filters} module adds methods to {Agent} for controlling which
|
6
|
+
# URLs the agent will visit.
|
7
|
+
#
|
4
8
|
module Filters
|
5
9
|
def self.included(base)
|
6
10
|
base.module_eval do
|
@@ -17,7 +21,7 @@ module Spidr
|
|
17
21
|
#
|
18
22
|
# @option options [Array] :schemes (['http', 'https'])
|
19
23
|
# The list of acceptable URI schemes to visit.
|
20
|
-
# The
|
24
|
+
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
21
25
|
#
|
22
26
|
# @option options [String] :host
|
23
27
|
# The host-name to visit.
|
data/lib/spidr/page.rb
CHANGED
@@ -5,6 +5,9 @@ require 'uri'
|
|
5
5
|
require 'nokogiri'
|
6
6
|
|
7
7
|
module Spidr
|
8
|
+
#
|
9
|
+
# Represents a requested page from a website.
|
10
|
+
#
|
8
11
|
class Page
|
9
12
|
|
10
13
|
# Reserved names used within Cookie strings
|
@@ -46,10 +49,10 @@ module Spidr
|
|
46
49
|
end
|
47
50
|
|
48
51
|
#
|
49
|
-
# Determines if the response code is
|
52
|
+
# Determines if the response code is `200`.
|
50
53
|
#
|
51
54
|
# @return [Boolean]
|
52
|
-
# Specifies whether the response code is
|
55
|
+
# Specifies whether the response code is `200`.
|
53
56
|
#
|
54
57
|
def is_ok?
|
55
58
|
code == 200
|
@@ -58,10 +61,10 @@ module Spidr
|
|
58
61
|
alias ok? is_ok?
|
59
62
|
|
60
63
|
#
|
61
|
-
# Determines if the response code is
|
64
|
+
# Determines if the response code is `301` or `307`.
|
62
65
|
#
|
63
66
|
# @return [Boolean]
|
64
|
-
# Specifies whether the response code is
|
67
|
+
# Specifies whether the response code is `301` or `307`.
|
65
68
|
#
|
66
69
|
def is_redirect?
|
67
70
|
(code == 301 || code == 307)
|
@@ -70,30 +73,30 @@ module Spidr
|
|
70
73
|
alias redirect? is_redirect?
|
71
74
|
|
72
75
|
#
|
73
|
-
# Determines if the response code is
|
76
|
+
# Determines if the response code is `308`.
|
74
77
|
#
|
75
78
|
# @return [Boolean]
|
76
|
-
# Specifies whether the response code is
|
79
|
+
# Specifies whether the response code is `308`.
|
77
80
|
#
|
78
81
|
def timedout?
|
79
82
|
code == 308
|
80
83
|
end
|
81
84
|
|
82
85
|
#
|
83
|
-
# Determines if the response code is
|
86
|
+
# Determines if the response code is `400`.
|
84
87
|
#
|
85
88
|
# @return [Boolean]
|
86
|
-
# Specifies whether the response code is
|
89
|
+
# Specifies whether the response code is `400`.
|
87
90
|
#
|
88
91
|
def bad_request?
|
89
92
|
code == 400
|
90
93
|
end
|
91
94
|
|
92
95
|
#
|
93
|
-
# Determines if the response code is
|
96
|
+
# Determines if the response code is `401`.
|
94
97
|
#
|
95
98
|
# @return [Boolean]
|
96
|
-
# Specifies whether the response code is
|
99
|
+
# Specifies whether the response code is `401`.
|
97
100
|
#
|
98
101
|
def is_unauthorized?
|
99
102
|
code == 401
|
@@ -102,10 +105,10 @@ module Spidr
|
|
102
105
|
alias unauthorized? is_unauthorized?
|
103
106
|
|
104
107
|
#
|
105
|
-
# Determines if the response code is
|
108
|
+
# Determines if the response code is `403`.
|
106
109
|
#
|
107
110
|
# @return [Boolean]
|
108
|
-
# Specifies whether the response code is
|
111
|
+
# Specifies whether the response code is `403`.
|
109
112
|
#
|
110
113
|
def is_forbidden?
|
111
114
|
code == 403
|
@@ -114,10 +117,10 @@ module Spidr
|
|
114
117
|
alias forbidden? is_forbidden?
|
115
118
|
|
116
119
|
#
|
117
|
-
# Determines if the response code is
|
120
|
+
# Determines if the response code is `404`.
|
118
121
|
#
|
119
122
|
# @return [Boolean]
|
120
|
-
# Specifies whether the response code is
|
123
|
+
# Specifies whether the response code is `404`.
|
121
124
|
#
|
122
125
|
def is_missing?
|
123
126
|
code == 404
|
@@ -126,10 +129,10 @@ module Spidr
|
|
126
129
|
alias missing? is_missing?
|
127
130
|
|
128
131
|
#
|
129
|
-
# Determines if the response code is
|
132
|
+
# Determines if the response code is `500`.
|
130
133
|
#
|
131
134
|
# @return [Boolean]
|
132
|
-
# Specifies whether the response code is
|
135
|
+
# Specifies whether the response code is `500`.
|
133
136
|
#
|
134
137
|
def had_internal_server_error?
|
135
138
|
code == 500
|
@@ -306,12 +309,14 @@ module Spidr
|
|
306
309
|
def cookie_params
|
307
310
|
params = {}
|
308
311
|
|
309
|
-
cookies.each do |
|
310
|
-
|
312
|
+
cookies.each do |cookie|
|
313
|
+
cookie.split('; ').each do |key_value|
|
314
|
+
key, value = key_value.split('=',2)
|
311
315
|
|
312
|
-
|
316
|
+
next if RESERVED_COOKIE_NAMES.include?(key)
|
313
317
|
|
314
|
-
|
318
|
+
params[key] = (value || '')
|
319
|
+
end
|
315
320
|
end
|
316
321
|
|
317
322
|
return params
|
@@ -332,7 +337,7 @@ module Spidr
|
|
332
337
|
#
|
333
338
|
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
|
334
339
|
# The document that represents HTML or XML pages.
|
335
|
-
# Returns
|
340
|
+
# Returns `nil` if the page is not HTML, XML, RSS or Atom, or if
|
336
341
|
# the page could not be parsed properly.
|
337
342
|
#
|
338
343
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
@@ -380,7 +385,7 @@ module Spidr
|
|
380
385
|
# Searches for the first occurrence of an XPath or CSS Path expression.
|
381
386
|
#
|
382
387
|
# @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
|
383
|
-
# The first matched node. Returns
|
388
|
+
# The first matched node. Returns `nil` if no nodes could be matched,
|
384
389
|
# or if the page is not a HTML or XML document.
|
385
390
|
#
|
386
391
|
# @example
|
@@ -416,7 +421,7 @@ module Spidr
|
|
416
421
|
#
|
417
422
|
# @return [Array<String>]
|
418
423
|
# All links within the HTML page, frame/iframe source URLs and any
|
419
|
-
# links in the
|
424
|
+
# links in the `Location` header.
|
420
425
|
#
|
421
426
|
def links
|
422
427
|
urls = []
|
@@ -502,7 +507,7 @@ module Spidr
|
|
502
507
|
protected
|
503
508
|
|
504
509
|
#
|
505
|
-
# Provides transparent access to the values in
|
510
|
+
# Provides transparent access to the values in `headers`.
|
506
511
|
#
|
507
512
|
def method_missing(sym,*args,&block)
|
508
513
|
if (args.empty? && block.nil?)
|