spidr 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +8 -0
- data/.specopts +1 -0
- data/.yardopts +1 -0
- data/{History.rdoc → ChangeLog.md} +47 -39
- data/LICENSE.txt +21 -0
- data/{README.rdoc → README.md} +57 -49
- data/Rakefile +36 -22
- data/lib/spidr/actions/actions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +3 -0
- data/lib/spidr/actions/exceptions/paused.rb +3 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +4 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +4 -0
- data/lib/spidr/agent.rb +61 -17
- data/lib/spidr/auth_credential.rb +3 -0
- data/lib/spidr/auth_store.rb +12 -8
- data/lib/spidr/cookie_jar.rb +4 -1
- data/lib/spidr/events.rb +25 -0
- data/lib/spidr/filters.rb +5 -1
- data/lib/spidr/page.rb +29 -24
- data/lib/spidr/rules.rb +4 -0
- data/lib/spidr/sanitizers.rb +4 -0
- data/lib/spidr/session_cache.rb +26 -1
- data/lib/spidr/version.rb +1 -1
- data/spec/auth_store_spec.rb +85 -0
- data/spec/cookie_jar_spec.rb +108 -0
- data/spec/page_spec.rb +0 -1
- data/spec/session_cache.rb +58 -0
- data/spidr.gemspec +115 -0
- metadata +99 -90
- data.tar.gz.sig +0 -2
- data/Manifest.txt +0 -41
- data/tasks/spec.rb +0 -10
- data/tasks/yard.rb +0 -12
- metadata.gz.sig +0 -0
@@ -3,6 +3,10 @@ require 'spidr/actions/exceptions/skip_link'
|
|
3
3
|
require 'spidr/actions/exceptions/skip_page'
|
4
4
|
|
5
5
|
module Spidr
|
6
|
+
#
|
7
|
+
# The {Actions} module adds methods to {Agent} for controlling the
|
8
|
+
# spidering of links.
|
9
|
+
#
|
6
10
|
module Actions
|
7
11
|
def initialize(options={})
|
8
12
|
@paused = false
|
data/lib/spidr/agent.rb
CHANGED
@@ -19,6 +19,12 @@ module Spidr
|
|
19
19
|
include Events
|
20
20
|
include Actions
|
21
21
|
|
22
|
+
# HTTP Host Header to use
|
23
|
+
attr_accessor :host_header
|
24
|
+
|
25
|
+
# HTTP Host Headers to use for specific hosts
|
26
|
+
attr_reader :host_headers
|
27
|
+
|
22
28
|
# User-Agent to use
|
23
29
|
attr_accessor :user_agent
|
24
30
|
|
@@ -64,6 +70,12 @@ module Spidr
|
|
64
70
|
# @option :proxy [String] :password
|
65
71
|
# The password to authenticate with.
|
66
72
|
#
|
73
|
+
# @option options [String] :host_header
|
74
|
+
# The HTTP Host header to use with each request.
|
75
|
+
#
|
76
|
+
# @option options [Hash{String,Regexp => String}] :host_headers
|
77
|
+
# The HTTP Host headers to use for specific hosts.
|
78
|
+
#
|
67
79
|
# @option options [String] :user_agent (Spidr.user_agent)
|
68
80
|
# The User-Agent string to send with each request.
|
69
81
|
#
|
@@ -87,6 +99,13 @@ module Spidr
|
|
87
99
|
# The newly created agent.
|
88
100
|
#
|
89
101
|
def initialize(options={},&block)
|
102
|
+
@host_header = options[:host_header]
|
103
|
+
@host_headers = {}
|
104
|
+
|
105
|
+
if options[:host_headers]
|
106
|
+
@host_headers.merge!(options[:host_headers])
|
107
|
+
end
|
108
|
+
|
90
109
|
@user_agent = (options[:user_agent] || Spidr.user_agent)
|
91
110
|
@referer = options[:referer]
|
92
111
|
|
@@ -473,7 +492,7 @@ module Spidr
|
|
473
492
|
# The page for the response.
|
474
493
|
#
|
475
494
|
# @return [Page, nil]
|
476
|
-
# The page for the response, or
|
495
|
+
# The page for the response, or `nil` if the request failed.
|
477
496
|
#
|
478
497
|
def get_page(url,&block)
|
479
498
|
url = URI(url.to_s)
|
@@ -506,7 +525,7 @@ module Spidr
|
|
506
525
|
# The page for the response.
|
507
526
|
#
|
508
527
|
# @return [Page, nil]
|
509
|
-
# The page for the response, or
|
528
|
+
# The page for the response, or `nil` if the request failed.
|
510
529
|
#
|
511
530
|
# @since 0.2.2
|
512
531
|
#
|
@@ -538,7 +557,7 @@ module Spidr
|
|
538
557
|
# The page which was visited.
|
539
558
|
#
|
540
559
|
# @return [Page, nil]
|
541
|
-
# The page that was visited. If
|
560
|
+
# The page that was visited. If `nil` is returned, either the request
|
542
561
|
# for the page failed, or the page was skipped.
|
543
562
|
#
|
544
563
|
def visit_page(url,&block)
|
@@ -558,7 +577,20 @@ module Spidr
|
|
558
577
|
rescue Actions::Action
|
559
578
|
end
|
560
579
|
|
561
|
-
page.urls.each
|
580
|
+
page.urls.each do |next_url|
|
581
|
+
begin
|
582
|
+
@every_link_blocks.each do |link_block|
|
583
|
+
link_block.call(page.url,next_url)
|
584
|
+
end
|
585
|
+
rescue Actions::Paused => action
|
586
|
+
raise(action)
|
587
|
+
rescue Actions::SkipLink
|
588
|
+
next
|
589
|
+
rescue Actions::Action
|
590
|
+
end
|
591
|
+
|
592
|
+
enqueue(next_url)
|
593
|
+
end
|
562
594
|
end
|
563
595
|
end
|
564
596
|
|
@@ -566,8 +598,8 @@ module Spidr
|
|
566
598
|
# Converts the agent into a Hash.
|
567
599
|
#
|
568
600
|
# @return [Hash]
|
569
|
-
# The agent represented as a Hash containing the
|
570
|
-
# the
|
601
|
+
# The agent represented as a Hash containing the `history` and
|
602
|
+
# the `queue` of the agent.
|
571
603
|
#
|
572
604
|
def to_hash
|
573
605
|
{:history => @history, :queue => @queue}
|
@@ -609,20 +641,32 @@ module Spidr
|
|
609
641
|
# append the URL query to the path
|
610
642
|
path += "?#{url.query}" if url.query
|
611
643
|
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
headers = {}
|
616
|
-
headers['User-Agent'] = @user_agent if @user_agent
|
617
|
-
headers['Referer'] = @referer if @referer
|
644
|
+
# set any additional HTTP headers
|
645
|
+
headers = {}
|
618
646
|
|
619
|
-
|
620
|
-
|
647
|
+
unless @host_headers.empty?
|
648
|
+
@host_headers.each do |name,header|
|
649
|
+
if host.match(name)
|
650
|
+
headers['Host'] = header
|
651
|
+
break
|
652
|
+
end
|
621
653
|
end
|
654
|
+
end
|
622
655
|
|
623
|
-
|
624
|
-
|
625
|
-
|
656
|
+
headers['Host'] ||= @host_header if @host_header
|
657
|
+
headers['User-Agent'] = @user_agent if @user_agent
|
658
|
+
headers['Referer'] = @referer if @referer
|
659
|
+
|
660
|
+
if (authorization = @authorized.for_url(url))
|
661
|
+
headers['Authorization'] = "Basic #{authorization}"
|
662
|
+
end
|
663
|
+
|
664
|
+
if (header_cookies = @cookies.for_host(url.host))
|
665
|
+
headers['Cookie'] = header_cookies
|
666
|
+
end
|
667
|
+
|
668
|
+
begin
|
669
|
+
sleep(@delay) if @delay > 0
|
626
670
|
|
627
671
|
block.call(@sessions[url],path,headers)
|
628
672
|
rescue SystemCallError,
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -5,6 +5,10 @@ require 'spidr/page'
|
|
5
5
|
require 'base64'
|
6
6
|
|
7
7
|
module Spidr
|
8
|
+
#
|
9
|
+
# Stores {AuthCredential} objects organized by a website's scheme,
|
10
|
+
# host-name and sub-directory.
|
11
|
+
#
|
8
12
|
class AuthStore
|
9
13
|
|
10
14
|
#
|
@@ -24,13 +28,13 @@ module Spidr
|
|
24
28
|
#
|
25
29
|
# @return [AuthCredential, nil]
|
26
30
|
# Closest matching {AuthCredential} values for the URL,
|
27
|
-
# or
|
31
|
+
# or `nil` if nothing matches.
|
28
32
|
#
|
29
33
|
# @since 0.2.2
|
30
34
|
#
|
31
35
|
def [](url)
|
32
36
|
# normalize the url
|
33
|
-
url = URI(url) unless url.kind_of?(URI)
|
37
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
34
38
|
|
35
39
|
key = [url.scheme, url.host, url.port]
|
36
40
|
paths = @credentials[key]
|
@@ -64,9 +68,9 @@ module Spidr
|
|
64
68
|
#
|
65
69
|
# @since 0.2.2
|
66
70
|
#
|
67
|
-
def []=(url,
|
71
|
+
def []=(url,auth)
|
68
72
|
# normalize the url
|
69
|
-
url = URI(url) unless url.kind_of?(URI)
|
73
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
70
74
|
|
71
75
|
# normalize the URL path
|
72
76
|
path = URI.expand_path(url.path)
|
@@ -96,19 +100,19 @@ module Spidr
|
|
96
100
|
#
|
97
101
|
# @since 0.2.2
|
98
102
|
#
|
99
|
-
def add(url,
|
100
|
-
self[url] = AuthCredential.new(username,
|
103
|
+
def add(url,username,password)
|
104
|
+
self[url] = AuthCredential.new(username,password)
|
101
105
|
end
|
102
106
|
|
103
107
|
#
|
104
108
|
# Returns the base64 encoded authorization string for the URL
|
105
|
-
# or
|
109
|
+
# or `nil` if no authorization exists.
|
106
110
|
#
|
107
111
|
# @param [URI] url
|
108
112
|
# The url.
|
109
113
|
#
|
110
114
|
# @return [String, nil]
|
111
|
-
# The base64 encoded authorizatio string or
|
115
|
+
# The base64 encoded authorization string or `nil`.
|
112
116
|
#
|
113
117
|
# @since 0.2.2
|
114
118
|
#
|
data/lib/spidr/cookie_jar.rb
CHANGED
@@ -3,6 +3,9 @@ require 'spidr/page'
|
|
3
3
|
require 'set'
|
4
4
|
|
5
5
|
module Spidr
|
6
|
+
#
|
7
|
+
# Stores HTTP Cookies organized by host-name.
|
8
|
+
#
|
6
9
|
class CookieJar
|
7
10
|
|
8
11
|
include Enumerable
|
@@ -47,7 +50,7 @@ module Spidr
|
|
47
50
|
# Host or domain name for cookies.
|
48
51
|
#
|
49
52
|
# @return [String, nil]
|
50
|
-
# The cookie values or
|
53
|
+
# The cookie values or `nil` if the host does not have a cookie in the
|
51
54
|
# jar.
|
52
55
|
#
|
53
56
|
# @since 0.2.2
|
data/lib/spidr/events.rb
CHANGED
@@ -1,4 +1,9 @@
|
|
1
1
|
module Spidr
|
2
|
+
#
|
3
|
+
# The {Events} module adds methods to {Agent} for registering
|
4
|
+
# callbacks which will receive URLs, links, headers and pages, when
|
5
|
+
# they are visited.
|
6
|
+
#
|
2
7
|
module Events
|
3
8
|
def initialize(options={})
|
4
9
|
super(options)
|
@@ -8,6 +13,7 @@ module Spidr
|
|
8
13
|
@urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
9
14
|
|
10
15
|
@every_page_blocks = []
|
16
|
+
@every_link_blocks = []
|
11
17
|
end
|
12
18
|
|
13
19
|
#
|
@@ -499,5 +505,24 @@ module Spidr
|
|
499
505
|
block.call(page) if (block && page.zip?)
|
500
506
|
end
|
501
507
|
end
|
508
|
+
|
509
|
+
#
|
510
|
+
# Passes every origin and destination URI of each link to a given
|
511
|
+
# block.
|
512
|
+
#
|
513
|
+
# @yield [origin,dest]
|
514
|
+
# The block will be passed every origin and destination URI of
|
515
|
+
# each link.
|
516
|
+
#
|
517
|
+
# @yieldparam [URI::HTTP] origin
|
518
|
+
# The URI that a link originated from.
|
519
|
+
#
|
520
|
+
# @yieldparam [URI::HTTP] dest
|
521
|
+
# The destination URI of a link.
|
522
|
+
#
|
523
|
+
def every_link(&block)
|
524
|
+
@every_link_blocks << block
|
525
|
+
return self
|
526
|
+
end
|
502
527
|
end
|
503
528
|
end
|
data/lib/spidr/filters.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
require 'spidr/rules'
|
2
2
|
|
3
3
|
module Spidr
|
4
|
+
#
|
5
|
+
# The {Filters} module adds methods to {Agent} for controlling which
|
6
|
+
# URLs the agent will visit.
|
7
|
+
#
|
4
8
|
module Filters
|
5
9
|
def self.included(base)
|
6
10
|
base.module_eval do
|
@@ -17,7 +21,7 @@ module Spidr
|
|
17
21
|
#
|
18
22
|
# @option options [Array] :schemes (['http', 'https'])
|
19
23
|
# The list of acceptable URI schemes to visit.
|
20
|
-
# The
|
24
|
+
# The `https` scheme will be ignored if `net/https` cannot be loaded.
|
21
25
|
#
|
22
26
|
# @option options [String] :host
|
23
27
|
# The host-name to visit.
|
data/lib/spidr/page.rb
CHANGED
@@ -5,6 +5,9 @@ require 'uri'
|
|
5
5
|
require 'nokogiri'
|
6
6
|
|
7
7
|
module Spidr
|
8
|
+
#
|
9
|
+
# Represents a requested page from a website.
|
10
|
+
#
|
8
11
|
class Page
|
9
12
|
|
10
13
|
# Reserved names used within Cookie strings
|
@@ -46,10 +49,10 @@ module Spidr
|
|
46
49
|
end
|
47
50
|
|
48
51
|
#
|
49
|
-
# Determines if the response code is
|
52
|
+
# Determines if the response code is `200`.
|
50
53
|
#
|
51
54
|
# @return [Boolean]
|
52
|
-
# Specifies whether the response code is
|
55
|
+
# Specifies whether the response code is `200`.
|
53
56
|
#
|
54
57
|
def is_ok?
|
55
58
|
code == 200
|
@@ -58,10 +61,10 @@ module Spidr
|
|
58
61
|
alias ok? is_ok?
|
59
62
|
|
60
63
|
#
|
61
|
-
# Determines if the response code is
|
64
|
+
# Determines if the response code is `301` or `307`.
|
62
65
|
#
|
63
66
|
# @return [Boolean]
|
64
|
-
# Specifies whether the response code is
|
67
|
+
# Specifies whether the response code is `301` or `307`.
|
65
68
|
#
|
66
69
|
def is_redirect?
|
67
70
|
(code == 301 || code == 307)
|
@@ -70,30 +73,30 @@ module Spidr
|
|
70
73
|
alias redirect? is_redirect?
|
71
74
|
|
72
75
|
#
|
73
|
-
# Determines if the response code is
|
76
|
+
# Determines if the response code is `308`.
|
74
77
|
#
|
75
78
|
# @return [Boolean]
|
76
|
-
# Specifies whether the response code is
|
79
|
+
# Specifies whether the response code is `308`.
|
77
80
|
#
|
78
81
|
def timedout?
|
79
82
|
code == 308
|
80
83
|
end
|
81
84
|
|
82
85
|
#
|
83
|
-
# Determines if the response code is
|
86
|
+
# Determines if the response code is `400`.
|
84
87
|
#
|
85
88
|
# @return [Boolean]
|
86
|
-
# Specifies whether the response code is
|
89
|
+
# Specifies whether the response code is `400`.
|
87
90
|
#
|
88
91
|
def bad_request?
|
89
92
|
code == 400
|
90
93
|
end
|
91
94
|
|
92
95
|
#
|
93
|
-
# Determines if the response code is
|
96
|
+
# Determines if the response code is `401`.
|
94
97
|
#
|
95
98
|
# @return [Boolean]
|
96
|
-
# Specifies whether the response code is
|
99
|
+
# Specifies whether the response code is `401`.
|
97
100
|
#
|
98
101
|
def is_unauthorized?
|
99
102
|
code == 401
|
@@ -102,10 +105,10 @@ module Spidr
|
|
102
105
|
alias unauthorized? is_unauthorized?
|
103
106
|
|
104
107
|
#
|
105
|
-
# Determines if the response code is
|
108
|
+
# Determines if the response code is `403`.
|
106
109
|
#
|
107
110
|
# @return [Boolean]
|
108
|
-
# Specifies whether the response code is
|
111
|
+
# Specifies whether the response code is `403`.
|
109
112
|
#
|
110
113
|
def is_forbidden?
|
111
114
|
code == 403
|
@@ -114,10 +117,10 @@ module Spidr
|
|
114
117
|
alias forbidden? is_forbidden?
|
115
118
|
|
116
119
|
#
|
117
|
-
# Determines if the response code is
|
120
|
+
# Determines if the response code is `404`.
|
118
121
|
#
|
119
122
|
# @return [Boolean]
|
120
|
-
# Specifies whether the response code is
|
123
|
+
# Specifies whether the response code is `404`.
|
121
124
|
#
|
122
125
|
def is_missing?
|
123
126
|
code == 404
|
@@ -126,10 +129,10 @@ module Spidr
|
|
126
129
|
alias missing? is_missing?
|
127
130
|
|
128
131
|
#
|
129
|
-
# Determines if the response code is
|
132
|
+
# Determines if the response code is `500`.
|
130
133
|
#
|
131
134
|
# @return [Boolean]
|
132
|
-
# Specifies whether the response code is
|
135
|
+
# Specifies whether the response code is `500`.
|
133
136
|
#
|
134
137
|
def had_internal_server_error?
|
135
138
|
code == 500
|
@@ -306,12 +309,14 @@ module Spidr
|
|
306
309
|
def cookie_params
|
307
310
|
params = {}
|
308
311
|
|
309
|
-
cookies.each do |
|
310
|
-
|
312
|
+
cookies.each do |cookie|
|
313
|
+
cookie.split('; ').each do |key_value|
|
314
|
+
key, value = key_value.split('=',2)
|
311
315
|
|
312
|
-
|
316
|
+
next if RESERVED_COOKIE_NAMES.include?(key)
|
313
317
|
|
314
|
-
|
318
|
+
params[key] = (value || '')
|
319
|
+
end
|
315
320
|
end
|
316
321
|
|
317
322
|
return params
|
@@ -332,7 +337,7 @@ module Spidr
|
|
332
337
|
#
|
333
338
|
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
|
334
339
|
# The document that represents HTML or XML pages.
|
335
|
-
# Returns
|
340
|
+
# Returns `nil` if the page is not HTML, XML, RSS or Atom, or if
|
336
341
|
# the page could not be parsed properly.
|
337
342
|
#
|
338
343
|
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
@@ -380,7 +385,7 @@ module Spidr
|
|
380
385
|
# Searches for the first occurrence of an XPath or CSS Path expression.
|
381
386
|
#
|
382
387
|
# @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
|
383
|
-
# The first matched node. Returns
|
388
|
+
# The first matched node. Returns `nil` if no nodes could be matched,
|
384
389
|
# or if the page is not a HTML or XML document.
|
385
390
|
#
|
386
391
|
# @example
|
@@ -416,7 +421,7 @@ module Spidr
|
|
416
421
|
#
|
417
422
|
# @return [Array<String>]
|
418
423
|
# All links within the HTML page, frame/iframe source URLs and any
|
419
|
-
# links in the
|
424
|
+
# links in the `Location` header.
|
420
425
|
#
|
421
426
|
def links
|
422
427
|
urls = []
|
@@ -502,7 +507,7 @@ module Spidr
|
|
502
507
|
protected
|
503
508
|
|
504
509
|
#
|
505
|
-
# Provides transparent access to the values in
|
510
|
+
# Provides transparent access to the values in `headers`.
|
506
511
|
#
|
507
512
|
def method_missing(sym,*args,&block)
|
508
513
|
if (args.empty? && block.nil?)
|