spidr 0.1.9 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +43 -0
- data/Manifest.txt +19 -0
- data/README.txt +100 -11
- data/Rakefile +15 -5
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/actions/actions.rb +79 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +6 -0
- data/lib/spidr/actions/exceptions/paused.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +8 -0
- data/lib/spidr/agent.rb +385 -444
- data/lib/spidr/events.rb +87 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/extensions/uri.rb +45 -0
- data/lib/spidr/filters.rb +438 -0
- data/lib/spidr/page.rb +211 -70
- data/lib/spidr/rules.rb +40 -18
- data/lib/spidr/spidr.rb +57 -7
- data/lib/spidr/version.rb +2 -1
- data/spec/actions_spec.rb +61 -0
- data/spec/agent_spec.rb +24 -31
- data/spec/extensions/uri_spec.rb +39 -0
- data/spec/filters_spec.rb +53 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/page_examples.rb +17 -0
- data/spec/page_spec.rb +81 -0
- data/spec/rules_spec.rb +43 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/spidr_spec.rb +30 -0
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +8 -1
- data/tasks/spec.rb +1 -0
- data/tasks/yard.rb +12 -0
- metadata +45 -6
- metadata.gz.sig +0 -0
data.tar.gz.sig
CHANGED
Binary file
|
data/History.txt
CHANGED
@@ -1,3 +1,46 @@
|
|
1
|
+
=== 0.2.0 / 2009-10-10
|
2
|
+
|
3
|
+
* Added URI.expand_path.
|
4
|
+
* Added Spidr::Page#search.
|
5
|
+
* Added Spidr::Page#at.
|
6
|
+
* Added Spidr::Page#title.
|
7
|
+
* Added Spidr::Agent#failures=.
|
8
|
+
* Added an HTTP session cache to Spidr::Agent, per suggestion of falter.
|
9
|
+
* Added Spidr::Agent#get_session.
|
10
|
+
* Added Spidr::Agent#kill_session.
|
11
|
+
* Added Spidr.proxy=.
|
12
|
+
* Added Spidr.disable_proxy!.
|
13
|
+
* Aliased Spidr::Page#txt? to Spidr::Page#plain_text?.
|
14
|
+
* Aliased Spidr::Page#ok? to Spidr::Page#is_ok?.
|
15
|
+
* Aliased Spidr::Page#redirect? to Spidr::Page#is_redirect?.
|
16
|
+
* Aliased Spidr::Page#unauthorized? to Spidr::Page#is_unauthorized?.
|
17
|
+
* Aliased Spidr::Page#forbidden? to Spidr::Page#is_forbidden?.
|
18
|
+
* Aliased Spidr::Page#missing? to Spidr::Page#is_missing?.
|
19
|
+
* Split URL filtering code out of Spidr::Agent and into Spidr::Filtering.
|
20
|
+
* Split URL / Page event code out of Spidr::Agent and into Spidr::Events.
|
21
|
+
* Split pause! / continue! / skip_link! / skip_page! methods out of
|
22
|
+
Spidr::Agent and into Spidr::Actions.
|
23
|
+
* Fixed a bug in Spidr::Page#code, where it was not returning an Integer.
|
24
|
+
* Make sure Spidr::Page#doc returns Nokogiri::XML::Document objects for
|
25
|
+
RSS/RDF/Atom pages as well.
|
26
|
+
* Fixed the handling of the Location header in Spidr::Page#links
|
27
|
+
(thanks falter).
|
28
|
+
* Fixed a bug in Spidr::Page#to_absolute where trailing '/' characters on
|
29
|
+
URI paths were not being preserved (thanks falter).
|
30
|
+
* Fixed a bug where the URI query was not being sent with the request
|
31
|
+
in Spidr::Agent#get_page (thanks Damian Steer).
|
32
|
+
* Fixed a bug where SSL sessions were not being properly set up
|
33
|
+
(thanks falter).
|
34
|
+
* Switched Spidr::Agent#history to be a Set, to improve search-time
|
35
|
+
of the history (thanks falter).
|
36
|
+
* Switched Spidr::Agent#failures to a Set.
|
37
|
+
* Allow a block to be passed to Spidr::Agent#run, which will receive all
|
38
|
+
pages visited.
|
39
|
+
* Allow Spidr::Agent#start_at and Spidr::Agent#continue! to pass blocks to
|
40
|
+
Spidr::Agent#run.
|
41
|
+
* Made Spidr::Agent#visit_page public.
|
42
|
+
* Moved to YARD based documentation.
|
43
|
+
|
1
44
|
=== 0.1.9 / 2009-06-13
|
2
45
|
|
3
46
|
* Upgraded to Hoe 2.0.0.
|
data/Manifest.txt
CHANGED
@@ -3,15 +3,34 @@ Manifest.txt
|
|
3
3
|
README.txt
|
4
4
|
Rakefile
|
5
5
|
lib/spidr.rb
|
6
|
+
lib/spidr/extensions.rb
|
7
|
+
lib/spidr/extensions/uri.rb
|
6
8
|
lib/spidr/page.rb
|
7
9
|
lib/spidr/rules.rb
|
10
|
+
lib/spidr/filters.rb
|
11
|
+
lib/spidr/events.rb
|
12
|
+
lib/spidr/actions.rb
|
13
|
+
lib/spidr/actions/exceptions.rb
|
14
|
+
lib/spidr/actions/exceptions/action.rb
|
15
|
+
lib/spidr/actions/exceptions/paused.rb
|
16
|
+
lib/spidr/actions/exceptions/skip_link.rb
|
17
|
+
lib/spidr/actions/exceptions/skip_page.rb
|
18
|
+
lib/spidr/actions/actions.rb
|
8
19
|
lib/spidr/agent.rb
|
9
20
|
lib/spidr/spidr.rb
|
10
21
|
lib/spidr/version.rb
|
11
22
|
tasks/spec.rb
|
23
|
+
tasks/yard.rb
|
12
24
|
tasks/course.rb
|
13
25
|
spec/spec_helper.rb
|
14
26
|
spec/helpers/course.rb
|
27
|
+
spec/helpers/page.rb
|
28
|
+
spec/extensions/uri_spec.rb
|
29
|
+
spec/page_examples.rb
|
30
|
+
spec/page_spec.rb
|
31
|
+
spec/rules_spec.rb
|
32
|
+
spec/filters_spec.rb
|
33
|
+
spec/actions_spec.rb
|
15
34
|
spec/agent_spec.rb
|
16
35
|
spec/spidr_spec.rb
|
17
36
|
static/course/index.html
|
data/README.txt
CHANGED
@@ -28,19 +28,14 @@ and easy to use.
|
|
28
28
|
* Every visited URL.
|
29
29
|
* Every visited URL that matches a specified pattern.
|
30
30
|
* Every URL that failed to be visited.
|
31
|
-
*
|
31
|
+
* Provides action methods to:
|
32
|
+
* Pause spidering.
|
33
|
+
* Skip processing of pages.
|
34
|
+
* Skip processing of links.
|
32
35
|
* Restore the spidering queue and history from a previous session.
|
33
36
|
* Custom User-Agent strings.
|
34
37
|
* Custom proxy settings.
|
35
38
|
|
36
|
-
== REQUIREMENTS:
|
37
|
-
|
38
|
-
* {nokogiri}[http://nokogiri.rubyforge.org/]
|
39
|
-
|
40
|
-
== INSTALL:
|
41
|
-
|
42
|
-
$ sudo gem install spidr
|
43
|
-
|
44
39
|
== EXAMPLES:
|
45
40
|
|
46
41
|
* Start spidering from a URL:
|
@@ -49,11 +44,32 @@ and easy to use.
|
|
49
44
|
|
50
45
|
* Spider a host:
|
51
46
|
|
52
|
-
Spidr.host('
|
47
|
+
Spidr.host('coderrr.wordpress.com')
|
53
48
|
|
54
49
|
* Spider a site:
|
55
50
|
|
56
|
-
Spidr.site('http://
|
51
|
+
Spidr.site('http://rubyflow.com/')
|
52
|
+
|
53
|
+
* Spider multiple hosts:
|
54
|
+
|
55
|
+
Spidr.start_at(
|
56
|
+
'http://company.com/',
|
57
|
+
:hosts => [
|
58
|
+
'company.com',
|
59
|
+
/host\d\.company\.com/
|
60
|
+
]
|
61
|
+
)
|
62
|
+
|
63
|
+
* Do not spider certain links:
|
64
|
+
|
65
|
+
Spidr.site('http://matasano.com/', :ignore_links => [/log/])
|
66
|
+
|
67
|
+
* Do not spider links on certain ports:
|
68
|
+
|
69
|
+
Spidr.site(
|
70
|
+
'http://sketchy.content.com/',
|
71
|
+
:ignore_ports => [8000, 8010, 8080]
|
72
|
+
)
|
57
73
|
|
58
74
|
* Print out visited URLs:
|
59
75
|
|
@@ -61,6 +77,79 @@ and easy to use.
|
|
61
77
|
spider.every_url { |url| puts url }
|
62
78
|
end
|
63
79
|
|
80
|
+
* Print out the URLs that could not be requested:
|
81
|
+
|
82
|
+
Spidr.site('http://sketchy.content.com/') do |spider|
|
83
|
+
spider.every_failed_url { |url| puts url }
|
84
|
+
end
|
85
|
+
|
86
|
+
* Search HTML and XML pages:
|
87
|
+
|
88
|
+
Spidr.site('http://company.withablog.com/') do |spider|
|
89
|
+
spider.every_page do |page|
|
90
|
+
puts "[-] #{page.url}"
|
91
|
+
|
92
|
+
page.search('//meta').each do |meta|
|
93
|
+
name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
94
|
+
value = meta.attributes['content']
|
95
|
+
|
96
|
+
puts " #{name} = #{value}"
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
* Print out the titles from every page:
|
102
|
+
|
103
|
+
Spidr.site('http://www.rubypulse.com/') do |spider|
|
104
|
+
spider.every_page do |page|
|
105
|
+
puts page.title if page.html?
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
* Find what kinds of web servers a host is using, by accessing the headers:
|
110
|
+
|
111
|
+
servers = Set[]
|
112
|
+
|
113
|
+
Spidr.host('generic.company.com') do |spider|
|
114
|
+
spider.all_headers do |headers|
|
115
|
+
servers << headers['server']
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
* Pause the spider on a forbidden page:
|
120
|
+
|
121
|
+
spider = Spidr.host('overnight.startup.com') do |spider|
|
122
|
+
spider.every_page do |page|
|
123
|
+
spider.pause! if page.forbidden?
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
* Skip the processing of a page:
|
128
|
+
|
129
|
+
Spidr.host('sketchy.content.com') do |spider|
|
130
|
+
spider.every_page do |page|
|
131
|
+
spider.skip_page! if page.not_found?
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
* Skip the processing of links:
|
136
|
+
|
137
|
+
Spidr.host('sketchy.content.com') do |spider|
|
138
|
+
spider.every_url do |url|
|
139
|
+
if url.path.split('/').find { |dir| dir.to_i > 1000 }
|
140
|
+
spider.skip_link!
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
== REQUIREMENTS:
|
146
|
+
|
147
|
+
* {nokogiri}[http://nokogiri.rubyforge.org/] >= 1.2.0
|
148
|
+
|
149
|
+
== INSTALL:
|
150
|
+
|
151
|
+
$ sudo gem install spidr
|
152
|
+
|
64
153
|
== LICENSE:
|
65
154
|
|
66
155
|
The MIT License
|
data/Rakefile
CHANGED
@@ -4,14 +4,24 @@ require 'rubygems'
|
|
4
4
|
require 'hoe'
|
5
5
|
require 'hoe/signing'
|
6
6
|
require './tasks/spec.rb'
|
7
|
+
require './tasks/yard.rb'
|
7
8
|
require './tasks/course.rb'
|
8
9
|
require './lib/spidr/version.rb'
|
9
10
|
|
10
|
-
Hoe.spec('spidr') do
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
11
|
+
Hoe.spec('spidr') do
|
12
|
+
self.rubyforge_name = 'spidr'
|
13
|
+
self.developer('Postmodern', 'postmodern.mod3@gmail.com')
|
14
|
+
self.remote_rdoc_dir = 'docs'
|
15
|
+
self.extra_deps = [
|
16
|
+
['nokogiri', '>=1.2.0']
|
17
|
+
]
|
18
|
+
|
19
|
+
self.extra_dev_deps = [
|
20
|
+
['rspec', '>=1.2.8'],
|
21
|
+
['yard', '>=0.2.3.5']
|
22
|
+
]
|
23
|
+
|
24
|
+
self.spec_extras = {:has_rdoc => 'yard'}
|
15
25
|
end
|
16
26
|
|
17
27
|
# vim: syntax=Ruby
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'spidr/actions/exceptions/paused'
|
2
|
+
require 'spidr/actions/exceptions/skip_link'
|
3
|
+
require 'spidr/actions/exceptions/skip_page'
|
4
|
+
|
5
|
+
module Spidr
|
6
|
+
module Actions
|
7
|
+
def initialize(options={})
|
8
|
+
@paused = false
|
9
|
+
|
10
|
+
super(options)
|
11
|
+
end
|
12
|
+
|
13
|
+
#
|
14
|
+
# Continue spidering.
|
15
|
+
#
|
16
|
+
# @yield [page]
|
17
|
+
# If a block is given, it will be passed every page visited.
|
18
|
+
#
|
19
|
+
# @yieldparam [Page] page
|
20
|
+
# The page to be visited.
|
21
|
+
#
|
22
|
+
def continue!(&block)
|
23
|
+
@paused = false
|
24
|
+
return run(&block)
|
25
|
+
end
|
26
|
+
|
27
|
+
#
|
28
|
+
# Sets the pause state of the agent.
|
29
|
+
#
|
30
|
+
# @param [Boolean] state
|
31
|
+
# The new pause state of the agent.
|
32
|
+
#
|
33
|
+
def pause=(state)
|
34
|
+
@paused = state
|
35
|
+
end
|
36
|
+
|
37
|
+
#
|
38
|
+
# Pauses the agent, causing spidering to temporarily stop.
|
39
|
+
#
|
40
|
+
# @raise [Paused]
|
41
|
+
# Indicates to the agent, that it should pause spidering.
|
42
|
+
#
|
43
|
+
def pause!
|
44
|
+
@paused = true
|
45
|
+
raise(Paused)
|
46
|
+
end
|
47
|
+
|
48
|
+
#
|
49
|
+
# Determines whether the agent is paused.
|
50
|
+
#
|
51
|
+
# @return [Boolean]
|
52
|
+
# Specifies whether the agent is paused.
|
53
|
+
#
|
54
|
+
def paused?
|
55
|
+
@paused == true
|
56
|
+
end
|
57
|
+
|
58
|
+
#
|
59
|
+
# Causes the agent to skip the link being enqueued.
|
60
|
+
#
|
61
|
+
# @raise [SkipLink]
|
62
|
+
# Indicates to the agent, that the current link should be skipped,
|
63
|
+
# and not enqueued or visited.
|
64
|
+
#
|
65
|
+
def skip_link!
|
66
|
+
raise(SkipLink)
|
67
|
+
end
|
68
|
+
|
69
|
+
#
|
70
|
+
# Causes the agent to skip the page being visited.
|
71
|
+
#
|
72
|
+
# @raise [SkipPage]
|
73
|
+
# Indicates to the agent, that the current page should be skipped.
|
74
|
+
#
|
75
|
+
def skip_page!
|
76
|
+
raise(SkipPage)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
data/lib/spidr/agent.rb
CHANGED
@@ -1,12 +1,19 @@
|
|
1
|
-
require 'spidr/
|
1
|
+
require 'spidr/filters'
|
2
|
+
require 'spidr/events'
|
3
|
+
require 'spidr/actions'
|
2
4
|
require 'spidr/page'
|
3
5
|
require 'spidr/spidr'
|
4
6
|
|
5
7
|
require 'net/http'
|
8
|
+
require 'set'
|
6
9
|
|
7
10
|
module Spidr
|
8
11
|
class Agent
|
9
12
|
|
13
|
+
include Filters
|
14
|
+
include Events
|
15
|
+
include Actions
|
16
|
+
|
10
17
|
# Proxy to use
|
11
18
|
attr_accessor :proxy
|
12
19
|
|
@@ -19,9 +26,6 @@ module Spidr
|
|
19
26
|
# Delay in between fetching pages
|
20
27
|
attr_accessor :delay
|
21
28
|
|
22
|
-
# List of acceptable URL schemes to follow
|
23
|
-
attr_reader :schemes
|
24
|
-
|
25
29
|
# History containing visited URLs
|
26
30
|
attr_reader :history
|
27
31
|
|
@@ -32,105 +36,81 @@ module Spidr
|
|
32
36
|
attr_reader :queue
|
33
37
|
|
34
38
|
#
|
35
|
-
# Creates a new Agent object
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
54
|
-
#
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
60
|
-
#
|
39
|
+
# Creates a new Agent object.
|
40
|
+
#
|
41
|
+
# @param [Hash] options
|
42
|
+
# Additional options
|
43
|
+
#
|
44
|
+
# @option options [Hash] :proxy (Spidr.proxy)
|
45
|
+
# The proxy information to use.
|
46
|
+
#
|
47
|
+
# @option :proxy [String] :host
|
48
|
+
# The host the proxy is running on.
|
49
|
+
#
|
50
|
+
# @option :proxy [Integer] :port
|
51
|
+
# The port the proxy is running on.
|
52
|
+
#
|
53
|
+
# @option :proxy [String] :user
|
54
|
+
# The user to authenticate as with the proxy.
|
55
|
+
#
|
56
|
+
# @option :proxy [String] :password
|
57
|
+
# The password to authenticate with.
|
58
|
+
#
|
59
|
+
# @option options [String] :user_agent (Spidr.user_agent)
|
60
|
+
# The User-Agent string to send with each request.
|
61
|
+
#
|
62
|
+
# @option options [String] :referer
|
63
|
+
# The Referer URL to send with each request.
|
64
|
+
#
|
65
|
+
# @option options [Integer] :delay (0)
|
66
|
+
# The number of seconds to pause between each request.
|
67
|
+
#
|
68
|
+
# @option options [Set, Array] :queue
|
69
|
+
# The initial queue of URLs to visit.
|
70
|
+
#
|
71
|
+
# @option options [Set, Array] :history
|
72
|
+
# The initial list of visited URLs.
|
73
|
+
#
|
74
|
+
# @yield [agent]
|
75
|
+
# If a block is given, it will be passed the newly created agent
|
76
|
+
# for further configuration.
|
77
|
+
#
|
78
|
+
# @yieldparam [Agent] agent
|
79
|
+
# The newly created agent.
|
61
80
|
#
|
62
81
|
def initialize(options={},&block)
|
63
82
|
@proxy = (options[:proxy] || Spidr.proxy)
|
64
83
|
@user_agent = (options[:user_agent] || Spidr.user_agent)
|
65
84
|
@referer = options[:referer]
|
66
85
|
|
67
|
-
@
|
68
|
-
|
69
|
-
if options[:schemes]
|
70
|
-
@schemes += options[:schemes]
|
71
|
-
else
|
72
|
-
@schemes << 'http'
|
73
|
-
|
74
|
-
begin
|
75
|
-
require 'net/https'
|
76
|
-
|
77
|
-
@schemes << 'https'
|
78
|
-
rescue Gem::LoadError => e
|
79
|
-
raise(e)
|
80
|
-
rescue ::LoadError
|
81
|
-
STDERR.puts "Warning: cannot load 'net/https', https support disabled"
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
@host_rules = Rules.new(
|
86
|
-
:accept => options[:hosts],
|
87
|
-
:reject => options[:ignore_hosts]
|
88
|
-
)
|
89
|
-
@port_rules = Rules.new(
|
90
|
-
:accept => options[:ports],
|
91
|
-
:reject => options[:ignore_ports]
|
92
|
-
)
|
93
|
-
@link_rules = Rules.new(
|
94
|
-
:accept => options[:links],
|
95
|
-
:reject => options[:ignore_links]
|
96
|
-
)
|
97
|
-
@ext_rules = Rules.new(
|
98
|
-
:accept => options[:exts],
|
99
|
-
:reject => options[:ignore_exts]
|
100
|
-
)
|
101
|
-
|
102
|
-
@every_url_blocks = []
|
103
|
-
@every_failed_url_blocks = []
|
104
|
-
@urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
105
|
-
|
106
|
-
@every_page_blocks = []
|
107
|
-
|
86
|
+
@running = false
|
108
87
|
@delay = (options[:delay] || 0)
|
109
|
-
@history = []
|
110
|
-
@failures = []
|
88
|
+
@history = Set[]
|
89
|
+
@failures = Set[]
|
111
90
|
@queue = []
|
112
|
-
@paused = true
|
113
91
|
|
114
|
-
|
115
|
-
visit_hosts_like(options[:host])
|
116
|
-
end
|
92
|
+
@sessions = {}
|
117
93
|
|
118
|
-
|
119
|
-
self.queue = options[:queue]
|
120
|
-
end
|
121
|
-
|
122
|
-
if options[:history]
|
123
|
-
self.history = options[:history]
|
124
|
-
end
|
94
|
+
super(options)
|
125
95
|
|
126
96
|
block.call(self) if block
|
127
97
|
end
|
128
98
|
|
129
99
|
#
|
130
|
-
# Creates a new
|
131
|
-
#
|
132
|
-
#
|
133
|
-
# spidering.
|
100
|
+
# Creates a new agent and begins spidering at the given URL.
|
101
|
+
#
|
102
|
+
# @param [URI::HTTP, String] url
|
103
|
+
# The URL to start spidering at.
|
104
|
+
#
|
105
|
+
# @param [Hash] options
|
106
|
+
# Additional options. See {Agent#initialize}.
|
107
|
+
#
|
108
|
+
# @yield [agent]
|
109
|
+
# If a block is given, it will be passed the newly created agent
|
110
|
+
# before it begins spidering.
|
111
|
+
#
|
112
|
+
# @yieldparam [Agent] agent
|
113
|
+
# The newly created agent.
|
134
114
|
#
|
135
115
|
def self.start_at(url,options={},&block)
|
136
116
|
self.new(options) do |spider|
|
@@ -141,10 +121,20 @@ module Spidr
|
|
141
121
|
end
|
142
122
|
|
143
123
|
#
|
144
|
-
# Creates a new
|
145
|
-
#
|
146
|
-
#
|
147
|
-
#
|
124
|
+
# Creates a new agent and spiders the given host.
|
125
|
+
#
|
126
|
+
# @param [String] name
|
127
|
+
# The host-name to spider.
|
128
|
+
#
|
129
|
+
# @param [Hash] options
|
130
|
+
# Additional options. See {Agent#initialize}.
|
131
|
+
#
|
132
|
+
# @yield [agent]
|
133
|
+
# If a block is given, it will be passed the newly created agent
|
134
|
+
# before it begins spidering.
|
135
|
+
#
|
136
|
+
# @yieldparam [Agent] agent
|
137
|
+
# The newly created agent.
|
148
138
|
#
|
149
139
|
def self.host(name,options={},&block)
|
150
140
|
self.new(options.merge(:host => name)) do |spider|
|
@@ -155,10 +145,20 @@ module Spidr
|
|
155
145
|
end
|
156
146
|
|
157
147
|
#
|
158
|
-
# Creates a new
|
159
|
-
#
|
160
|
-
#
|
161
|
-
#
|
148
|
+
# Creates a new agent and spiders the web-site located at the given URL.
|
149
|
+
#
|
150
|
+
# @param [URI::HTTP, String] url
|
151
|
+
# The web-site to spider.
|
152
|
+
#
|
153
|
+
# @param [Hash] options
|
154
|
+
# Additional options. See {Agent#initialize}.
|
155
|
+
#
|
156
|
+
# @yield [agent]
|
157
|
+
# If a block is given, it will be passed the newly created agent
|
158
|
+
# before it begins spidering.
|
159
|
+
#
|
160
|
+
# @yieldparam [Agent] agent
|
161
|
+
# The newly created agent.
|
162
162
|
#
|
163
163
|
def self.site(url,options={},&block)
|
164
164
|
url = URI(url.to_s)
|
@@ -171,348 +171,280 @@ module Spidr
|
|
171
171
|
end
|
172
172
|
|
173
173
|
#
|
174
|
-
#
|
175
|
-
#
|
176
|
-
def visit_hosts
|
177
|
-
@host_rules.accept
|
178
|
-
end
|
179
|
-
|
180
|
-
#
|
181
|
-
# Adds the given _pattern_ to the visit_hosts. If a _block_ is given,
|
182
|
-
# it will be added to the visit_hosts.
|
174
|
+
# Clears the history of the agent.
|
183
175
|
#
|
184
|
-
def
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
visit_hosts << block
|
189
|
-
end
|
190
|
-
|
176
|
+
def clear
|
177
|
+
@queue.clear
|
178
|
+
@history.clear
|
179
|
+
@failures.clear
|
191
180
|
return self
|
192
181
|
end
|
193
182
|
|
194
183
|
#
|
195
|
-
#
|
184
|
+
# Start spidering at a given URL.
|
196
185
|
#
|
197
|
-
|
198
|
-
|
199
|
-
end
|
200
|
-
|
186
|
+
# @param [URI::HTTP, String] url
|
187
|
+
# The URL to start spidering at.
|
201
188
|
#
|
202
|
-
#
|
203
|
-
# it will be
|
189
|
+
# @yield [page]
|
190
|
+
# If a block is given, it will be passed every page visited.
|
204
191
|
#
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
end
|
192
|
+
# @yieldparam [Page] page
|
193
|
+
# A page which has been visited.
|
194
|
+
#
|
195
|
+
def start_at(url,&block)
|
196
|
+
enqueue(url)
|
211
197
|
|
212
|
-
return
|
198
|
+
return run(&block)
|
213
199
|
end
|
214
200
|
|
215
201
|
#
|
216
|
-
#
|
202
|
+
# Start spidering until the queue becomes empty or the agent is
|
203
|
+
# paused.
|
217
204
|
#
|
218
|
-
|
219
|
-
|
220
|
-
end
|
221
|
-
|
205
|
+
# @yield [page]
|
206
|
+
# If a block is given, it will be passed every page visited.
|
222
207
|
#
|
223
|
-
#
|
224
|
-
#
|
208
|
+
# @yieldparam [Page] page
|
209
|
+
# A page which has been visited.
|
225
210
|
#
|
226
|
-
def
|
227
|
-
|
228
|
-
visit_ports << pattern
|
229
|
-
elsif block
|
230
|
-
visit_ports << block
|
231
|
-
end
|
211
|
+
def run(&block)
|
212
|
+
@running = true
|
232
213
|
|
233
|
-
|
234
|
-
|
214
|
+
until (@queue.empty? || paused?)
|
215
|
+
begin
|
216
|
+
visit_page(dequeue,&block)
|
217
|
+
rescue Actions::Paused
|
218
|
+
return self
|
219
|
+
rescue Actions::Action
|
220
|
+
end
|
221
|
+
end
|
235
222
|
|
236
|
-
|
237
|
-
# Returns the +Array+ of URL port patterns to not visit.
|
238
|
-
#
|
239
|
-
def ignore_ports
|
240
|
-
@port_rules.reject
|
241
|
-
end
|
223
|
+
@running = false
|
242
224
|
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
ignore_ports << pattern
|
250
|
-
elsif block
|
251
|
-
ignore_ports << block
|
225
|
+
@sessions.each_value do |sess|
|
226
|
+
begin
|
227
|
+
sess.finish
|
228
|
+
rescue IOError
|
229
|
+
nil
|
230
|
+
end
|
252
231
|
end
|
253
232
|
|
233
|
+
@sessions.clear
|
254
234
|
return self
|
255
235
|
end
|
256
236
|
|
257
237
|
#
|
258
|
-
#
|
238
|
+
# Determines if the agent is running.
|
259
239
|
#
|
260
|
-
|
261
|
-
|
240
|
+
# @return [Boolean]
|
241
|
+
# Specifies whether the agent is running or stopped.
|
242
|
+
#
|
243
|
+
def running?
|
244
|
+
@running == true
|
262
245
|
end
|
263
246
|
|
264
247
|
#
|
265
|
-
#
|
266
|
-
# it will be added to the visit_links.
|
248
|
+
# Sets the history of URLs that were previously visited.
|
267
249
|
#
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
250
|
+
# @param [#each] new_history
|
251
|
+
# A list of URLs to populate the history with.
|
252
|
+
#
|
253
|
+
# @return [Set<URI::HTTP>]
|
254
|
+
# The history of the agent.
|
255
|
+
#
|
256
|
+
# @example
|
257
|
+
# agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
|
258
|
+
#
|
259
|
+
def history=(new_history)
|
260
|
+
@history.clear
|
261
|
+
|
262
|
+
new_history.each do |url|
|
263
|
+
@history << unless url.kind_of?(URI)
|
264
|
+
URI(url.to_s)
|
265
|
+
else
|
266
|
+
url
|
267
|
+
end
|
273
268
|
end
|
274
269
|
|
275
|
-
return
|
270
|
+
return @history
|
276
271
|
end
|
277
272
|
|
273
|
+
alias visited_urls history
|
274
|
+
|
275
|
+
#
|
276
|
+
# Specifies the links which have been visited.
|
278
277
|
#
|
279
|
-
#
|
278
|
+
# @return [Array<String>]
|
279
|
+
# The links which have been visited.
|
280
280
|
#
|
281
|
-
def
|
282
|
-
@
|
281
|
+
def visited_links
|
282
|
+
@history.map { |url| url.to_s }
|
283
283
|
end
|
284
284
|
|
285
285
|
#
|
286
|
-
#
|
287
|
-
# it will be added to the ignore_links.
|
286
|
+
# Specifies all hosts that were visited.
|
288
287
|
#
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
end
|
295
|
-
|
296
|
-
return self
|
288
|
+
# @return [Array<String>]
|
289
|
+
# The hosts which have been visited.
|
290
|
+
#
|
291
|
+
def visited_hosts
|
292
|
+
visited_urls.map { |uri| uri.host }.uniq
|
297
293
|
end
|
298
294
|
|
299
295
|
#
|
300
|
-
#
|
296
|
+
# Determines whether a URL was visited or not.
|
301
297
|
#
|
302
|
-
|
303
|
-
|
304
|
-
end
|
305
|
-
|
298
|
+
# @param [URI::HTTP, String] url
|
299
|
+
# The URL to search for.
|
306
300
|
#
|
307
|
-
#
|
308
|
-
#
|
301
|
+
# @return [Boolean]
|
302
|
+
# Specifies whether a URL was visited.
|
309
303
|
#
|
310
|
-
def
|
311
|
-
|
312
|
-
visit_exts << pattern
|
313
|
-
elsif block
|
314
|
-
visit_exts << block
|
315
|
-
end
|
304
|
+
def visited?(url)
|
305
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
316
306
|
|
317
|
-
return
|
307
|
+
return @history.include?(url)
|
318
308
|
end
|
319
309
|
|
320
310
|
#
|
321
|
-
#
|
311
|
+
# Sets the list of failed URLs.
|
322
312
|
#
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
313
|
+
# @param [#each] new_failures
|
314
|
+
# The new list of failed URLs.
|
315
|
+
#
|
316
|
+
# @return [Set<URI::HTTP>]
|
317
|
+
# The list of failed URLs.
|
327
318
|
#
|
328
|
-
#
|
329
|
-
#
|
319
|
+
# @example
|
320
|
+
# agent.failures = ['http://localhost/']
|
330
321
|
#
|
331
|
-
def
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
322
|
+
def failures=(new_failures)
|
323
|
+
@failures.clear
|
324
|
+
|
325
|
+
new_failures.each do |url|
|
326
|
+
@failures << unless url.kind_of?(URI)
|
327
|
+
URI(url.to_s)
|
328
|
+
else
|
329
|
+
url
|
330
|
+
end
|
336
331
|
end
|
337
332
|
|
338
|
-
return
|
333
|
+
return @failures
|
339
334
|
end
|
340
335
|
|
341
336
|
#
|
342
|
-
#
|
343
|
-
# specified _block_.
|
337
|
+
# Determines whether a given URL could not be visited.
|
344
338
|
#
|
345
|
-
|
346
|
-
|
347
|
-
return self
|
348
|
-
end
|
349
|
-
|
339
|
+
# @param [URI::HTTP, String] url
|
340
|
+
# The URL to check for failures.
|
350
341
|
#
|
351
|
-
#
|
352
|
-
#
|
342
|
+
# @return [Boolean]
|
343
|
+
# Specifies whether the given URL was unable to be visited.
|
353
344
|
#
|
354
|
-
def
|
355
|
-
|
356
|
-
return self
|
357
|
-
end
|
345
|
+
def failed?(url)
|
346
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
358
347
|
|
359
|
-
|
360
|
-
# For every URL that the agent visits and matches the specified
|
361
|
-
# _pattern_, it will be passed to the specified _block_.
|
362
|
-
#
|
363
|
-
def urls_like(pattern,&block)
|
364
|
-
@urls_like_blocks[pattern] << block
|
365
|
-
return self
|
348
|
+
return @failures.include?(url)
|
366
349
|
end
|
367
350
|
|
368
|
-
|
369
|
-
# For every Page that the agent visits, pass the page to the
|
370
|
-
# specified _block_.
|
371
|
-
#
|
372
|
-
def every_page(&block)
|
373
|
-
@every_page_blocks << block
|
374
|
-
return self
|
375
|
-
end
|
351
|
+
alias pending_urls queue
|
376
352
|
|
377
353
|
#
|
378
|
-
#
|
379
|
-
# _block_.
|
354
|
+
# Sets the queue of URLs to visit.
|
380
355
|
#
|
381
|
-
|
382
|
-
|
383
|
-
end
|
384
|
-
|
385
|
-
#
|
386
|
-
# Clears the history of the agent.
|
356
|
+
# @param [#each] new_queue
|
357
|
+
# The new list of URLs to visit.
|
387
358
|
#
|
388
|
-
|
389
|
-
|
390
|
-
@history.clear
|
391
|
-
@failures.clear
|
392
|
-
return self
|
393
|
-
end
|
394
|
-
|
359
|
+
# @return [Array<URI::HTTP>]
|
360
|
+
# The list of URLs to visit.
|
395
361
|
#
|
396
|
-
#
|
362
|
+
# @example
|
363
|
+
# agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
|
397
364
|
#
|
398
|
-
def
|
399
|
-
|
400
|
-
|
401
|
-
return continue!
|
402
|
-
end
|
365
|
+
def queue=(new_queue)
|
366
|
+
@queue.clear
|
403
367
|
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
visit_page(dequeue)
|
368
|
+
new_queue.each do |url|
|
369
|
+
@queue << unless url.kind_of?(URI)
|
370
|
+
URI(url.to_s)
|
371
|
+
else
|
372
|
+
url
|
373
|
+
end
|
411
374
|
end
|
412
375
|
|
413
|
-
return
|
376
|
+
return @queue
|
414
377
|
end
|
415
378
|
|
416
379
|
#
|
417
|
-
#
|
380
|
+
# Determines whether a given URL has been enqueued.
|
418
381
|
#
|
419
|
-
|
420
|
-
|
421
|
-
return run
|
422
|
-
end
|
423
|
-
|
382
|
+
# @param [URI::HTTP] url
|
383
|
+
# The URL to search for in the queue.
|
424
384
|
#
|
425
|
-
#
|
426
|
-
#
|
385
|
+
# @return [Boolean]
|
386
|
+
# Specifies whether the given URL has been queued for visiting.
|
427
387
|
#
|
428
|
-
def
|
429
|
-
@
|
388
|
+
def queued?(url)
|
389
|
+
@queue.include?(url)
|
430
390
|
end
|
431
391
|
|
432
392
|
#
|
433
|
-
#
|
393
|
+
# Enqueues a given URL for visiting, only if it passes all of the
|
394
|
+
# agent's rules for visiting a given URL.
|
434
395
|
#
|
435
|
-
|
436
|
-
|
437
|
-
end
|
438
|
-
|
396
|
+
# @param [URI::HTTP, String] url
|
397
|
+
# The URL to enqueue for visiting.
|
439
398
|
#
|
440
|
-
#
|
399
|
+
# @return [Boolean]
|
400
|
+
# Specifies whether the URL was enqueued, or ignored.
|
441
401
|
#
|
442
|
-
def
|
443
|
-
|
444
|
-
|
445
|
-
end
|
402
|
+
def enqueue(url)
|
403
|
+
link = url.to_s
|
404
|
+
url = URI(link) unless url.kind_of?(URI)
|
446
405
|
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
#
|
451
|
-
# agent.schemes = ['http']
|
452
|
-
#
|
453
|
-
def schemes=(new_schemes)
|
454
|
-
@schemes = new_schemes.map { |scheme| scheme.to_s }
|
455
|
-
end
|
406
|
+
if (!(queued?(url)) && visit?(url))
|
407
|
+
begin
|
408
|
+
@every_url_blocks.each { |block| block.call(url) }
|
456
409
|
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
else
|
468
|
-
url
|
410
|
+
@urls_like_blocks.each do |pattern,blocks|
|
411
|
+
if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
|
412
|
+
blocks.each { |url_block| url_block.call(url) }
|
413
|
+
end
|
414
|
+
end
|
415
|
+
rescue Actions::Paused => action
|
416
|
+
raise(action)
|
417
|
+
rescue Actions::SkipLink
|
418
|
+
return false
|
419
|
+
rescue Actions::Action
|
469
420
|
end
|
470
|
-
end
|
471
|
-
end
|
472
421
|
|
473
|
-
|
422
|
+
@queue << url
|
423
|
+
return true
|
424
|
+
end
|
474
425
|
|
475
|
-
|
476
|
-
# Returns the +Array+ of visited URLs.
|
477
|
-
#
|
478
|
-
def visited_links
|
479
|
-
@history.map { |uri| uri.to_s }
|
426
|
+
return false
|
480
427
|
end
|
481
428
|
|
482
429
|
#
|
483
|
-
#
|
430
|
+
# Requests and creates a new Page object from a given URL.
|
484
431
|
#
|
485
|
-
|
486
|
-
|
487
|
-
end
|
488
|
-
|
432
|
+
# @param [URI::HTTP] url
|
433
|
+
# The URL to request.
|
489
434
|
#
|
490
|
-
#
|
491
|
-
#
|
435
|
+
# @yield [page]
|
436
|
+
# If a block is given, it will be passed the page that represents the
|
437
|
+
# response.
|
492
438
|
#
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
return @history.include?(url)
|
497
|
-
end
|
498
|
-
|
439
|
+
# @yieldparam [Page] page
|
440
|
+
# The page for the response.
|
499
441
|
#
|
500
|
-
#
|
501
|
-
#
|
502
|
-
#
|
503
|
-
def failed?(url)
|
504
|
-
url = URI(url) unless url.kind_of?(URI)
|
505
|
-
|
506
|
-
return @failures.include?(url)
|
507
|
-
end
|
508
|
-
|
509
|
-
alias pending_urls queue
|
510
|
-
|
511
|
-
#
|
512
|
-
# Creates a new Page object from the specified _url_. If a _block_ is
|
513
|
-
# given, it will be passed the newly created Page object.
|
442
|
+
# @return [Page, nil]
|
443
|
+
# The page for the response, or +nil+ if the request failed.
|
514
444
|
#
|
515
445
|
def get_page(url,&block)
|
446
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
447
|
+
|
516
448
|
host = url.host
|
517
449
|
port = url.port
|
518
450
|
|
@@ -522,15 +454,12 @@ module Spidr
|
|
522
454
|
path = '/'
|
523
455
|
end
|
524
456
|
|
525
|
-
|
526
|
-
|
527
|
-
proxy_user = @proxy[:user]
|
528
|
-
proxy_password = @proxy[:password]
|
457
|
+
# append the URL query to the path
|
458
|
+
path += "?#{url.query}" if url.query
|
529
459
|
|
530
460
|
begin
|
531
|
-
|
461
|
+
get_session(url.scheme,host,port) do |sess|
|
532
462
|
headers = {}
|
533
|
-
|
534
463
|
headers['User-Agent'] = @user_agent if @user_agent
|
535
464
|
headers['Referer'] = @referer if @referer
|
536
465
|
|
@@ -539,157 +468,169 @@ module Spidr
|
|
539
468
|
block.call(new_page) if block
|
540
469
|
return new_page
|
541
470
|
end
|
542
|
-
rescue SystemCallError, Net::HTTPBadResponse
|
471
|
+
rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError
|
543
472
|
failed(url)
|
473
|
+
kill_session(url.scheme,host,port)
|
544
474
|
return nil
|
545
475
|
end
|
546
476
|
end
|
547
477
|
|
548
478
|
#
|
549
|
-
#
|
550
|
-
#
|
479
|
+
# Visits a given URL, and enqueues the links recovered from the URL
|
480
|
+
# to be visited later.
|
551
481
|
#
|
552
|
-
|
553
|
-
|
554
|
-
end
|
555
|
-
|
482
|
+
# @param [URI::HTTP, String] url
|
483
|
+
# The URL to visit.
|
556
484
|
#
|
557
|
-
#
|
485
|
+
# @yield [page]
|
486
|
+
# If a block is given, it will be passed the page which was visited.
|
558
487
|
#
|
559
|
-
#
|
488
|
+
# @yieldparam [Page] page
|
489
|
+
# The page which was visited.
|
560
490
|
#
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
491
|
+
# @return [Page, nil]
|
492
|
+
# The page that was visited. If +nil+ is returned, either the request
|
493
|
+
# for the page failed, or the page was skipped.
|
494
|
+
#
|
495
|
+
def visit_page(url,&block)
|
496
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
497
|
+
|
498
|
+
get_page(url) do |page|
|
499
|
+
@history << page.url
|
500
|
+
|
501
|
+
begin
|
502
|
+
@every_page_blocks.each { |page_block| page_block.call(page) }
|
503
|
+
|
504
|
+
block.call(page) if block
|
505
|
+
rescue Actions::Paused => action
|
506
|
+
raise(action)
|
507
|
+
rescue Actions::SkipPage
|
508
|
+
return nil
|
509
|
+
rescue Actions::Action
|
567
510
|
end
|
511
|
+
|
512
|
+
page.urls.each { |next_url| enqueue(next_url) }
|
568
513
|
end
|
569
514
|
end
|
570
515
|
|
571
516
|
#
|
572
|
-
#
|
573
|
-
# +false+ otherwise.
|
517
|
+
# Converts the agent into a Hash.
|
574
518
|
#
|
575
|
-
|
576
|
-
|
519
|
+
# @return [Hash]
|
520
|
+
# The agent represented as a Hash containing the +history+ and
|
521
|
+
# the +queue+ of the agent.
|
522
|
+
#
|
523
|
+
def to_hash
|
524
|
+
{:history => @history, :queue => @queue}
|
577
525
|
end
|
578
526
|
|
527
|
+
protected
|
528
|
+
|
579
529
|
#
|
580
|
-
#
|
581
|
-
#
|
582
|
-
# was successfully enqueued, returns +false+ otherwise.
|
530
|
+
# Provides an active HTTP session for the given scheme, host
|
531
|
+
# and port.
|
583
532
|
#
|
584
|
-
|
585
|
-
|
586
|
-
|
533
|
+
# @param [String] scheme
|
534
|
+
# The scheme of the URL, which will be requested later.
|
535
|
+
#
|
536
|
+
# @param [String] host
|
537
|
+
# The host that the session is needed with.
|
538
|
+
#
|
539
|
+
# @param [Integer] port
|
540
|
+
# The port that the session is needed for.
|
541
|
+
#
|
542
|
+
# @yield [session]
|
543
|
+
# If a block is given, it will be passed the active HTTP session.
|
544
|
+
#
|
545
|
+
# @yieldparam [Net::HTTP] session
|
546
|
+
# The active HTTP session object.
|
547
|
+
#
|
548
|
+
def get_session(scheme,host,port,&block)
|
549
|
+
key = [scheme,host,port]
|
587
550
|
|
588
|
-
|
589
|
-
|
551
|
+
unless @sessions[key]
|
552
|
+
session = Net::HTTP::Proxy(
|
553
|
+
@proxy[:host],
|
554
|
+
@proxy[:port],
|
555
|
+
@proxy[:user],
|
556
|
+
@proxy[:password]
|
557
|
+
).new(host,port)
|
590
558
|
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
end
|
559
|
+
if scheme == 'https'
|
560
|
+
session.use_ssl = true
|
561
|
+
session.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
595
562
|
end
|
596
563
|
|
597
|
-
@
|
598
|
-
return true
|
564
|
+
@sessions[key] = session
|
599
565
|
end
|
600
566
|
|
601
|
-
|
567
|
+
session = @sessions[key]
|
568
|
+
block.call(session) if block
|
569
|
+
return session
|
602
570
|
end
|
603
571
|
|
604
|
-
protected
|
605
|
-
|
606
572
|
#
|
607
|
-
#
|
573
|
+
# Destroys an HTTP session for the given scheme, host and port.
|
608
574
|
#
|
609
|
-
|
610
|
-
|
611
|
-
end
|
612
|
-
|
575
|
+
# @param [String] scheme
|
576
|
+
# The scheme of the URL, which was requested through the session.
|
613
577
|
#
|
614
|
-
#
|
615
|
-
#
|
578
|
+
# @param [String] host
|
579
|
+
# The host that the session was connected with.
|
616
580
|
#
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
581
|
+
# @param [Integer] port
|
582
|
+
# The port that the session was connected to.
|
583
|
+
#
|
584
|
+
def kill_session(scheme,host,port,&block)
|
585
|
+
key = [scheme,host,port]
|
586
|
+
sess = @sessions[key]
|
587
|
+
|
588
|
+
begin
|
589
|
+
sess.finish
|
590
|
+
rescue IOError
|
591
|
+
nil
|
622
592
|
end
|
623
|
-
end
|
624
593
|
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
#
|
629
|
-
def visit_host?(url)
|
630
|
-
@host_rules.accept?(url.host)
|
594
|
+
@sessions.delete(key)
|
595
|
+
block.call if block
|
596
|
+
return nil
|
631
597
|
end
|
632
598
|
|
633
599
|
#
|
634
|
-
#
|
635
|
-
# the port of the _url_, returns +false+ otherwise.
|
636
|
-
#
|
637
|
-
def visit_port?(url)
|
638
|
-
@port_rules.accept?(url.port)
|
639
|
-
end
|
640
|
-
|
600
|
+
# Dequeues a URL that will later be visited.
|
641
601
|
#
|
642
|
-
#
|
643
|
-
#
|
602
|
+
# @return [URI::HTTP]
|
603
|
+
# The URL that was at the front of the queue.
|
644
604
|
#
|
645
|
-
def
|
646
|
-
@
|
605
|
+
def dequeue
|
606
|
+
@queue.shift
|
647
607
|
end
|
648
608
|
|
649
609
|
#
|
650
|
-
#
|
651
|
-
# the file extension of the _url_, returns +false+ otherwise.
|
610
|
+
# Determines if a given URL should be visited.
|
652
611
|
#
|
653
|
-
|
654
|
-
|
655
|
-
end
|
656
|
-
|
612
|
+
# @param [URI::HTTP] url
|
613
|
+
# The URL in question.
|
657
614
|
#
|
658
|
-
#
|
659
|
-
#
|
615
|
+
# @return [Boolean]
|
616
|
+
# Specifies whether the given URL should be visited.
|
660
617
|
#
|
661
618
|
def visit?(url)
|
662
619
|
(!(visited?(url)) &&
|
663
|
-
visit_scheme?(url) &&
|
664
|
-
visit_host?(url) &&
|
665
|
-
visit_port?(url) &&
|
666
|
-
visit_link?(url) &&
|
667
|
-
visit_ext?(url))
|
620
|
+
visit_scheme?(url.scheme) &&
|
621
|
+
visit_host?(url.host) &&
|
622
|
+
visit_port?(url.port) &&
|
623
|
+
visit_link?(url.to_s) &&
|
624
|
+
visit_ext?(url.path))
|
668
625
|
end
|
669
626
|
|
670
627
|
#
|
671
|
-
#
|
672
|
-
# _block_ is given, it will be passed a newly created Page object
|
673
|
-
# for the specified _url_.
|
674
|
-
#
|
675
|
-
def visit_page(url,&block)
|
676
|
-
get_page(url) do |page|
|
677
|
-
@history << page.url
|
678
|
-
|
679
|
-
page.urls.each { |next_url| enqueue(next_url) }
|
680
|
-
|
681
|
-
@every_page_blocks.each { |page_block| page_block.call(page) }
|
682
|
-
|
683
|
-
block.call(page) if block
|
684
|
-
end
|
685
|
-
end
|
686
|
-
|
628
|
+
# Adds a given URL to the failures list.
|
687
629
|
#
|
688
|
-
#
|
630
|
+
# @param [URI::HTTP] url
|
631
|
+
# The URL to add to the failures list.
|
689
632
|
#
|
690
633
|
def failed(url)
|
691
|
-
url = URI(url.to_s) unless url.kind_of?(URI)
|
692
|
-
|
693
634
|
@every_failed_url_blocks.each { |block| block.call(url) }
|
694
635
|
@failures << url
|
695
636
|
return true
|