spidr 0.1.9 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/History.txt +43 -0
- data/Manifest.txt +19 -0
- data/README.txt +100 -11
- data/Rakefile +15 -5
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/actions/actions.rb +79 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +6 -0
- data/lib/spidr/actions/exceptions/paused.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +8 -0
- data/lib/spidr/agent.rb +385 -444
- data/lib/spidr/events.rb +87 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/extensions/uri.rb +45 -0
- data/lib/spidr/filters.rb +438 -0
- data/lib/spidr/page.rb +211 -70
- data/lib/spidr/rules.rb +40 -18
- data/lib/spidr/spidr.rb +57 -7
- data/lib/spidr/version.rb +2 -1
- data/spec/actions_spec.rb +61 -0
- data/spec/agent_spec.rb +24 -31
- data/spec/extensions/uri_spec.rb +39 -0
- data/spec/filters_spec.rb +53 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/page_examples.rb +17 -0
- data/spec/page_spec.rb +81 -0
- data/spec/rules_spec.rb +43 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/spidr_spec.rb +30 -0
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +8 -1
- data/tasks/spec.rb +1 -0
- data/tasks/yard.rb +12 -0
- metadata +45 -6
- metadata.gz.sig +0 -0
data.tar.gz.sig
CHANGED
Binary file
|
data/History.txt
CHANGED
@@ -1,3 +1,46 @@
|
|
1
|
+
=== 0.2.0 / 2009-10-10
|
2
|
+
|
3
|
+
* Added URI.expand_path.
|
4
|
+
* Added Spidr::Page#search.
|
5
|
+
* Added Spidr::Page#at.
|
6
|
+
* Added Spidr::Page#title.
|
7
|
+
* Added Spidr::Agent#failures=.
|
8
|
+
* Added a HTTP session cache to Spidr::Agent, per suggestion of falter.
|
9
|
+
* Added Spidr::Agent#get_session.
|
10
|
+
* Added Spidr::Agent#kill_session.
|
11
|
+
* Added Spidr.proxy=.
|
12
|
+
* Added Spidr.disable_proxy!.
|
13
|
+
* Aliased Spidr::Page#txt? to Spidr::Page#plain_text?.
|
14
|
+
* Aliased Spidr::Page#ok? to Spidr::Page#is_ok?.
|
15
|
+
* Aliased Spidr::Page#redirect? to Spidr::Page#is_redirect?.
|
16
|
+
* Aliased Spidr::Page#unauthorized? to Spidr::Page#is_unauthorized?.
|
17
|
+
* Aliased Spidr::Page#forbidden? to Spidr::Page#is_forbidden?.
|
18
|
+
* Aliased Spidr::Page#missing? to Spidr::Page#is_missing?.
|
19
|
+
* Split URL filtering code out of Spidr::Agent and into Spidr::Filtering.
|
20
|
+
* Split URL / Page event code out of Spidr::Agent and into Spidr::Events.
|
21
|
+
* Split pause! / continue! / skip_link! / skip_page! methods out of
|
22
|
+
Spidr::Agent and into Spidr::Actions.
|
23
|
+
* Fixed a bug in Spidr::Page#code, where it was not returning an Integer.
|
24
|
+
* Make sure Spidr::Page#doc returns Nokogiri::XML::Document objects for
|
25
|
+
RSS/RDF/Atom pages as well.
|
26
|
+
* Fixed the handling of the Location header in Spidr::Page#links
|
27
|
+
(thanks falter).
|
28
|
+
* Fixed a bug in Spidr::Page#to_absolute where trailing '/' characters on
|
29
|
+
URI paths were not being preserved (thanks falter).
|
30
|
+
* Fixed a bug where the URI query was not being sent with the request
|
31
|
+
in Spidr::Agent#get_page (thanks Damian Steer).
|
32
|
+
* Fixed a bug where SSL sessions were not being properly set up
|
33
|
+
(thanks falter).
|
34
|
+
* Switched Spidr::Agent#history to be a Set, to improve search-time
|
35
|
+
of the history (thanks falter).
|
36
|
+
* Switched Spidr::Agent#failures to a Set.
|
37
|
+
* Allow a block to be passed to Spidr::Agent#run, which will receive all
|
38
|
+
pages visited.
|
39
|
+
* Allow Spidr::Agent#start_at and Spidr::Agent#continue! to pass blocks to
|
40
|
+
Spidr::Agent#run.
|
41
|
+
* Made Spidr::Agent#visit_page public.
|
42
|
+
* Moved to YARD based documentation.
|
43
|
+
|
1
44
|
=== 0.1.9 / 2009-06-13
|
2
45
|
|
3
46
|
* Upgraded to Hoe 2.0.0.
|
data/Manifest.txt
CHANGED
@@ -3,15 +3,34 @@ Manifest.txt
|
|
3
3
|
README.txt
|
4
4
|
Rakefile
|
5
5
|
lib/spidr.rb
|
6
|
+
lib/spidr/extensions.rb
|
7
|
+
lib/spidr/extensions/uri.rb
|
6
8
|
lib/spidr/page.rb
|
7
9
|
lib/spidr/rules.rb
|
10
|
+
lib/spidr/filters.rb
|
11
|
+
lib/spidr/events.rb
|
12
|
+
lib/spidr/actions.rb
|
13
|
+
lib/spidr/actions/exceptions.rb
|
14
|
+
lib/spidr/actions/exceptions/action.rb
|
15
|
+
lib/spidr/actions/exceptions/paused.rb
|
16
|
+
lib/spidr/actions/exceptions/skip_link.rb
|
17
|
+
lib/spidr/actions/exceptions/skip_page.rb
|
18
|
+
lib/spidr/actions/actions.rb
|
8
19
|
lib/spidr/agent.rb
|
9
20
|
lib/spidr/spidr.rb
|
10
21
|
lib/spidr/version.rb
|
11
22
|
tasks/spec.rb
|
23
|
+
tasks/yard.rb
|
12
24
|
tasks/course.rb
|
13
25
|
spec/spec_helper.rb
|
14
26
|
spec/helpers/course.rb
|
27
|
+
spec/helpers/page.rb
|
28
|
+
spec/extensions/uri_spec.rb
|
29
|
+
spec/page_examples.rb
|
30
|
+
spec/page_spec.rb
|
31
|
+
spec/rules_spec.rb
|
32
|
+
spec/filters_spec.rb
|
33
|
+
spec/actions_spec.rb
|
15
34
|
spec/agent_spec.rb
|
16
35
|
spec/spidr_spec.rb
|
17
36
|
static/course/index.html
|
data/README.txt
CHANGED
@@ -28,19 +28,14 @@ and easy to use.
|
|
28
28
|
* Every visited URL.
|
29
29
|
* Every visited URL that matches a specified pattern.
|
30
30
|
* Every URL that failed to be visited.
|
31
|
-
*
|
31
|
+
* Provides action methods to:
|
32
|
+
* Pause spidering.
|
33
|
+
* Skip processing of pages.
|
34
|
+
* Skip processing of links.
|
32
35
|
* Restore the spidering queue and history from a previous session.
|
33
36
|
* Custom User-Agent strings.
|
34
37
|
* Custom proxy settings.
|
35
38
|
|
36
|
-
== REQUIREMENTS:
|
37
|
-
|
38
|
-
* {nokogiri}[http://nokogiri.rubyforge.org/]
|
39
|
-
|
40
|
-
== INSTALL:
|
41
|
-
|
42
|
-
$ sudo gem install spidr
|
43
|
-
|
44
39
|
== EXAMPLES:
|
45
40
|
|
46
41
|
* Start spidering from a URL:
|
@@ -49,11 +44,32 @@ and easy to use.
|
|
49
44
|
|
50
45
|
* Spider a host:
|
51
46
|
|
52
|
-
Spidr.host('
|
47
|
+
Spidr.host('coderrr.wordpress.com')
|
53
48
|
|
54
49
|
* Spider a site:
|
55
50
|
|
56
|
-
Spidr.site('http://
|
51
|
+
Spidr.site('http://rubyflow.com/')
|
52
|
+
|
53
|
+
* Spider multiple hosts:
|
54
|
+
|
55
|
+
Spidr.start_at(
|
56
|
+
'http://company.com/',
|
57
|
+
:hosts => [
|
58
|
+
'company.com',
|
59
|
+
/host\d\.company\.com/
|
60
|
+
]
|
61
|
+
)
|
62
|
+
|
63
|
+
* Do not spider certain links:
|
64
|
+
|
65
|
+
Spidr.site('http://matasano.com/', :ignore_links => [/log/])
|
66
|
+
|
67
|
+
* Do not spider links on certain ports:
|
68
|
+
|
69
|
+
Spidr.site(
|
70
|
+
'http://sketchy.content.com/',
|
71
|
+
:ignore_ports => [8000, 8010, 8080]
|
72
|
+
)
|
57
73
|
|
58
74
|
* Print out visited URLs:
|
59
75
|
|
@@ -61,6 +77,79 @@ and easy to use.
|
|
61
77
|
spider.every_url { |url| puts url }
|
62
78
|
end
|
63
79
|
|
80
|
+
* Print out the URLs that could not be requested:
|
81
|
+
|
82
|
+
Spidr.site('http://sketchy.content.com/') do |spider|
|
83
|
+
spider.every_failed_url { |url| puts url }
|
84
|
+
end
|
85
|
+
|
86
|
+
* Search HTML and XML pages:
|
87
|
+
|
88
|
+
Spidr.site('http://company.withablog.com/') do |spider|
|
89
|
+
spider.every_page do |page|
|
90
|
+
puts "[-] #{page.url}"
|
91
|
+
|
92
|
+
page.search('//meta').each do |meta|
|
93
|
+
name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
94
|
+
value = meta.attributes['content']
|
95
|
+
|
96
|
+
puts " #{name} = #{value}"
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
* Print out the titles from every page:
|
102
|
+
|
103
|
+
Spidr.site('http://www.rubypulse.com/') do |spider|
|
104
|
+
spider.every_page do |page|
|
105
|
+
puts page.title if page.html?
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
* Find what kinds of web servers a host is using, by accessing the headers:
|
110
|
+
|
111
|
+
servers = Set[]
|
112
|
+
|
113
|
+
Spidr.host('generic.company.com') do |spider|
|
114
|
+
spider.all_headers do |headers|
|
115
|
+
servers << headers['server']
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
* Pause the spider on a forbidden page:
|
120
|
+
|
121
|
+
spider = Spidr.host('overnight.startup.com') do |spider|
|
122
|
+
spider.every_page do |page|
|
123
|
+
spider.pause! if page.forbidden?
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
* Skip the processing of a page:
|
128
|
+
|
129
|
+
Spidr.host('sketchy.content.com') do |spider|
|
130
|
+
spider.every_page do |page|
|
131
|
+
spider.skip_page! if page.not_found?
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
* Skip the processing of links:
|
136
|
+
|
137
|
+
Spidr.host('sketchy.content.com') do |spider|
|
138
|
+
spider.every_url do |url|
|
139
|
+
if url.path.split('/').find { |dir| dir.to_i > 1000 }
|
140
|
+
spider.skip_link!
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
== REQUIREMENTS:
|
146
|
+
|
147
|
+
* {nokogiri}[http://nokogiri.rubyforge.org/] >= 1.2.0
|
148
|
+
|
149
|
+
== INSTALL:
|
150
|
+
|
151
|
+
$ sudo gem install spidr
|
152
|
+
|
64
153
|
== LICENSE:
|
65
154
|
|
66
155
|
The MIT License
|
data/Rakefile
CHANGED
@@ -4,14 +4,24 @@ require 'rubygems'
|
|
4
4
|
require 'hoe'
|
5
5
|
require 'hoe/signing'
|
6
6
|
require './tasks/spec.rb'
|
7
|
+
require './tasks/yard.rb'
|
7
8
|
require './tasks/course.rb'
|
8
9
|
require './lib/spidr/version.rb'
|
9
10
|
|
10
|
-
Hoe.spec('spidr') do
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
11
|
+
Hoe.spec('spidr') do
|
12
|
+
self.rubyforge_name = 'spidr'
|
13
|
+
self.developer('Postmodern', 'postmodern.mod3@gmail.com')
|
14
|
+
self.remote_rdoc_dir = 'docs'
|
15
|
+
self.extra_deps = [
|
16
|
+
['nokogiri', '>=1.2.0']
|
17
|
+
]
|
18
|
+
|
19
|
+
self.extra_dev_deps = [
|
20
|
+
['rspec', '>=1.2.8'],
|
21
|
+
['yard', '>=0.2.3.5']
|
22
|
+
]
|
23
|
+
|
24
|
+
self.spec_extras = {:has_rdoc => 'yard'}
|
15
25
|
end
|
16
26
|
|
17
27
|
# vim: syntax=Ruby
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'spidr/actions/exceptions/paused'
|
2
|
+
require 'spidr/actions/exceptions/skip_link'
|
3
|
+
require 'spidr/actions/exceptions/skip_page'
|
4
|
+
|
5
|
+
module Spidr
|
6
|
+
module Actions
|
7
|
+
def initialize(options={})
|
8
|
+
@paused = false
|
9
|
+
|
10
|
+
super(options)
|
11
|
+
end
|
12
|
+
|
13
|
+
#
|
14
|
+
# Continue spidering.
|
15
|
+
#
|
16
|
+
# @yield [page]
|
17
|
+
# If a block is given, it will be passed every page visited.
|
18
|
+
#
|
19
|
+
# @yieldparam [Page] page
|
20
|
+
# The page to be visited.
|
21
|
+
#
|
22
|
+
def continue!(&block)
|
23
|
+
@paused = false
|
24
|
+
return run(&block)
|
25
|
+
end
|
26
|
+
|
27
|
+
#
|
28
|
+
# Sets the pause state of the agent.
|
29
|
+
#
|
30
|
+
# @param [Boolean] state
|
31
|
+
# The new pause state of the agent.
|
32
|
+
#
|
33
|
+
def pause=(state)
|
34
|
+
@paused = state
|
35
|
+
end
|
36
|
+
|
37
|
+
#
|
38
|
+
# Pauses the agent, causing spidering to temporarily stop.
|
39
|
+
#
|
40
|
+
# @raise [Paused]
|
41
|
+
# Indicates to the agent, that it should pause spidering.
|
42
|
+
#
|
43
|
+
def pause!
|
44
|
+
@paused = true
|
45
|
+
raise(Paused)
|
46
|
+
end
|
47
|
+
|
48
|
+
#
|
49
|
+
# Determines whether the agent is paused.
|
50
|
+
#
|
51
|
+
# @return [Boolean]
|
52
|
+
# Specifies whether the agent is paused.
|
53
|
+
#
|
54
|
+
def paused?
|
55
|
+
@paused == true
|
56
|
+
end
|
57
|
+
|
58
|
+
#
|
59
|
+
# Causes the agent to skip the link being enqueued.
|
60
|
+
#
|
61
|
+
# @raise [SkipLink]
|
62
|
+
# Indicates to the agent, that the current link should be skipped,
|
63
|
+
# and not enqueued or visited.
|
64
|
+
#
|
65
|
+
def skip_link!
|
66
|
+
raise(SkipLink)
|
67
|
+
end
|
68
|
+
|
69
|
+
#
|
70
|
+
# Causes the agent to skip the page being visited.
|
71
|
+
#
|
72
|
+
# @raise [SkipPage]
|
73
|
+
# Indicates to the agent, that the current page should be skipped.
|
74
|
+
#
|
75
|
+
def skip_page!
|
76
|
+
raise(SkipPage)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
data/lib/spidr/agent.rb
CHANGED
@@ -1,12 +1,19 @@
|
|
1
|
-
require 'spidr/
|
1
|
+
require 'spidr/filters'
|
2
|
+
require 'spidr/events'
|
3
|
+
require 'spidr/actions'
|
2
4
|
require 'spidr/page'
|
3
5
|
require 'spidr/spidr'
|
4
6
|
|
5
7
|
require 'net/http'
|
8
|
+
require 'set'
|
6
9
|
|
7
10
|
module Spidr
|
8
11
|
class Agent
|
9
12
|
|
13
|
+
include Filters
|
14
|
+
include Events
|
15
|
+
include Actions
|
16
|
+
|
10
17
|
# Proxy to use
|
11
18
|
attr_accessor :proxy
|
12
19
|
|
@@ -19,9 +26,6 @@ module Spidr
|
|
19
26
|
# Delay in between fetching pages
|
20
27
|
attr_accessor :delay
|
21
28
|
|
22
|
-
# List of acceptable URL schemes to follow
|
23
|
-
attr_reader :schemes
|
24
|
-
|
25
29
|
# History containing visited URLs
|
26
30
|
attr_reader :history
|
27
31
|
|
@@ -32,105 +36,81 @@ module Spidr
|
|
32
36
|
attr_reader :queue
|
33
37
|
|
34
38
|
#
|
35
|
-
# Creates a new Agent object
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
54
|
-
#
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
60
|
-
#
|
39
|
+
# Creates a new Agent object.
|
40
|
+
#
|
41
|
+
# @param [Hash] options
|
42
|
+
# Additional options
|
43
|
+
#
|
44
|
+
# @option options [Hash] :proxy (Spidr.proxy)
|
45
|
+
# The proxy information to use.
|
46
|
+
#
|
47
|
+
# @option :proxy [String] :host
|
48
|
+
# The host the proxy is running on.
|
49
|
+
#
|
50
|
+
# @option :proxy [Integer] :port
|
51
|
+
# The port the proxy is running on.
|
52
|
+
#
|
53
|
+
# @option :proxy [String] :user
|
54
|
+
# The user to authenticate as with the proxy.
|
55
|
+
#
|
56
|
+
# @option :proxy [String] :password
|
57
|
+
# The password to authenticate with.
|
58
|
+
#
|
59
|
+
# @option options [String] :user_agent (Spidr.user_agent)
|
60
|
+
# The User-Agent string to send with each request.
|
61
|
+
#
|
62
|
+
# @option options [String] :referer
|
63
|
+
# The Referer URL to send with each request.
|
64
|
+
#
|
65
|
+
# @option options [Integer] :delay (0)
|
66
|
+
# The number of seconds to pause between each request.
|
67
|
+
#
|
68
|
+
# @option options [Set, Array] :queue
|
69
|
+
# The initial queue of URLs to visit.
|
70
|
+
#
|
71
|
+
# @option options [Set, Array] :history
|
72
|
+
# The initial list of visited URLs.
|
73
|
+
#
|
74
|
+
# @yield [agent]
|
75
|
+
# If a block is given, it will be passed the newly created agent
|
76
|
+
# for further configuration.
|
77
|
+
#
|
78
|
+
# @yieldparam [Agent] agent
|
79
|
+
# The newly created agent.
|
61
80
|
#
|
62
81
|
def initialize(options={},&block)
|
63
82
|
@proxy = (options[:proxy] || Spidr.proxy)
|
64
83
|
@user_agent = (options[:user_agent] || Spidr.user_agent)
|
65
84
|
@referer = options[:referer]
|
66
85
|
|
67
|
-
@
|
68
|
-
|
69
|
-
if options[:schemes]
|
70
|
-
@schemes += options[:schemes]
|
71
|
-
else
|
72
|
-
@schemes << 'http'
|
73
|
-
|
74
|
-
begin
|
75
|
-
require 'net/https'
|
76
|
-
|
77
|
-
@schemes << 'https'
|
78
|
-
rescue Gem::LoadError => e
|
79
|
-
raise(e)
|
80
|
-
rescue ::LoadError
|
81
|
-
STDERR.puts "Warning: cannot load 'net/https', https support disabled"
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
@host_rules = Rules.new(
|
86
|
-
:accept => options[:hosts],
|
87
|
-
:reject => options[:ignore_hosts]
|
88
|
-
)
|
89
|
-
@port_rules = Rules.new(
|
90
|
-
:accept => options[:ports],
|
91
|
-
:reject => options[:ignore_ports]
|
92
|
-
)
|
93
|
-
@link_rules = Rules.new(
|
94
|
-
:accept => options[:links],
|
95
|
-
:reject => options[:ignore_links]
|
96
|
-
)
|
97
|
-
@ext_rules = Rules.new(
|
98
|
-
:accept => options[:exts],
|
99
|
-
:reject => options[:ignore_exts]
|
100
|
-
)
|
101
|
-
|
102
|
-
@every_url_blocks = []
|
103
|
-
@every_failed_url_blocks = []
|
104
|
-
@urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
105
|
-
|
106
|
-
@every_page_blocks = []
|
107
|
-
|
86
|
+
@running = false
|
108
87
|
@delay = (options[:delay] || 0)
|
109
|
-
@history = []
|
110
|
-
@failures = []
|
88
|
+
@history = Set[]
|
89
|
+
@failures = Set[]
|
111
90
|
@queue = []
|
112
|
-
@paused = true
|
113
91
|
|
114
|
-
|
115
|
-
visit_hosts_like(options[:host])
|
116
|
-
end
|
92
|
+
@sessions = {}
|
117
93
|
|
118
|
-
|
119
|
-
self.queue = options[:queue]
|
120
|
-
end
|
121
|
-
|
122
|
-
if options[:history]
|
123
|
-
self.history = options[:history]
|
124
|
-
end
|
94
|
+
super(options)
|
125
95
|
|
126
96
|
block.call(self) if block
|
127
97
|
end
|
128
98
|
|
129
99
|
#
|
130
|
-
# Creates a new
|
131
|
-
#
|
132
|
-
#
|
133
|
-
# spidering.
|
100
|
+
# Creates a new agent and begins spidering at the given URL.
|
101
|
+
#
|
102
|
+
# @param [URI::HTTP, String] url
|
103
|
+
# The URL to start spidering at.
|
104
|
+
#
|
105
|
+
# @param [Hash] options
|
106
|
+
# Additional options. See {Agent#initialize}.
|
107
|
+
#
|
108
|
+
# @yield [agent]
|
109
|
+
# If a block is given, it will be passed the newly created agent
|
110
|
+
# before it begins spidering.
|
111
|
+
#
|
112
|
+
# @yieldparam [Agent] agent
|
113
|
+
# The newly created agent.
|
134
114
|
#
|
135
115
|
def self.start_at(url,options={},&block)
|
136
116
|
self.new(options) do |spider|
|
@@ -141,10 +121,20 @@ module Spidr
|
|
141
121
|
end
|
142
122
|
|
143
123
|
#
|
144
|
-
# Creates a new
|
145
|
-
#
|
146
|
-
#
|
147
|
-
#
|
124
|
+
# Creates a new agent and spiders the given host.
|
125
|
+
#
|
126
|
+
# @param [String] name
|
127
|
+
# The host-name to spider.
|
128
|
+
#
|
129
|
+
# @param [Hash] options
|
130
|
+
# Additional options. See {Agent#initialize}.
|
131
|
+
#
|
132
|
+
# @yield [agent]
|
133
|
+
# If a block is given, it will be passed the newly created agent
|
134
|
+
# before it begins spidering.
|
135
|
+
#
|
136
|
+
# @yieldparam [Agent] agent
|
137
|
+
# The newly created agent.
|
148
138
|
#
|
149
139
|
def self.host(name,options={},&block)
|
150
140
|
self.new(options.merge(:host => name)) do |spider|
|
@@ -155,10 +145,20 @@ module Spidr
|
|
155
145
|
end
|
156
146
|
|
157
147
|
#
|
158
|
-
# Creates a new
|
159
|
-
#
|
160
|
-
#
|
161
|
-
#
|
148
|
+
# Creates a new agent and spiders the web-site located at the given URL.
|
149
|
+
#
|
150
|
+
# @param [URI::HTTP, String] url
|
151
|
+
# The web-site to spider.
|
152
|
+
#
|
153
|
+
# @param [Hash] options
|
154
|
+
# Additional options. See {Agent#initialize}.
|
155
|
+
#
|
156
|
+
# @yield [agent]
|
157
|
+
# If a block is given, it will be passed the newly created agent
|
158
|
+
# before it begins spidering.
|
159
|
+
#
|
160
|
+
# @yieldparam [Agent] agent
|
161
|
+
# The newly created agent.
|
162
162
|
#
|
163
163
|
def self.site(url,options={},&block)
|
164
164
|
url = URI(url.to_s)
|
@@ -171,348 +171,280 @@ module Spidr
|
|
171
171
|
end
|
172
172
|
|
173
173
|
#
|
174
|
-
#
|
175
|
-
#
|
176
|
-
def visit_hosts
|
177
|
-
@host_rules.accept
|
178
|
-
end
|
179
|
-
|
180
|
-
#
|
181
|
-
# Adds the given _pattern_ to the visit_hosts. If a _block_ is given,
|
182
|
-
# it will be added to the visit_hosts.
|
174
|
+
# Clears the queue, history and failures of the agent.
|
183
175
|
#
|
184
|
-
def
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
visit_hosts << block
|
189
|
-
end
|
190
|
-
|
176
|
+
def clear
|
177
|
+
@queue.clear
|
178
|
+
@history.clear
|
179
|
+
@failures.clear
|
191
180
|
return self
|
192
181
|
end
|
193
182
|
|
194
183
|
#
|
195
|
-
#
|
184
|
+
# Start spidering at a given URL.
|
196
185
|
#
|
197
|
-
|
198
|
-
|
199
|
-
end
|
200
|
-
|
186
|
+
# @param [URI::HTTP, String] url
|
187
|
+
# The URL to start spidering at.
|
201
188
|
#
|
202
|
-
#
|
203
|
-
# it will be
|
189
|
+
# @yield [page]
|
190
|
+
# If a block is given, it will be passed every page visited.
|
204
191
|
#
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
end
|
192
|
+
# @yieldparam [Page] page
|
193
|
+
# A page which has been visited.
|
194
|
+
#
|
195
|
+
def start_at(url,&block)
|
196
|
+
enqueue(url)
|
211
197
|
|
212
|
-
return
|
198
|
+
return run(&block)
|
213
199
|
end
|
214
200
|
|
215
201
|
#
|
216
|
-
#
|
202
|
+
# Start spidering until the queue becomes empty or the agent is
|
203
|
+
# paused.
|
217
204
|
#
|
218
|
-
|
219
|
-
|
220
|
-
end
|
221
|
-
|
205
|
+
# @yield [page]
|
206
|
+
# If a block is given, it will be passed every page visited.
|
222
207
|
#
|
223
|
-
#
|
224
|
-
#
|
208
|
+
# @yieldparam [Page] page
|
209
|
+
# A page which has been visited.
|
225
210
|
#
|
226
|
-
def
|
227
|
-
|
228
|
-
visit_ports << pattern
|
229
|
-
elsif block
|
230
|
-
visit_ports << block
|
231
|
-
end
|
211
|
+
def run(&block)
|
212
|
+
@running = true
|
232
213
|
|
233
|
-
|
234
|
-
|
214
|
+
until (@queue.empty? || paused?)
|
215
|
+
begin
|
216
|
+
visit_page(dequeue,&block)
|
217
|
+
rescue Actions::Paused
|
218
|
+
return self
|
219
|
+
rescue Actions::Action
|
220
|
+
end
|
221
|
+
end
|
235
222
|
|
236
|
-
|
237
|
-
# Returns the +Array+ of URL port patterns to not visit.
|
238
|
-
#
|
239
|
-
def ignore_ports
|
240
|
-
@port_rules.reject
|
241
|
-
end
|
223
|
+
@running = false
|
242
224
|
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
ignore_ports << pattern
|
250
|
-
elsif block
|
251
|
-
ignore_ports << block
|
225
|
+
@sessions.each_value do |sess|
|
226
|
+
begin
|
227
|
+
sess.finish
|
228
|
+
rescue IOError
|
229
|
+
nil
|
230
|
+
end
|
252
231
|
end
|
253
232
|
|
233
|
+
@sessions.clear
|
254
234
|
return self
|
255
235
|
end
|
256
236
|
|
257
237
|
#
|
258
|
-
#
|
238
|
+
# Determines if the agent is running.
|
259
239
|
#
|
260
|
-
|
261
|
-
|
240
|
+
# @return [Boolean]
|
241
|
+
# Specifies whether the agent is running or stopped.
|
242
|
+
#
|
243
|
+
def running?
|
244
|
+
@running == true
|
262
245
|
end
|
263
246
|
|
264
247
|
#
|
265
|
-
#
|
266
|
-
# it will be added to the visit_links.
|
248
|
+
# Sets the history of URLs that were previously visited.
|
267
249
|
#
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
250
|
+
# @param [#each] new_history
|
251
|
+
# A list of URLs to populate the history with.
|
252
|
+
#
|
253
|
+
# @return [Set<URI::HTTP>]
|
254
|
+
# The history of the agent.
|
255
|
+
#
|
256
|
+
# @example
|
257
|
+
# agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
|
258
|
+
#
|
259
|
+
def history=(new_history)
|
260
|
+
@history.clear
|
261
|
+
|
262
|
+
new_history.each do |url|
|
263
|
+
@history << unless url.kind_of?(URI)
|
264
|
+
URI(url.to_s)
|
265
|
+
else
|
266
|
+
url
|
267
|
+
end
|
273
268
|
end
|
274
269
|
|
275
|
-
return
|
270
|
+
return @history
|
276
271
|
end
|
277
272
|
|
273
|
+
alias visited_urls history
|
274
|
+
|
275
|
+
#
|
276
|
+
# Specifies the links which have been visited.
|
278
277
|
#
|
279
|
-
#
|
278
|
+
# @return [Array<String>]
|
279
|
+
# The links which have been visited.
|
280
280
|
#
|
281
|
-
def
|
282
|
-
@
|
281
|
+
def visited_links
|
282
|
+
@history.map { |url| url.to_s }
|
283
283
|
end
|
284
284
|
|
285
285
|
#
|
286
|
-
#
|
287
|
-
# it will be added to the ignore_links.
|
286
|
+
# Specifies all hosts that were visited.
|
288
287
|
#
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
end
|
295
|
-
|
296
|
-
return self
|
288
|
+
# @return [Array<String>]
|
289
|
+
# The hosts which have been visited.
|
290
|
+
#
|
291
|
+
def visited_hosts
|
292
|
+
visited_urls.map { |uri| uri.host }.uniq
|
297
293
|
end
|
298
294
|
|
299
295
|
#
|
300
|
-
#
|
296
|
+
# Determines whether a URL was visited or not.
|
301
297
|
#
|
302
|
-
|
303
|
-
|
304
|
-
end
|
305
|
-
|
298
|
+
# @param [URI::HTTP, String] url
|
299
|
+
# The URL to search for.
|
306
300
|
#
|
307
|
-
#
|
308
|
-
#
|
301
|
+
# @return [Boolean]
|
302
|
+
# Specifies whether a URL was visited.
|
309
303
|
#
|
310
|
-
def
|
311
|
-
|
312
|
-
visit_exts << pattern
|
313
|
-
elsif block
|
314
|
-
visit_exts << block
|
315
|
-
end
|
304
|
+
def visited?(url)
|
305
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
316
306
|
|
317
|
-
return
|
307
|
+
return @history.include?(url)
|
318
308
|
end
|
319
309
|
|
320
310
|
#
|
321
|
-
#
|
311
|
+
# Sets the list of failed URLs.
|
322
312
|
#
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
313
|
+
# @param [#each] new_failures
|
314
|
+
# The new list of failed URLs.
|
315
|
+
#
|
316
|
+
# @return [Set<URI::HTTP>]
|
317
|
+
# The list of failed URLs.
|
327
318
|
#
|
328
|
-
#
|
329
|
-
#
|
319
|
+
# @example
|
320
|
+
# agent.failures = ['http://localhost/']
|
330
321
|
#
|
331
|
-
def
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
322
|
+
def failures=(new_failures)
|
323
|
+
@failures.clear
|
324
|
+
|
325
|
+
new_failures.each do |url|
|
326
|
+
@failures << unless url.kind_of?(URI)
|
327
|
+
URI(url.to_s)
|
328
|
+
else
|
329
|
+
url
|
330
|
+
end
|
336
331
|
end
|
337
332
|
|
338
|
-
return
|
333
|
+
return @failures
|
339
334
|
end
|
340
335
|
|
341
336
|
#
|
342
|
-
#
|
343
|
-
# specified _block_.
|
337
|
+
# Determines whether a given URL could not be visited.
|
344
338
|
#
|
345
|
-
|
346
|
-
|
347
|
-
return self
|
348
|
-
end
|
349
|
-
|
339
|
+
# @param [URI::HTTP, String] url
|
340
|
+
# The URL to check for failures.
|
350
341
|
#
|
351
|
-
#
|
352
|
-
#
|
342
|
+
# @return [Boolean]
|
343
|
+
# Specifies whether the given URL was unable to be visited.
|
353
344
|
#
|
354
|
-
def
|
355
|
-
|
356
|
-
return self
|
357
|
-
end
|
345
|
+
def failed?(url)
|
346
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
358
347
|
|
359
|
-
|
360
|
-
# For every URL that the agent visits and matches the specified
|
361
|
-
# _pattern_, it will be passed to the specified _block_.
|
362
|
-
#
|
363
|
-
def urls_like(pattern,&block)
|
364
|
-
@urls_like_blocks[pattern] << block
|
365
|
-
return self
|
348
|
+
return @failures.include?(url)
|
366
349
|
end
|
367
350
|
|
368
|
-
|
369
|
-
# For every Page that the agent visits, pass the page to the
|
370
|
-
# specified _block_.
|
371
|
-
#
|
372
|
-
def every_page(&block)
|
373
|
-
@every_page_blocks << block
|
374
|
-
return self
|
375
|
-
end
|
351
|
+
alias pending_urls queue
|
376
352
|
|
377
353
|
#
|
378
|
-
#
|
379
|
-
# _block_.
|
354
|
+
# Sets the queue of URLs to visit.
|
380
355
|
#
|
381
|
-
|
382
|
-
|
383
|
-
end
|
384
|
-
|
385
|
-
#
|
386
|
-
# Clears the history of the agent.
|
356
|
+
# @param [#each] new_queue
|
357
|
+
# The new list of URLs to visit.
|
387
358
|
#
|
388
|
-
|
389
|
-
|
390
|
-
@history.clear
|
391
|
-
@failures.clear
|
392
|
-
return self
|
393
|
-
end
|
394
|
-
|
359
|
+
# @return [Array<URI::HTTP>]
|
360
|
+
# The list of URLs to visit.
|
395
361
|
#
|
396
|
-
#
|
362
|
+
# @example
|
363
|
+
# agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
|
397
364
|
#
|
398
|
-
def
|
399
|
-
|
400
|
-
|
401
|
-
return continue!
|
402
|
-
end
|
365
|
+
def queue=(new_queue)
|
366
|
+
@queue.clear
|
403
367
|
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
visit_page(dequeue)
|
368
|
+
new_queue.each do |url|
|
369
|
+
@queue << unless url.kind_of?(URI)
|
370
|
+
URI(url.to_s)
|
371
|
+
else
|
372
|
+
url
|
373
|
+
end
|
411
374
|
end
|
412
375
|
|
413
|
-
return
|
376
|
+
return @queue
|
414
377
|
end
|
415
378
|
|
416
379
|
#
|
417
|
-
#
|
380
|
+
# Determines whether a given URL has been enqueued.
|
418
381
|
#
|
419
|
-
|
420
|
-
|
421
|
-
return run
|
422
|
-
end
|
423
|
-
|
382
|
+
# @param [URI::HTTP] url
|
383
|
+
# The URL to search for in the queue.
|
424
384
|
#
|
425
|
-
#
|
426
|
-
#
|
385
|
+
# @return [Boolean]
|
386
|
+
# Specifies whether the given URL has been queued for visiting.
|
427
387
|
#
|
428
|
-
def
|
429
|
-
@
|
388
|
+
def queued?(url)
|
389
|
+
@queue.include?(url)
|
430
390
|
end
|
431
391
|
|
432
392
|
#
|
433
|
-
#
|
393
|
+
# Enqueues a given URL for visiting, only if it passes all of the
|
394
|
+
# agent's rules for visiting a given URL.
|
434
395
|
#
|
435
|
-
|
436
|
-
|
437
|
-
end
|
438
|
-
|
396
|
+
# @param [URI::HTTP, String] url
|
397
|
+
# The URL to enqueue for visiting.
|
439
398
|
#
|
440
|
-
#
|
399
|
+
# @return [Boolean]
|
400
|
+
# Specifies whether the URL was enqueued, or ignored.
|
441
401
|
#
|
442
|
-
def
|
443
|
-
|
444
|
-
|
445
|
-
end
|
402
|
+
def enqueue(url)
|
403
|
+
link = url.to_s
|
404
|
+
url = URI(link) unless url.kind_of?(URI)
|
446
405
|
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
#
|
451
|
-
# agent.schemes = ['http']
|
452
|
-
#
|
453
|
-
def schemes=(new_schemes)
|
454
|
-
@schemes = new_schemes.map { |scheme| scheme.to_s }
|
455
|
-
end
|
406
|
+
if (!(queued?(url)) && visit?(url))
|
407
|
+
begin
|
408
|
+
@every_url_blocks.each { |block| block.call(url) }
|
456
409
|
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
else
|
468
|
-
url
|
410
|
+
@urls_like_blocks.each do |pattern,blocks|
|
411
|
+
if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
|
412
|
+
blocks.each { |url_block| url_block.call(url) }
|
413
|
+
end
|
414
|
+
end
|
415
|
+
rescue Actions::Paused => action
|
416
|
+
raise(action)
|
417
|
+
rescue Actions::SkipLink
|
418
|
+
return false
|
419
|
+
rescue Actions::Action
|
469
420
|
end
|
470
|
-
end
|
471
|
-
end
|
472
421
|
|
473
|
-
|
422
|
+
@queue << url
|
423
|
+
return true
|
424
|
+
end
|
474
425
|
|
475
|
-
|
476
|
-
# Returns the +Array+ of visited URLs.
|
477
|
-
#
|
478
|
-
def visited_links
|
479
|
-
@history.map { |uri| uri.to_s }
|
426
|
+
return false
|
480
427
|
end
|
481
428
|
|
482
429
|
#
|
483
|
-
#
|
430
|
+
# Requests and creates a new Page object from a given URL.
|
484
431
|
#
|
485
|
-
|
486
|
-
|
487
|
-
end
|
488
|
-
|
432
|
+
# @param [URI::HTTP] url
|
433
|
+
# The URL to request.
|
489
434
|
#
|
490
|
-
#
|
491
|
-
#
|
435
|
+
# @yield [page]
|
436
|
+
# If a block is given, it will be passed the page that represents the
|
437
|
+
# response.
|
492
438
|
#
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
return @history.include?(url)
|
497
|
-
end
|
498
|
-
|
439
|
+
# @yieldparam [Page] page
|
440
|
+
# The page for the response.
|
499
441
|
#
|
500
|
-
#
|
501
|
-
#
|
502
|
-
#
|
503
|
-
def failed?(url)
|
504
|
-
url = URI(url) unless url.kind_of?(URI)
|
505
|
-
|
506
|
-
return @failures.include?(url)
|
507
|
-
end
|
508
|
-
|
509
|
-
alias pending_urls queue
|
510
|
-
|
511
|
-
#
|
512
|
-
# Creates a new Page object from the specified _url_. If a _block_ is
|
513
|
-
# given, it will be passed the newly created Page object.
|
442
|
+
# @return [Page, nil]
|
443
|
+
# The page for the response, or +nil+ if the request failed.
|
514
444
|
#
|
515
445
|
def get_page(url,&block)
|
446
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
447
|
+
|
516
448
|
host = url.host
|
517
449
|
port = url.port
|
518
450
|
|
@@ -522,15 +454,12 @@ module Spidr
|
|
522
454
|
path = '/'
|
523
455
|
end
|
524
456
|
|
525
|
-
|
526
|
-
|
527
|
-
proxy_user = @proxy[:user]
|
528
|
-
proxy_password = @proxy[:password]
|
457
|
+
# append the URL query to the path
|
458
|
+
path += "?#{url.query}" if url.query
|
529
459
|
|
530
460
|
begin
|
531
|
-
|
461
|
+
get_session(url.scheme,host,port) do |sess|
|
532
462
|
headers = {}
|
533
|
-
|
534
463
|
headers['User-Agent'] = @user_agent if @user_agent
|
535
464
|
headers['Referer'] = @referer if @referer
|
536
465
|
|
@@ -539,157 +468,169 @@ module Spidr
|
|
539
468
|
block.call(new_page) if block
|
540
469
|
return new_page
|
541
470
|
end
|
542
|
-
rescue SystemCallError, Net::HTTPBadResponse
|
471
|
+
rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError
|
543
472
|
failed(url)
|
473
|
+
kill_session(url.scheme,host,port)
|
544
474
|
return nil
|
545
475
|
end
|
546
476
|
end
|
547
477
|
|
548
478
|
#
|
549
|
-
#
|
550
|
-
#
|
479
|
+
# Visits a given URL, and enqueues the links recovered from the URL
|
480
|
+
# to be visited later.
|
551
481
|
#
|
552
|
-
|
553
|
-
|
554
|
-
end
|
555
|
-
|
482
|
+
# @param [URI::HTTP, String] url
|
483
|
+
# The URL to visit.
|
556
484
|
#
|
557
|
-
#
|
485
|
+
# @yield [page]
|
486
|
+
# If a block is given, it will be passed the page which was visited.
|
558
487
|
#
|
559
|
-
#
|
488
|
+
# @yieldparam [Page] page
|
489
|
+
# The page which was visited.
|
560
490
|
#
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
491
|
+
# @return [Page, nil]
|
492
|
+
# The page that was visited. If +nil+ is returned, either the request
|
493
|
+
# for the page failed, or the page was skipped.
|
494
|
+
#
|
495
|
+
def visit_page(url,&block)
|
496
|
+
url = URI(url.to_s) unless url.kind_of?(URI)
|
497
|
+
|
498
|
+
get_page(url) do |page|
|
499
|
+
@history << page.url
|
500
|
+
|
501
|
+
begin
|
502
|
+
@every_page_blocks.each { |page_block| page_block.call(page) }
|
503
|
+
|
504
|
+
block.call(page) if block
|
505
|
+
rescue Actions::Paused => action
|
506
|
+
raise(action)
|
507
|
+
rescue Actions::SkipPage
|
508
|
+
return nil
|
509
|
+
rescue Actions::Action
|
567
510
|
end
|
511
|
+
|
512
|
+
page.urls.each { |next_url| enqueue(next_url) }
|
568
513
|
end
|
569
514
|
end
|
570
515
|
|
571
516
|
#
|
572
|
-
#
|
573
|
-
# +false+ otherwise.
|
517
|
+
# Converts the agent into a Hash.
|
574
518
|
#
|
575
|
-
|
576
|
-
|
519
|
+
# @return [Hash]
|
520
|
+
# The agent represented as a Hash containing the +history+ and
|
521
|
+
# the +queue+ of the agent.
|
522
|
+
#
|
523
|
+
def to_hash
|
524
|
+
{:history => @history, :queue => @queue}
|
577
525
|
end
|
578
526
|
|
527
|
+
protected
|
528
|
+
|
579
529
|
#
|
580
|
-
#
|
581
|
-
#
|
582
|
-
# was successfully enqueued, returns +false+ otherwise.
|
530
|
+
# Provides an active HTTP session for the given scheme, host
|
531
|
+
# and port.
|
583
532
|
#
|
584
|
-
|
585
|
-
|
586
|
-
|
533
|
+
# @param [String] scheme
|
534
|
+
# The scheme of the URL, which will be requested later.
|
535
|
+
#
|
536
|
+
# @param [String] host
|
537
|
+
# The host that the session is needed with.
|
538
|
+
#
|
539
|
+
# @param [Integer] port
|
540
|
+
# The port that the session is needed for.
|
541
|
+
#
|
542
|
+
# @yield [session]
|
543
|
+
# If a block is given, it will be passed the active HTTP session.
|
544
|
+
#
|
545
|
+
# @yieldparam [Net::HTTP] session
|
546
|
+
# The active HTTP session object.
|
547
|
+
#
|
548
|
+
def get_session(scheme,host,port,&block)
|
549
|
+
key = [scheme,host,port]
|
587
550
|
|
588
|
-
|
589
|
-
|
551
|
+
unless @sessions[key]
|
552
|
+
session = Net::HTTP::Proxy(
|
553
|
+
@proxy[:host],
|
554
|
+
@proxy[:port],
|
555
|
+
@proxy[:user],
|
556
|
+
@proxy[:password]
|
557
|
+
).new(host,port)
|
590
558
|
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
end
|
559
|
+
if scheme == 'https'
|
560
|
+
session.use_ssl = true
|
561
|
+
session.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
595
562
|
end
|
596
563
|
|
597
|
-
@
|
598
|
-
return true
|
564
|
+
@sessions[key] = session
|
599
565
|
end
|
600
566
|
|
601
|
-
|
567
|
+
session = @sessions[key]
|
568
|
+
block.call(session) if block
|
569
|
+
return session
|
602
570
|
end
|
603
571
|
|
604
|
-
protected
|
605
|
-
|
606
572
|
#
|
607
|
-
#
|
573
|
+
# Destroys an HTTP session for the given scheme, host and port.
|
608
574
|
#
|
609
|
-
|
610
|
-
|
611
|
-
end
|
612
|
-
|
575
|
+
# @param [String] scheme
|
576
|
+
# The scheme of the URL, which was requested through the session.
|
613
577
|
#
|
614
|
-
#
|
615
|
-
#
|
578
|
+
# @param [String] host
|
579
|
+
# The host that the session was connected with.
|
616
580
|
#
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
581
|
+
# @param [Integer] port
|
582
|
+
# The port that the session was connected to.
|
583
|
+
#
|
584
|
+
def kill_session(scheme,host,port,&block)
|
585
|
+
key = [scheme,host,port]
|
586
|
+
sess = @sessions[key]
|
587
|
+
|
588
|
+
begin
|
589
|
+
sess.finish
|
590
|
+
rescue IOError
|
591
|
+
nil
|
622
592
|
end
|
623
|
-
end
|
624
593
|
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
#
|
629
|
-
def visit_host?(url)
|
630
|
-
@host_rules.accept?(url.host)
|
594
|
+
@sessions.delete(key)
|
595
|
+
block.call if block
|
596
|
+
return nil
|
631
597
|
end
|
632
598
|
|
633
599
|
#
|
634
|
-
#
|
635
|
-
# the port of the _url_, returns +false+ otherwise.
|
636
|
-
#
|
637
|
-
def visit_port?(url)
|
638
|
-
@port_rules.accept?(url.port)
|
639
|
-
end
|
640
|
-
|
600
|
+
# Dequeues a URL that will later be visited.
|
641
601
|
#
|
642
|
-
#
|
643
|
-
#
|
602
|
+
# @return [URI::HTTP]
|
603
|
+
# The URL that was at the front of the queue.
|
644
604
|
#
|
645
|
-
def
|
646
|
-
@
|
605
|
+
def dequeue
|
606
|
+
@queue.shift
|
647
607
|
end
|
648
608
|
|
649
609
|
#
|
650
|
-
#
|
651
|
-
# the file extension of the _url_, returns +false+ otherwise.
|
610
|
+
# Determines if a given URL should be visited.
|
652
611
|
#
|
653
|
-
|
654
|
-
|
655
|
-
end
|
656
|
-
|
612
|
+
# @param [URI::HTTP] url
|
613
|
+
# The URL in question.
|
657
614
|
#
|
658
|
-
#
|
659
|
-
#
|
615
|
+
# @return [Boolean]
|
616
|
+
# Specifies whether the given URL should be visited.
|
660
617
|
#
|
661
618
|
def visit?(url)
|
662
619
|
(!(visited?(url)) &&
|
663
|
-
visit_scheme?(url) &&
|
664
|
-
visit_host?(url) &&
|
665
|
-
visit_port?(url) &&
|
666
|
-
visit_link?(url) &&
|
667
|
-
visit_ext?(url))
|
620
|
+
visit_scheme?(url.scheme) &&
|
621
|
+
visit_host?(url.host) &&
|
622
|
+
visit_port?(url.port) &&
|
623
|
+
visit_link?(url.to_s) &&
|
624
|
+
visit_ext?(url.path))
|
668
625
|
end
|
669
626
|
|
670
627
|
#
|
671
|
-
#
|
672
|
-
# _block_ is given, it will be passed a newly created Page object
|
673
|
-
# for the specified _url_.
|
674
|
-
#
|
675
|
-
def visit_page(url,&block)
|
676
|
-
get_page(url) do |page|
|
677
|
-
@history << page.url
|
678
|
-
|
679
|
-
page.urls.each { |next_url| enqueue(next_url) }
|
680
|
-
|
681
|
-
@every_page_blocks.each { |page_block| page_block.call(page) }
|
682
|
-
|
683
|
-
block.call(page) if block
|
684
|
-
end
|
685
|
-
end
|
686
|
-
|
628
|
+
# Adds a given URL to the failures list.
|
687
629
|
#
|
688
|
-
#
|
630
|
+
# @param [URI::HTTP] url
|
631
|
+
# The URL to add to the failures list.
|
689
632
|
#
|
690
633
|
def failed(url)
|
691
|
-
url = URI(url.to_s) unless url.kind_of?(URI)
|
692
|
-
|
693
634
|
@every_failed_url_blocks.each { |block| block.call(url) }
|
694
635
|
@failures << url
|
695
636
|
return true
|