waw 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/waw-crawl +26 -0
- data/lib/waw.rb +3 -3
- data/lib/waw/commands/command.rb +12 -4
- data/lib/waw/commands/crawl_command.rb +46 -0
- data/lib/waw/commands/profile_command.rb +5 -1
- data/lib/waw/controllers/action/action.rb +11 -0
- data/lib/waw/controllers/action/js_generation.rb +11 -2
- data/lib/waw/controllers/static/match.rb +4 -1
- data/lib/waw/controllers/static/matcher.rb +35 -0
- data/lib/waw/controllers/static/waw_access.rb +28 -15
- data/lib/waw/controllers/static/waw_access_dsl.rb +9 -0
- data/lib/waw/controllers/static_controller.rb +5 -2
- data/lib/waw/crawler.rb +176 -0
- data/lib/waw/crawler/crawler_listener.rb +64 -0
- data/lib/waw/crawler/crawler_options.rb +42 -0
- data/lib/waw/kern/living_state.rb +93 -0
- data/lib/waw/scope_utils.rb +1 -0
- data/lib/waw/tools/mail/mail_agent.rb +1 -1
- data/lib/waw/validation.rb +3 -0
- data/lib/waw/validation/datetime_validator.rb +53 -0
- data/lib/waw/wspec/browser.rb +4 -2
- data/test/bricks/error_handler/test/test_all.rb +20 -0
- data/test/bricks/static_controller/config/test.cfg +2 -0
- data/test/bricks/static_controller/logs/webapp.log +84 -0
- data/test/bricks/static_controller/test/static_controller.wspec +12 -0
- data/test/bricks/static_controller/test/test_all.rb +20 -0
- data/test/bricks/static_controller/waw.deploy +1 -0
- data/test/bricks/static_controller/waw.routing +5 -0
- data/test/bricks/test_all.rb +8 -0
- data/test/spec/controllers/static/waw_access_spec.rb +3 -3
- data/test/spec/test_all.rb +0 -2
- data/test/spec/validation/datetime_validation_spec.rb +92 -0
- data/test/unit/test_all.rb +1 -0
- data/test/unit/waw/controllers/static/logs/webapp.log +22 -0
- metadata +23 -6
data/bin/waw-crawl
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Waw - making web development simple
|
4
|
+
# (see lib/waw/waw.rb for more information)
|
5
|
+
#
|
6
|
+
# Copyright (c) 2010 University of Louvain, Bernard & Louis Lambeau
|
7
|
+
# Released under a MIT or Ruby licence
|
8
|
+
#
|
9
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
10
|
+
require 'waw'
|
11
|
+
require 'waw/commands/command'
|
12
|
+
require 'waw/commands/crawl_command'
|
13
|
+
|
14
|
+
# Runs the crawl command from the current directory with the command-line
# arguments. Any failure is reported on stderr and then re-raised unchanged.
begin
  Waw::Commands::CrawlCommand.new.run '.', ARGV
rescue Interrupt
  # Ctrl-C: emit an empty line first, then announce the interruption
  $stderr.puts
  $stderr.puts "Interrupted"
  raise
rescue => e
  # OptionParser::ParseError is a StandardError as well, so option parsing
  # failures are reported here exactly like any other error
  $stderr.puts e.message
  raise
end
|
data/lib/waw.rb
CHANGED
@@ -9,13 +9,13 @@ module Waw
|
|
9
9
|
|
10
10
|
# Requirements on gems for this version
|
11
11
|
GEM_REQUIREMENTS = {
|
12
|
-
:rack => '>= 1.1
|
12
|
+
:rack => '>= 1.2.1',
|
13
13
|
:wlang => '>= 0.9.0',
|
14
14
|
:json => '>= 1.1.9'
|
15
15
|
}
|
16
16
|
|
17
17
|
# Waw version
|
18
|
-
VERSION = "0.
|
18
|
+
VERSION = "0.3.0".freeze
|
19
19
|
|
20
20
|
# Waw loading mutex
|
21
21
|
WAW_KERNELS_LOCK = Mutex.new
|
@@ -96,4 +96,4 @@ require 'waw/controllers/action_controller'
|
|
96
96
|
require 'waw/controllers/static_controller'
|
97
97
|
require 'waw/controllers/no_cache'
|
98
98
|
require 'waw/controllers/error_handler'
|
99
|
-
require 'waw/routing'
|
99
|
+
require 'waw/routing'
|
data/lib/waw/commands/command.rb
CHANGED
@@ -5,14 +5,14 @@ module Waw
|
|
5
5
|
class Command
|
6
6
|
|
7
7
|
# The verbose level
|
8
|
-
attr_accessor :
|
8
|
+
attr_accessor :verbosity
|
9
9
|
|
10
10
|
# Show stack traces?
|
11
11
|
attr_accessor :trace
|
12
12
|
|
13
13
|
# Creates an empty command instance
|
14
14
|
def initialize
|
15
|
-
@
|
15
|
+
@verbosity = 1
|
16
16
|
@buffer = STDOUT
|
17
17
|
end
|
18
18
|
|
@@ -35,8 +35,12 @@ module Waw
|
|
35
35
|
@trace = true
|
36
36
|
end
|
37
37
|
|
38
|
-
opt.on("--verbose", "
|
39
|
-
@
|
38
|
+
opt.on("--verbose", "Display extra info as we progress") do |value|
|
39
|
+
@verbosity = 2
|
40
|
+
end
|
41
|
+
|
42
|
+
opt.on("--silent", "Be quiet silent") do |value|
|
43
|
+
@verbosity = 0
|
40
44
|
end
|
41
45
|
|
42
46
|
# No argument, shows at tail. This will print an options summary.
|
@@ -55,6 +59,10 @@ module Waw
|
|
55
59
|
opt.separator nil
|
56
60
|
end
|
57
61
|
end
|
62
|
+
|
63
|
+
# Tells whether extra progress information has been requested, that is,
# whether the verbosity level is above the default one. The levels set by
# this class are 0 (--silent), 1 (default) and 2 (--verbose).
def verbose
  verbosity > 1
end
|
58
66
|
|
59
67
|
# Runs the command
|
60
68
|
def run(requester_file, argv)
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'waw/crawler'
|
2
|
+
module Waw
  module Commands
    # Command that drives a Waw::Crawler on the web site whose URI is given
    # as the single command-line argument.
    class CrawlCommand < Command

      # Usage banner of the command
      def banner
        <<-EOF
Usage: waw-crawl [options] WEB_SITE_URI
        EOF
      end

      # The policy check always succeeds for this command
      def check_command_policy
        true
      end

      # Creates the crawler (without root URI yet) and installs the options
      # that configure it: external links checking and the ping queries for
      # images, links and scripts.
      def add_options(options)
        @crawler = Waw::Crawler.new(nil)
        options.on("--[no-]check-externals", "[Don't] ping any external link") { |flag|
          @crawler.check_externals = flag
        }
        options.on("--[no-]check-img", "[Don't] check image <img src='...'>") { |flag|
          @crawler.ping_on('img/@src', flag)
        }
        options.on("--[no-]check-link", "[Don't] check <link href='...'>") { |flag|
          @crawler.ping_on('link/@href', flag)
        }
        options.on("--[no-]check-script", "[Don't] check <script src='...'>") { |flag|
          @crawler.ping_on('script/@src', flag)
        }
      end

      # Crawls the web site found in arguments. Anything else than exactly
      # one argument leads to exit(nil, true) (inherited; presumably shows
      # the usage -- not visible here). An interruption stops the crawl
      # quietly, with a message in verbose mode only.
      def __run(requester_file, arguments)
        exit(nil, true) if arguments.size != 1
        @crawler.root_uri = arguments.first
        @crawler.listener.verbosity = @verbosity
        @crawler.crawl
      rescue Interrupt
        info "waw-crawl stopping now... ciao!" if verbose
      end

    end # class CrawlCommand
  end # module Commands
end # module Waw
|
@@ -31,7 +31,11 @@ module Waw
|
|
31
31
|
else
|
32
32
|
puts "Visiting #{location}" if verbose
|
33
33
|
browser.all_internal_links.each do |link|
|
34
|
-
|
34
|
+
begin
|
35
|
+
visit(browser, link[:href], visited)
|
36
|
+
rescue URI::InvalidURIError => ex
|
37
|
+
puts "Hohoho, I've found something really wrong #{link[:href]}"
|
38
|
+
end
|
35
39
|
end
|
36
40
|
end
|
37
41
|
end
|
@@ -43,6 +43,17 @@ module Waw
|
|
43
43
|
"javascript:#{id}({#{buffer[2..-1]}}, '##{id}')"
|
44
44
|
end
|
45
45
|
|
46
|
+
# Returns a <script> tag embedding the javascript code of this action. The
# code is produced by JSGeneration#generate_js_for_action, called with the
# current waw kernel, this action and an empty string.
def ajax_action_code
  generator = Waw::ActionController::JSGeneration.new
  <<-EOF
<script type="text/javascript">
#{generator.generate_js_for_action(Waw::kernel, self, "")}
</script>
  EOF
end
|
56
|
+
|
46
57
|
# Factors the ajax code for preparing a formulary
|
47
58
|
def ajax_form_preparer(opts = {})
|
48
59
|
form_id = opts[:form_id] || id
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Waw
|
2
2
|
class ActionController < ::Waw::Controller
|
3
|
-
class JSGeneration
|
3
|
+
class JSGeneration < ::Waw::Controller
|
4
4
|
|
5
5
|
# Header of the generated javascript file
|
6
6
|
HEADER = <<-EOF
|
@@ -18,6 +18,15 @@ module Waw
|
|
18
18
|
raise ConfigurationError, msg
|
19
19
|
end
|
20
20
|
|
21
|
+
# Rack-like entry point: generates the javascript code into a fresh buffer
# and answers it with a 200 status and a javascript content type.
def call(env)
  chunks = []
  generate_js(kernel, chunks)
  headers = {'Content-Type' => 'application/javascript'}
  [200, headers, chunks]
end
|
29
|
+
|
21
30
|
# Start hook start callback required by Waw. Generates the javascript code
|
22
31
|
# if the configuration variable 'code_at_startup' is true.
|
23
32
|
def run(waw_kernel)
|
@@ -97,7 +106,7 @@ module Waw
|
|
97
106
|
function #{action.id}(request_data, form) {
|
98
107
|
$.ajax({type: "POST", url: "#{action.url}", data: request_data, dataType: "json",
|
99
108
|
error: function(data) {
|
100
|
-
window.location = '/
|
109
|
+
window.location = '/500';
|
101
110
|
},
|
102
111
|
success: function(data) {
|
103
112
|
THEEND
|
@@ -4,6 +4,8 @@ module Waw
|
|
4
4
|
class Match
|
5
5
|
include Waw::ScopeUtils
|
6
6
|
|
7
|
+
attr_reader :wawaccess
|
8
|
+
|
7
9
|
# Served file
|
8
10
|
attr_reader :served_file
|
9
11
|
|
@@ -15,13 +17,14 @@ module Waw
|
|
15
17
|
end
|
16
18
|
|
17
19
|
# Executes on a wawaccess instance
|
18
|
-
def __execute
|
20
|
+
def __execute(env)
|
19
21
|
instance_exec *@args, &@block
|
20
22
|
end
|
21
23
|
|
22
24
|
# The three readers below simply forward to the wawaccess that created
# this match.

# Root of the wawaccess
def root
  @wawaccess.root
end

# Folder of the wawaccess
def folder
  @wawaccess.folder
end

# Requested path, as known by the wawaccess
def req_path
  @wawaccess.req_path
end
|
25
28
|
|
26
29
|
################################################### Callbacks proposed to .wawaccess rules
|
27
30
|
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Waw
  class StaticController < ::Waw::Controller
    # A predicate attached to a wawaccess. The predicate is a block which
    # is evaluated in the context of the matcher itself, so that it may
    # use the helpers defined here (folder, req_path) and in ScopeUtils.
    class Matcher
      include Waw::ScopeUtils

      # Waw access on which this matcher is defined, and the matcher's
      # predicate
      attr_reader :wawaccess, :predicate

      # Builds a matcher for a wawaccess and a predicate block
      def initialize(wawaccess, predicate)
        @wawaccess, @predicate = wawaccess, predicate
      end

      # Folder of the wawaccess
      def folder; wawaccess.folder; end

      # Requested path, as known by the wawaccess
      def req_path; wawaccess.req_path; end

      # Evaluates the predicate on this matcher and returns its result.
      # The env argument is not passed to the predicate.
      def matches?(env)
        instance_eval(&predicate)
      end

    end # class Matcher
  end # class StaticController
end # module Waw
|
@@ -1,9 +1,8 @@
|
|
1
|
-
require 'uri'
|
2
|
-
require 'waw/controllers/static/match'
|
3
1
|
module Waw
|
4
2
|
class StaticController < ::Waw::Controller
|
5
3
|
# Waw version of .htaccess files
|
6
4
|
class WawAccess
|
5
|
+
include Waw::ScopeUtils
|
7
6
|
|
8
7
|
# The folder which is served
|
9
8
|
attr_accessor :folder
|
@@ -66,7 +65,7 @@ module Waw
|
|
66
65
|
|
67
66
|
def recognized_pattern?(pattern)
|
68
67
|
[FalseClass, TrueClass, String,
|
69
|
-
Regexp, Waw::Validation::Validator].any?{|c| c===pattern}
|
68
|
+
Regexp, Waw::Validation::Validator, StaticController::Matcher].any?{|c| c===pattern}
|
70
69
|
end
|
71
70
|
|
72
71
|
# Adds a child in the hierarchy
|
@@ -106,6 +105,10 @@ module Waw
|
|
106
105
|
|
107
106
|
################################################### Utilities about paths
|
108
107
|
|
108
|
+
# Path of the current request: the 'REQ_PATH' entry of the rack environment
# when present, the normalized 'PATH_INFO' otherwise.
def req_path
  explicit = rack_env['REQ_PATH']
  return explicit if explicit
  normalize_req_path(rack_env['PATH_INFO'])
end
|
111
|
+
|
109
112
|
# Returns the real path of a file
|
110
113
|
def realpath(file)
|
111
114
|
File.expand_path(File.join(folder, file))
|
@@ -164,7 +167,8 @@ module Waw
|
|
164
167
|
################################################### .waw access rules application!
|
165
168
|
|
166
169
|
# Finds the matching block inside this .wawaccess handler
|
167
|
-
def find_match(
|
170
|
+
def find_match(env)
|
171
|
+
path = env['REQ_PATH']
|
168
172
|
@serve.each do |pattern, block|
|
169
173
|
case pattern
|
170
174
|
when FalseClass
|
@@ -186,6 +190,10 @@ module Waw
|
|
186
190
|
if pattern.validate(matching_file(path))
|
187
191
|
return Match.new(self, path, block)
|
188
192
|
end
|
193
|
+
when StaticController::Matcher
|
194
|
+
if pattern.matches?(env)
|
195
|
+
return Match.new(self, path, block)
|
196
|
+
end
|
189
197
|
else
|
190
198
|
raise WawError, "Unrecognized wawaccess pattern #{pattern}"
|
191
199
|
end
|
@@ -194,16 +202,16 @@ module Waw
|
|
194
202
|
end
|
195
203
|
|
196
204
|
# Applies the rules defined here or delegate to the parent if allowed
|
197
|
-
def apply_rules(
|
198
|
-
if match = find_match(
|
199
|
-
match.__execute
|
205
|
+
def apply_rules(env)
|
206
|
+
if match = find_match(env)
|
207
|
+
match.__execute(env)
|
200
208
|
elsif (parent and inherits)
|
201
|
-
parent.apply_rules(
|
209
|
+
parent.apply_rules(env)
|
202
210
|
else
|
203
|
-
body = "File not found: #{
|
211
|
+
body = "File not found: #{env['PATH_INFO']}\n"
|
204
212
|
[404, {"Content-Type" => "text/plain",
|
205
|
-
|
206
|
-
|
213
|
+
"Content-Length" => body.size.to_s,
|
214
|
+
"X-Cascade" => "pass"},
|
207
215
|
[body]]
|
208
216
|
end
|
209
217
|
end
|
@@ -224,10 +232,15 @@ module Waw
|
|
224
232
|
end
|
225
233
|
|
226
234
|
# Serves a path from a root waw access in the hierarchy
|
227
|
-
def do_path_serve(path)
|
228
|
-
|
229
|
-
waw_access = (find_wawaccess_for(
|
230
|
-
waw_access.apply_rules(
|
235
|
+
def do_path_serve(path, env = rack_env)
|
236
|
+
env['REQ_PATH'] = normalize_req_path(path)
|
237
|
+
waw_access = (find_wawaccess_for(env['REQ_PATH']) || self)
|
238
|
+
waw_access.apply_rules(env)
|
239
|
+
end
|
240
|
+
|
241
|
+
# Makes a Rack standard call
|
242
|
+
def call(env)
|
243
|
+
do_path_serve(env['PATH_INFO'], env)
|
231
244
|
end
|
232
245
|
|
233
246
|
end # class WawAccess
|
@@ -9,6 +9,7 @@ module Waw
|
|
9
9
|
# Creates a DSL instance on a wawaccess, with no matcher installed yet.
# Raises an ArgumentError unless wawaccess is a WawAccess instance.
def initialize(wawaccess)
  unless wawaccess.is_a?(WawAccess)
    raise ArgumentError, "wawaccess cannot be nil"
  end
  @wawaccess = wawaccess
  @matchers = {}
end
|
13
14
|
|
14
15
|
# Returns a validator that matches the root of the wawaccess tree
|
@@ -16,6 +17,14 @@ module Waw
|
|
16
17
|
Waw::Validation.validator{|served_file| File.expand_path(served_file) == File.expand_path(@wawaccess.root.folder)}
|
17
18
|
end
|
18
19
|
|
20
|
+
# Installs a matcher under a given name: the predicate block is wrapped in
# a Matcher bound to the wawaccess, and a method with that name is defined
# on this DSL instance only, returning the installed matcher.
def matcher(name, &predicate)
  @matchers[name] = Matcher.new(@wawaccess, predicate)
  eigenclass = class << self; self; end
  eigenclass.send(:define_method, name) { @matchers[name] }
end
|
27
|
+
|
19
28
|
# Starts a wawaccess file
|
20
29
|
def wawaccess(&block)
|
21
30
|
raise WawError, "#{@wawaccess.identifier}: missing block in wawaccess call" unless block
|
@@ -1,5 +1,8 @@
|
|
1
|
+
require 'uri'
|
1
2
|
require 'waw/controllers/static/waw_access'
|
2
3
|
require 'waw/controllers/static/waw_access_dsl'
|
4
|
+
require 'waw/controllers/static/matcher'
|
5
|
+
require 'waw/controllers/static/match'
|
3
6
|
module Waw
|
4
7
|
#
|
5
8
|
# A waw service that serves public pages expressed in wlang wtpl format
|
@@ -29,8 +32,8 @@ module Waw
|
|
29
32
|
##############################################################################################
|
30
33
|
|
31
34
|
# Executes the service
|
32
|
-
def
|
33
|
-
@wawaccess.
|
35
|
+
def call(env)
|
36
|
+
@wawaccess.call(env)
|
34
37
|
end
|
35
38
|
|
36
39
|
end # class Controller
|
data/lib/waw/crawler.rb
ADDED
@@ -0,0 +1,176 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'waw/crawler/crawler_options'
|
3
|
+
require 'waw/crawler/crawler_listener'
|
4
|
+
module Waw
|
5
|
+
class Crawler
|
6
|
+
include Crawler::Options
|
7
|
+
|
8
|
+
###################################################################### Internal state
|
9
|
+
|
10
|
+
# Mechanize agent instance
|
11
|
+
attr_reader :agent
|
12
|
+
|
13
|
+
# Root URI to crawl
|
14
|
+
attr_reader :root_uri
|
15
|
+
|
16
|
+
# Sets the root uri. URI instances are kept as is, anything else is
# converted to a string and parsed. A nil uri falls back to
# http://127.0.0.1:9292. The stored value is always a URI, since
# internal_uri? asks the root uri for its host and port.
def root_uri=(uri)
  uri = "http://127.0.0.1:9292" if uri.nil?
  @root_uri = uri.is_a?(URI) ? uri : URI.parse(uri.to_s)
end
|
24
|
+
|
25
|
+
# Stack of files/pages to visit
|
26
|
+
attr_reader :stack
|
27
|
+
|
28
|
+
###################################################################### About URI visit state
|
29
|
+
|
30
|
+
# URI statuses
|
31
|
+
attr_reader :uristate
|
32
|
+
|
33
|
+
#
|
34
|
+
PINGED = 1
|
35
|
+
PENDING = 2
|
36
|
+
CHECKING = 4
|
37
|
+
CHECKED = 8
|
38
|
+
|
39
|
+
# Adds the PENDING flag to the state of an URI
def pending!(uri)
  uristate[uri] = uristate[uri] | PENDING
end
|
43
|
+
|
44
|
+
# Adds the PINGED flag to the state of an URI
def pinged!(uri)
  uristate[uri] = uristate[uri] | PINGED
end
|
48
|
+
|
49
|
+
###################################################################### Initialization
|
50
|
+
|
51
|
+
# Builds a crawler: the root URI goes through the root_uri= writer, then
# the default options are installed.
def initialize(root_uri = nil)
  self.root_uri = root_uri
  set_default_options
end
|
56
|
+
|
57
|
+
###################################################################### Utils
|
58
|
+
|
59
|
+
# Is a given URI internal to the website being crawled? That is the case
# when it has no host at all, or when both its host and its port are those
# of the root URI.
def internal_uri?(uri)
  return true if uri.host.nil?
  (uri.host == root_uri.host) && (uri.port == root_uri.port)
end
|
64
|
+
|
65
|
+
# Turns something found on a page (a href or a src) into a parsed URI. The
# resolution itself is left to the agent's resolve method, invoked through
# send.
def resolve_uri(href_or_src, page)
  resolved = agent.send(:resolve, href_or_src, page)
  URI.parse(resolved)
end
|
70
|
+
|
71
|
+
###################################################################### Crawling
|
72
|
+
|
73
|
+
# Crawls the web site, starting at the root URI. A fresh agent, state table
# and stack are installed first; documents are then taken from the front of
# the stack until it is empty. Web pages are checked, any other kind of
# document is reported to the listener as skipped. The internal state is
# dropped once done.
def crawl
  @agent = Mechanize.new
  @uristate = Hash.new { |states, uri| states[uri] = 0 }
  @stack = [agent.get(root_uri)]
  until stack.empty?
    document = stack.shift
    if document.is_a?(::Mechanize::Page)
      check_web_page(document)
    else
      listener.doc_skipped(document)
    end
  end
  @agent = @uristate = @stack = nil
end
|
91
|
+
|
92
|
+
# Calls crawl_one on everything the query finds on the referer page
def crawl_all(query, referer_page)
  referer_page.search(query).each { |found| crawl_one(found, referer_page) }
end
|
97
|
+
|
98
|
+
# Schedules the crawling of a location found on the referer page. URIs
# already PENDING, CHECKING or CHECKED are bypassed. Others are marked as
# PENDING, then fetched and pushed on the stack when internal, or reported
# to the listener as skipped otherwise. Errors go through handle_error.
def crawl_one(location, referer_page)
  target = resolve_uri(location, referer_page)

  # Nothing to do for what is already scheduled or visited
  return unless uristate[target] < PENDING

  pending!(target)
  if internal_uri?(target)
    stack.push(agent.get(target))
  else
    listener.crawl_skipped(referer_page, location)
  end
rescue => failure
  handle_error(failure, referer_page, location)
end
|
118
|
+
|
119
|
+
###################################################################### Checking
|
120
|
+
|
121
|
+
# Checks a web page: its URI is flagged as CHECKING, the ping checks then
# the crawling of its links run inside the listener's checking block, and
# the URI is finally flagged as CHECKED.
def check_web_page(page)
  uristate[page.uri] |= CHECKING
  listener.checking(page) do
    ping_query = ping_list.join(', ')
    all_ping!(ping_query, page)

    crawl_query = crawl_list.join(', ')
    crawl_all(crawl_query, page)
  end
  uristate[page.uri] |= CHECKED
end
|
131
|
+
|
132
|
+
###################################################################### Pinging
|
133
|
+
|
134
|
+
# Calls ping! on everything the query finds on the referer page
def all_ping!(query, referer_page)
  referer_page.search(query).each { |found| ping!(found, referer_page) }
end
|
139
|
+
|
140
|
+
# Pings (HEAD request) a location found on the referer page, unless its URI
# already has some state (PINGED, PENDING, CHECKING or CHECKED). External
# URIs are pinged only if check_externals is set, and are reported to the
# listener as skipped otherwise. Errors go through handle_error.
def ping!(loc, referer_page)
  target = resolve_uri(loc, referer_page)
  if uristate[target] < PINGED
    if internal_uri?(target) || check_externals
      agent.head(target)
      pinged!(target)
      listener.ping_ok(referer_page, loc)
    else
      listener.ping_skipped(referer_page, loc)
    end
  end
rescue => failure
  handle_error(failure, referer_page, loc)
end
|
158
|
+
|
159
|
+
###################################################################### Error handling
|
160
|
+
|
161
|
+
# Reports to the listener an error met on a location found on the referer
# page: response code errors as reach failures, unsupported schemes as
# scheme failures and socket errors as such. Any other error is raised
# again.
def handle_error(ex, referer_page, loc)
  if ex.is_a?(Mechanize::ResponseCodeError)
    listener.reach_failure(referer_page, loc, ex)
  elsif ex.is_a?(Mechanize::UnsupportedSchemeError)
    listener.scheme_failure(referer_page, loc, ex)
  elsif ex.is_a?(SocketError)
    listener.socket_error(referer_page, loc, ex)
  else
    raise ex
  end
end
|
174
|
+
|
175
|
+
end # class Crawler
|
176
|
+
end # module Waw
|