waw 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/waw-crawl +26 -0
- data/lib/waw.rb +3 -3
- data/lib/waw/commands/command.rb +12 -4
- data/lib/waw/commands/crawl_command.rb +46 -0
- data/lib/waw/commands/profile_command.rb +5 -1
- data/lib/waw/controllers/action/action.rb +11 -0
- data/lib/waw/controllers/action/js_generation.rb +11 -2
- data/lib/waw/controllers/static/match.rb +4 -1
- data/lib/waw/controllers/static/matcher.rb +35 -0
- data/lib/waw/controllers/static/waw_access.rb +28 -15
- data/lib/waw/controllers/static/waw_access_dsl.rb +9 -0
- data/lib/waw/controllers/static_controller.rb +5 -2
- data/lib/waw/crawler.rb +176 -0
- data/lib/waw/crawler/crawler_listener.rb +64 -0
- data/lib/waw/crawler/crawler_options.rb +42 -0
- data/lib/waw/kern/living_state.rb +93 -0
- data/lib/waw/scope_utils.rb +1 -0
- data/lib/waw/tools/mail/mail_agent.rb +1 -1
- data/lib/waw/validation.rb +3 -0
- data/lib/waw/validation/datetime_validator.rb +53 -0
- data/lib/waw/wspec/browser.rb +4 -2
- data/test/bricks/error_handler/test/test_all.rb +20 -0
- data/test/bricks/static_controller/config/test.cfg +2 -0
- data/test/bricks/static_controller/logs/webapp.log +84 -0
- data/test/bricks/static_controller/test/static_controller.wspec +12 -0
- data/test/bricks/static_controller/test/test_all.rb +20 -0
- data/test/bricks/static_controller/waw.deploy +1 -0
- data/test/bricks/static_controller/waw.routing +5 -0
- data/test/bricks/test_all.rb +8 -0
- data/test/spec/controllers/static/waw_access_spec.rb +3 -3
- data/test/spec/test_all.rb +0 -2
- data/test/spec/validation/datetime_validation_spec.rb +92 -0
- data/test/unit/test_all.rb +1 -0
- data/test/unit/waw/controllers/static/logs/webapp.log +22 -0
- metadata +23 -6
data/bin/waw-crawl
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Waw - making web development simple
|
4
|
+
# (see lib/waw/waw.rb for more information)
|
5
|
+
#
|
6
|
+
# Copyright (c) 2010 University of Louvain, Bernard & Louis Lambeau
|
7
|
+
# Released under a MIT or Ruby licence
|
8
|
+
#
|
9
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
10
|
+
require 'waw'
|
11
|
+
require 'waw/commands/command'
|
12
|
+
require 'waw/commands/crawl_command'
|
13
|
+
|
14
|
+
begin
|
15
|
+
r = Waw::Commands::CrawlCommand.new.run '.', ARGV
|
16
|
+
rescue Interrupt => e
|
17
|
+
$stderr.puts
|
18
|
+
$stderr.puts "Interrupted"
|
19
|
+
raise e
|
20
|
+
rescue OptionParser::ParseError => e
|
21
|
+
$stderr.puts e.message
|
22
|
+
raise e
|
23
|
+
rescue => e
|
24
|
+
$stderr.puts e.message
|
25
|
+
raise e
|
26
|
+
end
|
data/lib/waw.rb
CHANGED
@@ -9,13 +9,13 @@ module Waw
|
|
9
9
|
|
10
10
|
# Requirements on gems for this version
|
11
11
|
GEM_REQUIREMENTS = {
|
12
|
-
:rack => '>= 1.1
|
12
|
+
:rack => '>= 1.2.1',
|
13
13
|
:wlang => '>= 0.9.0',
|
14
14
|
:json => '>= 1.1.9'
|
15
15
|
}
|
16
16
|
|
17
17
|
# Waw version
|
18
|
-
VERSION = "0.
|
18
|
+
VERSION = "0.3.0".freeze
|
19
19
|
|
20
20
|
# Waw loading mutex
|
21
21
|
WAW_KERNELS_LOCK = Mutex.new
|
@@ -96,4 +96,4 @@ require 'waw/controllers/action_controller'
|
|
96
96
|
require 'waw/controllers/static_controller'
|
97
97
|
require 'waw/controllers/no_cache'
|
98
98
|
require 'waw/controllers/error_handler'
|
99
|
-
require 'waw/routing'
|
99
|
+
require 'waw/routing'
|
data/lib/waw/commands/command.rb
CHANGED
@@ -5,14 +5,14 @@ module Waw
|
|
5
5
|
class Command
|
6
6
|
|
7
7
|
# The verbose level
|
8
|
-
attr_accessor :
|
8
|
+
attr_accessor :verbosity
|
9
9
|
|
10
10
|
# Show stack traces?
|
11
11
|
attr_accessor :trace
|
12
12
|
|
13
13
|
# Creates an empty command instance
|
14
14
|
def initialize
|
15
|
-
@
|
15
|
+
@verbosity = 1
|
16
16
|
@buffer = STDOUT
|
17
17
|
end
|
18
18
|
|
@@ -35,8 +35,12 @@ module Waw
|
|
35
35
|
@trace = true
|
36
36
|
end
|
37
37
|
|
38
|
-
opt.on("--verbose", "
|
39
|
-
@
|
38
|
+
opt.on("--verbose", "Display extra info as we progress") do |value|
|
39
|
+
@verbosity = 2
|
40
|
+
end
|
41
|
+
|
42
|
+
opt.on("--silent", "Be quiet silent") do |value|
|
43
|
+
@verbosity = 0
|
40
44
|
end
|
41
45
|
|
42
46
|
# No argument, shows at tail. This will print an options summary.
|
@@ -55,6 +59,10 @@ module Waw
|
|
55
59
|
opt.separator nil
|
56
60
|
end
|
57
61
|
end
|
62
|
+
|
63
|
+
# Tells whether extra progress information should be displayed.
#
# Verbosity levels assigned in this class are 0 (--silent), 1 (default)
# and 2 (--verbose). The previous comparison (verbosity < 0) could never
# be true for any of them, which silently disabled every "... if verbose"
# guard.
#
# NOTE(review): the threshold is chosen so that only --verbose turns this
# on, matching the flag name -- confirm that the default level is not
# expected to be verbose as well.
#
# Returns true when verbosity is above the default level, false otherwise.
def verbose
  verbosity > 1
end
|
58
66
|
|
59
67
|
# Runs the command
|
60
68
|
def run(requester_file, argv)
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'waw/crawler'
|
2
|
+
module Waw
|
3
|
+
module Commands
|
4
|
+
class CrawlCommand < Command
|
5
|
+
|
6
|
+
def banner
|
7
|
+
<<-EOF
|
8
|
+
Usage: waw-crawl [options] WEB_SITE_URI
|
9
|
+
EOF
|
10
|
+
end
|
11
|
+
|
12
|
+
# Crawl command is always safe
|
13
|
+
def check_command_policy
|
14
|
+
true
|
15
|
+
end
|
16
|
+
|
17
|
+
# Adds the options
|
18
|
+
def add_options(options)
|
19
|
+
@crawler = Waw::Crawler.new(nil)
|
20
|
+
options.on("--[no-]check-externals", "[Don't] ping any external link") do |value|
|
21
|
+
@crawler.check_externals = value
|
22
|
+
end
|
23
|
+
options.on("--[no-]check-img", "[Don't] check image <img src='...'>") do |value|
|
24
|
+
@crawler.ping_on('img/@src', value)
|
25
|
+
end
|
26
|
+
options.on("--[no-]check-link", "[Don't] check <link href='...'>") do |value|
|
27
|
+
@crawler.ping_on('link/@href', value)
|
28
|
+
end
|
29
|
+
options.on("--[no-]check-script", "[Don't] check <script src='...'>") do |value|
|
30
|
+
@crawler.ping_on('script/@src', value)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
# Runs the sub-class defined command
|
35
|
+
def __run(requester_file, arguments)
|
36
|
+
exit(nil, true) unless (arguments.size == 1)
|
37
|
+
@crawler.root_uri = arguments[0]
|
38
|
+
@crawler.listener.verbosity = @verbosity
|
39
|
+
@crawler.crawl
|
40
|
+
rescue Interrupt => ex
|
41
|
+
info "waw-crawl stopping now... ciao!" if verbose
|
42
|
+
end
|
43
|
+
|
44
|
+
end # class CrawlCommand
|
45
|
+
end # module Commands
|
46
|
+
end # module Waw
|
@@ -31,7 +31,11 @@ module Waw
|
|
31
31
|
else
|
32
32
|
puts "Visiting #{location}" if verbose
|
33
33
|
browser.all_internal_links.each do |link|
|
34
|
-
|
34
|
+
begin
|
35
|
+
visit(browser, link[:href], visited)
|
36
|
+
rescue URI::InvalidURIError => ex
|
37
|
+
puts "Hohoho, I've found something really wrong #{link[:href]}"
|
38
|
+
end
|
35
39
|
end
|
36
40
|
end
|
37
41
|
end
|
@@ -43,6 +43,17 @@ module Waw
|
|
43
43
|
"javascript:#{id}({#{buffer[2..-1]}}, '##{id}')"
|
44
44
|
end
|
45
45
|
|
46
|
+
# Factors the ajax code for the action itself
|
47
|
+
def ajax_action_code
|
48
|
+
js = Waw::ActionController::JSGeneration.new
|
49
|
+
code = js.generate_js_for_action(Waw::kernel, self, "")
|
50
|
+
<<-EOF
|
51
|
+
<script type="text/javascript">
|
52
|
+
#{code}
|
53
|
+
</script>
|
54
|
+
EOF
|
55
|
+
end
|
56
|
+
|
46
57
|
# Factors the ajax code for preparing a formulary
|
47
58
|
def ajax_form_preparer(opts = {})
|
48
59
|
form_id = opts[:form_id] || id
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Waw
|
2
2
|
class ActionController < ::Waw::Controller
|
3
|
-
class JSGeneration
|
3
|
+
class JSGeneration < ::Waw::Controller
|
4
4
|
|
5
5
|
# Header of the generated javascript file
|
6
6
|
HEADER = <<-EOF
|
@@ -18,6 +18,15 @@ module Waw
|
|
18
18
|
raise ConfigurationError, msg
|
19
19
|
end
|
20
20
|
|
21
|
+
# Acts as a waw controller
|
22
|
+
def call(env)
|
23
|
+
buffer = []
|
24
|
+
generate_js(kernel, buffer)
|
25
|
+
[ 200,
|
26
|
+
{'Content-Type' => 'application/javascript'},
|
27
|
+
buffer ]
|
28
|
+
end
|
29
|
+
|
21
30
|
# Start hook start callback required by Waw. Generates the javascript code
|
22
31
|
# if the configuration variable 'code_at_startup' is true.
|
23
32
|
def run(waw_kernel)
|
@@ -97,7 +106,7 @@ module Waw
|
|
97
106
|
function #{action.id}(request_data, form) {
|
98
107
|
$.ajax({type: "POST", url: "#{action.url}", data: request_data, dataType: "json",
|
99
108
|
error: function(data) {
|
100
|
-
window.location = '/
|
109
|
+
window.location = '/500';
|
101
110
|
},
|
102
111
|
success: function(data) {
|
103
112
|
THEEND
|
@@ -4,6 +4,8 @@ module Waw
|
|
4
4
|
class Match
|
5
5
|
include Waw::ScopeUtils
|
6
6
|
|
7
|
+
attr_reader :wawaccess
|
8
|
+
|
7
9
|
# Served file
|
8
10
|
attr_reader :served_file
|
9
11
|
|
@@ -15,13 +17,14 @@ module Waw
|
|
15
17
|
end
|
16
18
|
|
17
19
|
# Executes on a wawaccess instance
|
18
|
-
def __execute
|
20
|
+
def __execute(env)
|
19
21
|
instance_exec *@args, &@block
|
20
22
|
end
|
21
23
|
|
22
24
|
# Delegated to the wawaccess that created me
|
23
25
|
def root; @wawaccess.root; end
|
24
26
|
def folder; @wawaccess.folder; end
|
27
|
+
def req_path; @wawaccess.req_path; end
|
25
28
|
|
26
29
|
################################################### Callbacks proposed to .wawaccess rules
|
27
30
|
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Waw
|
2
|
+
class StaticController < ::Waw::Controller
|
3
|
+
class Matcher
|
4
|
+
include Waw::ScopeUtils
|
5
|
+
|
6
|
+
# Waw access on which this matcher is defined
|
7
|
+
attr_reader :wawaccess
|
8
|
+
|
9
|
+
# Matcher's predicate
|
10
|
+
attr_reader :predicate
|
11
|
+
|
12
|
+
# Creates a matcher instance
|
13
|
+
def initialize(wawaccess, predicate)
|
14
|
+
@wawaccess = wawaccess
|
15
|
+
@predicate = predicate
|
16
|
+
end
|
17
|
+
|
18
|
+
# Returns wawaccess's folder
|
19
|
+
def folder
|
20
|
+
wawaccess.folder
|
21
|
+
end
|
22
|
+
|
23
|
+
# Returns requested path
|
24
|
+
def req_path
|
25
|
+
wawaccess.req_path
|
26
|
+
end
|
27
|
+
|
28
|
+
# Does the matcher match a given path?
|
29
|
+
def matches?(env)
|
30
|
+
instance_eval &predicate
|
31
|
+
end
|
32
|
+
|
33
|
+
end # class Matcher
|
34
|
+
end # class StaticController
|
35
|
+
end # module Waw
|
@@ -1,9 +1,8 @@
|
|
1
|
-
require 'uri'
|
2
|
-
require 'waw/controllers/static/match'
|
3
1
|
module Waw
|
4
2
|
class StaticController < ::Waw::Controller
|
5
3
|
# Waw version of .htaccess files
|
6
4
|
class WawAccess
|
5
|
+
include Waw::ScopeUtils
|
7
6
|
|
8
7
|
# The folder which is served
|
9
8
|
attr_accessor :folder
|
@@ -66,7 +65,7 @@ module Waw
|
|
66
65
|
|
67
66
|
def recognized_pattern?(pattern)
|
68
67
|
[FalseClass, TrueClass, String,
|
69
|
-
Regexp, Waw::Validation::Validator].any?{|c| c===pattern}
|
68
|
+
Regexp, Waw::Validation::Validator, StaticController::Matcher].any?{|c| c===pattern}
|
70
69
|
end
|
71
70
|
|
72
71
|
# Adds a child in the hierarchy
|
@@ -106,6 +105,10 @@ module Waw
|
|
106
105
|
|
107
106
|
################################################### Utilities about paths
|
108
107
|
|
108
|
+
def req_path
|
109
|
+
rack_env['REQ_PATH'] || normalize_req_path(rack_env['PATH_INFO'])
|
110
|
+
end
|
111
|
+
|
109
112
|
# Returns the real path of a file
|
110
113
|
def realpath(file)
|
111
114
|
File.expand_path(File.join(folder, file))
|
@@ -164,7 +167,8 @@ module Waw
|
|
164
167
|
################################################### .waw access rules application!
|
165
168
|
|
166
169
|
# Finds the matching block inside this .wawaccess handler
|
167
|
-
def find_match(
|
170
|
+
def find_match(env)
|
171
|
+
path = env['REQ_PATH']
|
168
172
|
@serve.each do |pattern, block|
|
169
173
|
case pattern
|
170
174
|
when FalseClass
|
@@ -186,6 +190,10 @@ module Waw
|
|
186
190
|
if pattern.validate(matching_file(path))
|
187
191
|
return Match.new(self, path, block)
|
188
192
|
end
|
193
|
+
when StaticController::Matcher
|
194
|
+
if pattern.matches?(env)
|
195
|
+
return Match.new(self, path, block)
|
196
|
+
end
|
189
197
|
else
|
190
198
|
raise WawError, "Unrecognized wawaccess pattern #{pattern}"
|
191
199
|
end
|
@@ -194,16 +202,16 @@ module Waw
|
|
194
202
|
end
|
195
203
|
|
196
204
|
# Applies the rules defined here or delegate to the parent if allowed
|
197
|
-
def apply_rules(
|
198
|
-
if match = find_match(
|
199
|
-
match.__execute
|
205
|
+
# Applies the rules of this .wawaccess to a Rack environment.
#
# env - the Rack environment hash of the current request.
#
# Resolution order:
#   1. a rule found by find_match(env) is executed and its result returned;
#   2. otherwise the request is delegated to the parent when one exists and
#      inheritance is enabled;
#   3. otherwise a plain-text 404 Rack triplet is returned, carrying an
#      "X-Cascade: pass" header.
#
# Returns whatever the matched rule or the parent returns, or the 404 triplet.
def apply_rules(env)
  if match = find_match(env)
    match.__execute(env)
  elsif (parent and inherits)
    parent.apply_rules(env)
  else
    body = "File not found: #{env['PATH_INFO']}\n"
    # Content-Length is expressed in bytes, not characters: PATH_INFO may
    # contain multi-byte characters, so String#size would under-report it.
    [404, {"Content-Type"   => "text/plain",
           "Content-Length" => body.bytesize.to_s,
           "X-Cascade"      => "pass"},
     [body]]
  end
end
|
@@ -224,10 +232,15 @@ module Waw
|
|
224
232
|
end
|
225
233
|
|
226
234
|
# Serves a path from a root waw access in the hierarchy
|
227
|
-
def do_path_serve(path)
|
228
|
-
|
229
|
-
waw_access = (find_wawaccess_for(
|
230
|
-
waw_access.apply_rules(
|
235
|
+
def do_path_serve(path, env = rack_env)
|
236
|
+
env['REQ_PATH'] = normalize_req_path(path)
|
237
|
+
waw_access = (find_wawaccess_for(env['REQ_PATH']) || self)
|
238
|
+
waw_access.apply_rules(env)
|
239
|
+
end
|
240
|
+
|
241
|
+
# Makes a Rack standard call
|
242
|
+
def call(env)
|
243
|
+
do_path_serve(env['PATH_INFO'], env)
|
231
244
|
end
|
232
245
|
|
233
246
|
end # class WawAccess
|
@@ -9,6 +9,7 @@ module Waw
|
|
9
9
|
def initialize(wawaccess)
|
10
10
|
raise ArgumentError, "wawaccess cannot be nil" unless WawAccess===wawaccess
|
11
11
|
@wawaccess = wawaccess
|
12
|
+
@matchers = {}
|
12
13
|
end
|
13
14
|
|
14
15
|
# Returns a validator that matches the root of the wawaccess tree
|
@@ -16,6 +17,14 @@ module Waw
|
|
16
17
|
Waw::Validation.validator{|served_file| File.expand_path(served_file) == File.expand_path(@wawaccess.root.folder)}
|
17
18
|
end
|
18
19
|
|
20
|
+
# Installs a matcher
|
21
|
+
def matcher(name, &predicate)
|
22
|
+
@matchers[name] = Matcher.new(@wawaccess, predicate)
|
23
|
+
(class << self; self; end).send(:define_method,name) do
|
24
|
+
@matchers[name]
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
19
28
|
# Starts a wawaccess file
|
20
29
|
def wawaccess(&block)
|
21
30
|
raise WawError, "#{@wawaccess.identifier}: missing block in wawaccess call" unless block
|
@@ -1,5 +1,8 @@
|
|
1
|
+
require 'uri'
|
1
2
|
require 'waw/controllers/static/waw_access'
|
2
3
|
require 'waw/controllers/static/waw_access_dsl'
|
4
|
+
require 'waw/controllers/static/matcher'
|
5
|
+
require 'waw/controllers/static/match'
|
3
6
|
module Waw
|
4
7
|
#
|
5
8
|
# A waw service that serves public pages expressed in wlang wtpl format
|
@@ -29,8 +32,8 @@ module Waw
|
|
29
32
|
##############################################################################################
|
30
33
|
|
31
34
|
# Executes the service
|
32
|
-
def
|
33
|
-
@wawaccess.
|
35
|
+
def call(env)
|
36
|
+
@wawaccess.call(env)
|
34
37
|
end
|
35
38
|
|
36
39
|
end # class Controller
|
data/lib/waw/crawler.rb
ADDED
@@ -0,0 +1,176 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'waw/crawler/crawler_options'
|
3
|
+
require 'waw/crawler/crawler_listener'
|
4
|
+
module Waw
|
5
|
+
class Crawler
|
6
|
+
include Crawler::Options
|
7
|
+
|
8
|
+
###################################################################### Internal state
|
9
|
+
|
10
|
+
# Mechanize agent instance
|
11
|
+
attr_reader :agent
|
12
|
+
|
13
|
+
# Root URI to crawl
|
14
|
+
attr_reader :root_uri
|
15
|
+
|
16
|
+
# Sets the root uri
|
17
|
+
# Sets the root URI of the web site to crawl.
#
# uri - nil, a URI instance, or any object whose to_s can be parsed by
#       URI::parse.
#
# A nil argument falls back to http://127.0.0.1:9292/. The value is always
# stored as a URI instance (never a bare String) because internal_uri?
# reads root_uri.host and root_uri.port.
#
# Raises URI::InvalidURIError when the given value cannot be parsed.
def root_uri=(uri)
  uri = "http://127.0.0.1:9292/" if uri.nil?
  @root_uri = uri.is_a?(URI) ? uri : URI::parse(uri.to_s)
end
|
24
|
+
|
25
|
+
# Stack of files/pages to visit
|
26
|
+
attr_reader :stack
|
27
|
+
|
28
|
+
###################################################################### About URI visit state
|
29
|
+
|
30
|
+
# URI statuses
|
31
|
+
attr_reader :uristate
|
32
|
+
|
33
|
+
#
|
34
|
+
PINGED = 1
|
35
|
+
PENDING = 2
|
36
|
+
CHECKING = 4
|
37
|
+
CHECKED = 8
|
38
|
+
|
39
|
+
# Marks an URI as currently pending
|
40
|
+
def pending!(uri)
|
41
|
+
uristate[uri] |= PENDING
|
42
|
+
end
|
43
|
+
|
44
|
+
# Marks an URI as being pinged
|
45
|
+
def pinged!(uri)
|
46
|
+
uristate[uri] |= PINGED
|
47
|
+
end
|
48
|
+
|
49
|
+
###################################################################### Initialization
|
50
|
+
|
51
|
+
# Creates a crawler instance on a root URI
|
52
|
+
def initialize(root_uri = nil)
|
53
|
+
self.root_uri = root_uri
|
54
|
+
set_default_options
|
55
|
+
end
|
56
|
+
|
57
|
+
###################################################################### Utils
|
58
|
+
|
59
|
+
# Returns true if a given page is internal to the website currently
|
60
|
+
# crawled
|
61
|
+
def internal_uri?(uri)
|
62
|
+
uri.host.nil? or ((uri.host == root_uri.host) and (uri.port == root_uri.port))
|
63
|
+
end
|
64
|
+
|
65
|
+
# Resolves as an absolute URI something that has been found on
|
66
|
+
# a page
|
67
|
+
def resolve_uri(href_or_src, page)
|
68
|
+
URI::parse(agent.send(:resolve, href_or_src, page))
|
69
|
+
end
|
70
|
+
|
71
|
+
###################################################################### Crawling
|
72
|
+
|
73
|
+
# Starts the crawling
|
74
|
+
def crawl
|
75
|
+
@agent = Mechanize.new
|
76
|
+
@uristate = Hash.new{|h,k| h[k] = 0}
|
77
|
+
@stack = [ agent.get(root_uri) ]
|
78
|
+
until stack.empty?
|
79
|
+
to_check = stack.shift
|
80
|
+
case to_check
|
81
|
+
when ::Mechanize::Page
|
82
|
+
check_web_page(to_check)
|
83
|
+
else
|
84
|
+
listener.doc_skipped(to_check)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
@agent = nil
|
88
|
+
@uristate = nil
|
89
|
+
@stack = nil
|
90
|
+
end
|
91
|
+
|
92
|
+
def crawl_all(query, referer_page)
|
93
|
+
referer_page.search(query).each do |loc|
|
94
|
+
crawl_one(loc, referer_page)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
def crawl_one(location, referer_page)
|
99
|
+
uri = resolve_uri(location, referer_page)
|
100
|
+
|
101
|
+
# Bypass PENDING/CHECKING/CHECKED links
|
102
|
+
if uristate[uri] < PENDING
|
103
|
+
|
104
|
+
# Mark it as PENDING now
|
105
|
+
pending!(uri)
|
106
|
+
|
107
|
+
# Mark as to crawl by pushing on the stack
|
108
|
+
if internal_uri?(uri)
|
109
|
+
stack.push(agent.get(uri))
|
110
|
+
else
|
111
|
+
listener.crawl_skipped(referer_page, location)
|
112
|
+
end
|
113
|
+
|
114
|
+
end
|
115
|
+
rescue => ex
|
116
|
+
handle_error(ex, referer_page, location)
|
117
|
+
end
|
118
|
+
|
119
|
+
###################################################################### Checking
|
120
|
+
|
121
|
+
def check_web_page(page)
|
122
|
+
uristate[page.uri] |= CHECKING
|
123
|
+
listener.checking(page){
|
124
|
+
# Make ping checks
|
125
|
+
all_ping!(ping_list.join(', '), page)
|
126
|
+
# Crawl all links now
|
127
|
+
crawl_all(crawl_list.join(', '), page)
|
128
|
+
}
|
129
|
+
uristate[page.uri] |= CHECKED
|
130
|
+
end
|
131
|
+
|
132
|
+
###################################################################### Pinging
|
133
|
+
|
134
|
+
def all_ping!(query, referer_page)
|
135
|
+
referer_page.search(query).each do |loc|
|
136
|
+
ping!(loc, referer_page)
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
def ping!(loc, referer_page)
|
141
|
+
uri = resolve_uri(loc, referer_page)
|
142
|
+
|
143
|
+
# Only ping uri that are not PINGED/PENDING/CHECKING/CHECKED
|
144
|
+
return unless uristate[uri] < PINGED
|
145
|
+
|
146
|
+
# bypass externals if required
|
147
|
+
if internal_uri?(uri) || check_externals
|
148
|
+
agent.head(uri) # ping!
|
149
|
+
pinged!(uri)
|
150
|
+
listener.ping_ok(referer_page, loc)
|
151
|
+
else
|
152
|
+
listener.ping_skipped(referer_page, loc)
|
153
|
+
end
|
154
|
+
|
155
|
+
rescue => ex
|
156
|
+
handle_error(ex, referer_page, loc)
|
157
|
+
end
|
158
|
+
|
159
|
+
###################################################################### Error handling
|
160
|
+
|
161
|
+
# Handles errors that occur
|
162
|
+
def handle_error(ex, referer_page, loc)
|
163
|
+
case ex
|
164
|
+
when Mechanize::ResponseCodeError
|
165
|
+
listener.reach_failure(referer_page, loc, ex)
|
166
|
+
when Mechanize::UnsupportedSchemeError
|
167
|
+
listener.scheme_failure(referer_page, loc, ex)
|
168
|
+
when SocketError
|
169
|
+
listener.socket_error(referer_page, loc, ex)
|
170
|
+
else
|
171
|
+
raise ex
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
175
|
+
end # class Crawler
|
176
|
+
end # module Waw
|