crawl 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +15 -0
- data/bin/crawl +1 -0
- data/lib/crawl/engine.rb +3 -4
- data/lib/crawl/version.rb +1 -1
- metadata +9 -9
- data/README +0 -8
data/README.md
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Crawl
|
2
|
+
|
3
|
+
Crawl pages within a domain, reporting any page that returns a bad response code
|
4
|
+
|
5
|
+
Usage:
|
6
|
+
|
7
|
+
>crawl [options] domain
|
8
|
+
|
9
|
+
-s, --start /home,/about Starting path(s), defaults to /
|
10
|
+
-u, --username username Basic auth username
|
11
|
+
-p, --password password Basic auth password
|
12
|
+
-c, --ci Output files for CI integration
|
13
|
+
-v, --verbose Give details when crawling
|
14
|
+
-m, --markup Validate HTML markup
|
15
|
+
-h, --help Show this message
|
data/bin/crawl
CHANGED
@@ -10,6 +10,7 @@ optparse = OptionParser.new do |opts|
|
|
10
10
|
opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
|
11
11
|
opts.on('-c', '--ci', 'Output files for CI integration') { |o| options[:ci] = o }
|
12
12
|
opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
|
13
|
+
opts.on('-m', '--markup', 'Validate markup') { |o| options[:markup] = o }
|
13
14
|
opts.on_tail("-h", "--help", "Show this message") { |o| puts opts; exit }
|
14
15
|
end.parse!
|
15
16
|
|
data/lib/crawl/engine.rb
CHANGED
@@ -33,6 +33,7 @@ class Crawl::Engine
|
|
33
33
|
@verbose = options[:verbose] || ENV['VERBOSE']
|
34
34
|
@number_of_dots = 0
|
35
35
|
@report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
|
36
|
+
@validate_markup = options[:markup]
|
36
37
|
end
|
37
38
|
|
38
39
|
def run
|
@@ -43,7 +44,7 @@ class Crawl::Engine
|
|
43
44
|
next unless response.headers[:content_type] =~ %r{text/html}
|
44
45
|
@visited_documents << link
|
45
46
|
@found_links += links = find_links(link, response.to_str)
|
46
|
-
|
47
|
+
validate(link, response.body) if @validate_markup
|
47
48
|
end
|
48
49
|
end
|
49
50
|
end
|
@@ -87,7 +88,6 @@ private
|
|
87
88
|
error_messages = messages.select { |message| message['type'] != 'info' }
|
88
89
|
|
89
90
|
if error_messages.empty?
|
90
|
-
handle_success
|
91
91
|
true
|
92
92
|
else
|
93
93
|
response = error_messages.map do |message|
|
@@ -97,7 +97,6 @@ private
|
|
97
97
|
end.join("\n\n")
|
98
98
|
|
99
99
|
@errors << Result.new(link, response)
|
100
|
-
handle_error('I')
|
101
100
|
false
|
102
101
|
end
|
103
102
|
rescue RestClient::ServiceUnavailable
|
@@ -132,7 +131,7 @@ private
|
|
132
131
|
test_suite.finish
|
133
132
|
@report_manager.write_report(test_suite) if options[:ci]
|
134
133
|
return response
|
135
|
-
rescue RestClient::InternalServerError, RestClient::ResourceNotFound => e
|
134
|
+
rescue RestClient::InternalServerError, RestClient::ResourceNotFound, RestClient::Unauthorized => e
|
136
135
|
@errors << Result.new(link, "Error whilst retrieving page: #{e.message}")
|
137
136
|
@invalid_links << link
|
138
137
|
return nil
|
data/lib/crawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-02-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &70216317741600 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70216317741600
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rest-client
|
27
|
-
requirement: &
|
27
|
+
requirement: &70216317740600 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70216317740600
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ci_reporter
|
38
|
-
requirement: &
|
38
|
+
requirement: &70216317739980 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70216317739980
|
47
47
|
description: Crawl all pages on a domain, checking for errors
|
48
48
|
email:
|
49
49
|
- tor@alphasights.com
|
@@ -54,7 +54,7 @@ extra_rdoc_files: []
|
|
54
54
|
files:
|
55
55
|
- .gitignore
|
56
56
|
- Gemfile
|
57
|
-
- README
|
57
|
+
- README.md
|
58
58
|
- Rakefile
|
59
59
|
- bin/crawl
|
60
60
|
- crawl.gemspec
|
data/README
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
Crawl pages within a domain, reporting any page that returns a bad response code
|
2
|
-
Usage: crawl [options] domain
|
3
|
-
-s, --start /home,/about Starting path(s), defaults to /
|
4
|
-
-u, --username username Basic auth username
|
5
|
-
-p, --password password Basic auth password
|
6
|
-
-c, --ci Output files for CI integration
|
7
|
-
-v, --verbose Give details when crawling
|
8
|
-
-h, --help Show this message
|