crawl 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +15 -0
- data/bin/crawl +1 -0
- data/lib/crawl/engine.rb +3 -4
- data/lib/crawl/version.rb +1 -1
- metadata +9 -9
- data/README +0 -8
data/README.md
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Crawl
|
2
|
+
|
3
|
+
Crawl pages within a domain, reporting any page that returns a bad response code
|
4
|
+
|
5
|
+
Usage:
|
6
|
+
|
7
|
+
>crawl [options] domain
|
8
|
+
|
9
|
+
-s, --start /home,/about Starting path(s), defaults to /
|
10
|
+
-u, --username username Basic auth username
|
11
|
+
-p, --password password Basic auth password
|
12
|
+
-c, --ci Output files for CI integration
|
13
|
+
-v, --verbose Give details when crawling
|
14
|
+
-m, --markup Validate HTML markup
|
15
|
+
-h, --help Show this message
|
data/bin/crawl
CHANGED
@@ -10,6 +10,7 @@ optparse = OptionParser.new do |opts|
|
|
10
10
|
opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
|
11
11
|
opts.on('-c', '--ci', 'Output files for CI integration') { |o| options[:ci] = o }
|
12
12
|
opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
|
13
|
+
opts.on('-m', '--markup', 'Validate markup') { |o| options[:markup] = o }
|
13
14
|
opts.on_tail("-h", "--help", "Show this message") { |o| puts opts; exit }
|
14
15
|
end.parse!
|
15
16
|
|
data/lib/crawl/engine.rb
CHANGED
@@ -33,6 +33,7 @@ class Crawl::Engine
|
|
33
33
|
@verbose = options[:verbose] || ENV['VERBOSE']
|
34
34
|
@number_of_dots = 0
|
35
35
|
@report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
|
36
|
+
@validate_markup = options[:markup]
|
36
37
|
end
|
37
38
|
|
38
39
|
def run
|
@@ -43,7 +44,7 @@ class Crawl::Engine
|
|
43
44
|
next unless response.headers[:content_type] =~ %r{text/html}
|
44
45
|
@visited_documents << link
|
45
46
|
@found_links += links = find_links(link, response.to_str)
|
46
|
-
|
47
|
+
validate(link, response.body) if @validate_markup
|
47
48
|
end
|
48
49
|
end
|
49
50
|
end
|
@@ -87,7 +88,6 @@ private
|
|
87
88
|
error_messages = messages.select { |message| message['type'] != 'info' }
|
88
89
|
|
89
90
|
if error_messages.empty?
|
90
|
-
handle_success
|
91
91
|
true
|
92
92
|
else
|
93
93
|
response = error_messages.map do |message|
|
@@ -97,7 +97,6 @@ private
|
|
97
97
|
end.join("\n\n")
|
98
98
|
|
99
99
|
@errors << Result.new(link, response)
|
100
|
-
handle_error('I')
|
101
100
|
false
|
102
101
|
end
|
103
102
|
rescue RestClient::ServiceUnavailable
|
@@ -132,7 +131,7 @@ private
|
|
132
131
|
test_suite.finish
|
133
132
|
@report_manager.write_report(test_suite) if options[:ci]
|
134
133
|
return response
|
135
|
-
rescue RestClient::InternalServerError, RestClient::ResourceNotFound => e
|
134
|
+
rescue RestClient::InternalServerError, RestClient::ResourceNotFound, RestClient::Unauthorized => e
|
136
135
|
@errors << Result.new(link, "Error whilst retrieving page: #{e.message}")
|
137
136
|
@invalid_links << link
|
138
137
|
return nil
|
data/lib/crawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-02-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &70216317741600 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70216317741600
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rest-client
|
27
|
-
requirement: &
|
27
|
+
requirement: &70216317740600 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70216317740600
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ci_reporter
|
38
|
-
requirement: &
|
38
|
+
requirement: &70216317739980 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70216317739980
|
47
47
|
description: Crawl all pages on a domain, checking for errors
|
48
48
|
email:
|
49
49
|
- tor@alphasights.com
|
@@ -54,7 +54,7 @@ extra_rdoc_files: []
|
|
54
54
|
files:
|
55
55
|
- .gitignore
|
56
56
|
- Gemfile
|
57
|
-
- README
|
57
|
+
- README.md
|
58
58
|
- Rakefile
|
59
59
|
- bin/crawl
|
60
60
|
- crawl.gemspec
|
data/README
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
Crawl pages within a domain, reporting any page that returns a bad response code
|
2
|
-
Usage: crawl [options] domain
|
3
|
-
-s, --start /home,/about Starting path(s), defaults to /
|
4
|
-
-u, --username username Basic auth username
|
5
|
-
-p, --password password Basic auth password
|
6
|
-
-c, --ci Output files for CI integration
|
7
|
-
-v, --verbose Give details when crawling
|
8
|
-
-h, --help Show this message
|