crawl 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/README.md ADDED
@@ -0,0 +1,15 @@
1
+ # Crawl
2
+
3
+ Crawl pages within a domain, reporting any page that returns a bad response code
4
+
5
+ Usage:
6
+
7
+ >crawl [options] domain
8
+
9
+ -s, --start /home,/about Starting path(s), defaults to /
10
+ -u, --username username Basic auth username
11
+ -p, --password password Basic auth password
12
+ -c, --ci Output files for CI integration
13
+ -v, --verbose Give details when crawling
14
+ -m, --markup Validate HTML markup
15
+ -h, --help Show this message
data/bin/crawl CHANGED
@@ -10,6 +10,7 @@ optparse = OptionParser.new do |opts|
10
10
  opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
11
11
  opts.on('-c', '--ci', 'Output files for CI integration') { |o| options[:ci] = o }
12
12
  opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
13
+ opts.on('-m', '--markup', 'Validate markup') { |o| options[:markup] = o }
13
14
  opts.on_tail("-h", "--help", "Show this message") { |o| puts opts; exit }
14
15
  end.parse!
15
16
 
data/lib/crawl/engine.rb CHANGED
@@ -33,6 +33,7 @@ class Crawl::Engine
33
33
  @verbose = options[:verbose] || ENV['VERBOSE']
34
34
  @number_of_dots = 0
35
35
  @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
36
+ @validate_markup = options[:markup]
36
37
  end
37
38
 
38
39
  def run
@@ -43,7 +44,7 @@ class Crawl::Engine
43
44
  next unless response.headers[:content_type] =~ %r{text/html}
44
45
  @visited_documents << link
45
46
  @found_links += links = find_links(link, response.to_str)
46
- # validate(link, response.body_str)
47
+ validate(link, response.body) if @validate_markup
47
48
  end
48
49
  end
49
50
  end
@@ -87,7 +88,6 @@ private
87
88
  error_messages = messages.select { |message| message['type'] != 'info' }
88
89
 
89
90
  if error_messages.empty?
90
- handle_success
91
91
  true
92
92
  else
93
93
  response = error_messages.map do |message|
@@ -97,7 +97,6 @@ private
97
97
  end.join("\n\n")
98
98
 
99
99
  @errors << Result.new(link, response)
100
- handle_error('I')
101
100
  false
102
101
  end
103
102
  rescue RestClient::ServiceUnavailable
@@ -132,7 +131,7 @@ private
132
131
  test_suite.finish
133
132
  @report_manager.write_report(test_suite) if options[:ci]
134
133
  return response
135
- rescue RestClient::InternalServerError, RestClient::ResourceNotFound => e
134
+ rescue RestClient::InternalServerError, RestClient::ResourceNotFound, RestClient::Unauthorized => e
136
135
  @errors << Result.new(link, "Error whilst retrieving page: #{e.message}")
137
136
  @invalid_links << link
138
137
  return nil
data/lib/crawl/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module Crawl
3
- VERSION = "0.0.4"
3
+ VERSION = "0.0.5"
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-10 00:00:00.000000000 Z
12
+ date: 2012-02-21 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &70243809291280 !ruby/object:Gem::Requirement
16
+ requirement: &70216317741600 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70243809291280
24
+ version_requirements: *70216317741600
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rest-client
27
- requirement: &70243809290740 !ruby/object:Gem::Requirement
27
+ requirement: &70216317740600 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70243809290740
35
+ version_requirements: *70216317740600
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ci_reporter
38
- requirement: &70243809290140 !ruby/object:Gem::Requirement
38
+ requirement: &70216317739980 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70243809290140
46
+ version_requirements: *70216317739980
47
47
  description: Crawl all pages on a domain, checking for errors
48
48
  email:
49
49
  - tor@alphasights.com
@@ -54,7 +54,7 @@ extra_rdoc_files: []
54
54
  files:
55
55
  - .gitignore
56
56
  - Gemfile
57
- - README
57
+ - README.md
58
58
  - Rakefile
59
59
  - bin/crawl
60
60
  - crawl.gemspec
data/README DELETED
@@ -1,8 +0,0 @@
1
- Crawl pages within a domain, reporting any page that returns a bad response code
2
- Usage: crawl [options] domain
3
- -s, --start /home,/about Starting path(s), defaults to /
4
- -u, --username username Basic auth username
5
- -p, --password password Basic auth password
6
- -c, --ci Output files for CI integration
7
- -v, --verbose Give details when crawling
8
- -h, --help Show this message