crawl 0.0.1
- data/.gitignore +17 -0
- data/Gemfile +3 -0
- data/Rakefile +2 -0
- data/bin/crawl +37 -0
- data/crawl.gemspec +20 -0
- data/lib/crawl/engine.rb +167 -0
- data/lib/crawl/failure.rb +30 -0
- data/lib/crawl/string.rb +8 -0
- data/lib/crawl/version.rb +4 -0
- data/lib/crawl.rb +17 -0
- metadata +90 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
data/bin/crawl
ADDED
@@ -0,0 +1,37 @@
#!/usr/bin/env ruby
require 'optparse'
require_relative '../lib/crawl.rb'

options = {}
args = OptionParser.new do |opts|
  opts.banner = "Exhaustively search pages within a domain, reporting any page that returns a bad response code\nUsage: crawl [options] domain"
  opts.on('-s', '--start /home,/about', Array, 'Starting path(s), defaults to /') { |o| options[:start] = o }
  opts.on('-u', '--username username', String, 'Basic auth username') { |o| options[:username] = o }
  opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
  opts.on('-c', '--ci', 'Output files for CI integration') { |o| options[:ci] = o }
  opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
  opts.on_tail('-h', '--help', 'Show this message') { |o| puts opts; exit }
end.parse!

# parse! returns the non-option arguments; the first one is the domain
options.merge!(domain: args.first)

unless options[:domain]
  puts 'Must provide a domain'
  exit(-1)
end

crawler = Crawl::Engine.new(options)

trap("SIGINT") do
  puts "\n\nAborting crawl.."
  crawler.summarize
  exit(-1)
end

crawler.run
crawler.summarize

unless crawler.errors.empty?
  puts 'Errors during crawling'
  exit(-1)
end
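For reference, a sketch of a typical invocation and the options hash it produces (the host and credentials are hypothetical): running crawl -s /home,/about -u admin -p secret http://example.com yields, after parsing:

  options = {
    start:    ['/home', '/about'],  # -s; OptionParser's Array coercion splits on commas
    username: 'admin',              # -u
    password: 'secret',             # -p
    domain:   'http://example.com'  # first non-option argument
  }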
data/crawl.gemspec
ADDED
@@ -0,0 +1,20 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/crawl/version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors       = ["Tor Erik Linnerud"]
  gem.email         = ["tor@alphasights.com"]
  gem.description   = "Crawl all pages on a domain, checking for errors"
  gem.summary       = "Exhaustively search pages within a domain, reporting any page that returns a bad response code"
  gem.homepage      = "http://github.com/alphasights/crawl"

  gem.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  gem.files         = `git ls-files`.split("\n")
  gem.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  gem.name          = "crawl"
  gem.require_paths = ["lib"]
  gem.version       = Crawl::VERSION
  gem.add_dependency('nokogiri')
  gem.add_dependency('rest-client')
  gem.add_dependency('ci_reporter')
end
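A quick way to sanity-check the spec (a sketch; it assumes you run from a git checkout, since the file lists above are built with git ls-files):

  spec = Gem::Specification.load('crawl.gemspec')
  spec.name          # => "crawl"
  spec.version.to_s  # => "0.0.1"
  spec.executables   # => ["crawl"]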
data/lib/crawl/engine.rb
ADDED
@@ -0,0 +1,167 @@
# encoding: utf-8
class Crawl::Engine
  DEFAULT_OPTIONS = {:domain => '',
                     :start => ['/'],
                     :username => '',
                     :password => '',
                     :verbose => false,
                     :session_id => false}

  IGNORE = [/#/, /mailto:/, /skype:/, /logout/, /javascript:/, %r(/xhr/), /https:/, /\.pdf$/, /^$/]
  VALID_RESPONSE_CODES = [200, 302]
  MAX_REDIRECTS = 3
  LINE_WIDTH = 78

  Result = Struct.new(:url, :object)

  attr_reader :options, :errors

  def initialize(caller_options = {})
    @options = DEFAULT_OPTIONS.merge(caller_options)
    # strict_encode64 avoids the trailing newline that encode64 appends,
    # which would corrupt the Authorization header
    @authorization = Base64.strict_encode64("#{options[:username]}:#{options[:password]}")

    @found_links = options[:start].to_set
    @link_sources = {}
    @found_links.each {|target| @link_sources[target] = 'Initial'}
    @visited_links = Set[]
    @visited_documents = Set[]
    @invalid_links = Set[]
    @broken_pages = []
    @errors = []
    @verbose = options[:verbose] || ENV['VERBOSE']
    @number_of_dots = 0
    @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
  end

  def run
    until (links = @found_links - (@visited_links + @invalid_links)).empty? do
      links.each do |link|
        puts "\nChecking #{link}" if @verbose
        next unless response = retrieve(link)
        next unless response.headers[:content_type] =~ %r{text/html}
        @visited_documents << link
        @found_links += find_links(link, response.to_str)
        # validate(link, response.body_str)
      end
    end
  end

  def summarize
    if @errors.size > 0
      @errors.each do |error|
        puts "\n#{error.url}"
        puts "  Linked from #{linked_from(error.url)}"
        puts error.object.to_s.word_wrap.split("\n").map{|line| '  ' + line}
      end

      print(<<-SUM)

Pages crawled:     #{@visited_documents.size}
Pages with errors: #{@errors.size - @invalid_links.size}
Broken pages:      #{@broken_pages.size}
Invalid links:     #{@invalid_links.size}

I=Invalid P=Parse Error S=Status code bad

      SUM
      exit(@errors.size)
    else
      puts "\n\n#{@visited_documents.size} pages crawled"
    end

    puts
  end

  private

  def validate(link, body)
    puts "  Validating..." if @verbose

    json_response = RestClient.post 'http://validator.nu?out=json', body, :content_type => 'text/html; charset=utf-8'
    messages = JSON.parse(json_response.body)['messages']
    error_messages = messages.select { |message| message['type'] != 'info' }

    if error_messages.empty?
      handle_success
      true
    else
      response = error_messages.map do |message|
        type, message = message['type'], message['message']
        type_color = type == 'error' ? 31 : 33
        "\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
      end.join("\n\n")

      @errors << Result.new(link, response)
      handle_error('I')
      false
    end
  rescue RestClient::ServiceUnavailable
    handle_error('U')
    false
  end

  def retrieve(link)
    test_suite = CI::Reporter::TestSuite.new(link)
    test_case = CI::Reporter::TestCase.new(link)
    test_suite.start
    test_case.start
    puts "  Fetching.." if @verbose

    headers = {}
    # merge! (not merge) so the header is actually added; a bare merge
    # returns a new hash and silently drops the credentials
    headers.merge!(Authorization: "Basic #{@authorization}") if options[:username]
    response = RestClient.get(options[:domain] + link, headers)
    test_suite.name = link
    test_case.name = link
    test_case.finish
    @visited_links << link
    unless VALID_RESPONSE_CODES.include?(response.code)
      @errors << Result.new(link, "Status code was #{response.code}")
      @broken_pages << link
      test_case.failures << Crawl::Failure.new(link, response.code, linked_from(link))
      test_suite.testcases << test_case
      test_suite.finish
      @report_manager.write_report(test_suite) if options[:ci]
      return nil
    end
    test_suite.testcases << test_case
    test_suite.finish
    @report_manager.write_report(test_suite) if options[:ci]
    return response
  rescue RestClient::InternalServerError => e
    @errors << Result.new(link, "Error whilst retrieving page: #{e.message}")
    @invalid_links << link
    return nil
  end

  def linked_from(target)
    @link_sources[target]
  end

  def find_links(source_link, body)
    puts "  Finding links.." if @verbose
    doc = Nokogiri::HTML(body)
    anchors = doc.css('a').to_a
    anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
    anchors.reject!{|anchor| anchor['data-method'] =~ /put|post|delete/ }
    anchors.reject!{|anchor| anchor['class'].to_s =~ /unobtrusive_/}
    raw_links = anchors.map{|anchor| anchor['href']}
    raw_links.compact!
    raw_links.map!{|link| link.sub(options[:domain], '')}
    raw_links.delete_if{|link| link =~ %r{^http://}}
    raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
    raw_links.each do |target_link|
      unless @found_links.include?(target_link)
        puts "  Adding #{target_link} found on #{source_link}" if @verbose
        @link_sources[target_link] = source_link
      end
    end

    raw_links
  end
end
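To illustrate the filtering in find_links, a minimal sketch (the HTML fragment and domain are hypothetical; find_links is private, hence send):

  html = <<-HTML
    <a href="/about">About</a>
    <a href="/report.pdf">Report</a>
    <a href="mailto:hi@example.com">Mail</a>
    <a href="/destroy" data-method="delete">Remove</a>
  HTML

  engine = Crawl::Engine.new(domain: 'http://example.com')
  engine.send(:find_links, '/', html)
  # => ["/about"] -- the .pdf and mailto: hrefs match IGNORE patterns,
  #    and the data-method="delete" anchor is rejected as a non-GET link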
data/lib/crawl/failure.rb
ADDED
@@ -0,0 +1,30 @@
# encoding: utf-8
class Crawl::Failure
  attr_reader :link, :code, :from

  def initialize(link, code, from)
    @link = link
    @code = code
    @from = from
  end

  def failure?
    true
  end

  def error?
    !failure?
  end

  def name
    link
  end

  def message
    "Status code was #{code}"
  end

  def location
    "Linked from #{from}"
  end
end
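A quick sketch of how one of these failure objects reads back in a CI report (values hypothetical):

  failure = Crawl::Failure.new('/pricing', 500, '/')
  failure.name      # => "/pricing"
  failure.message   # => "Status code was 500"
  failure.location  # => "Linked from /"
  failure.failure?  # => true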
data/lib/crawl/string.rb
ADDED
data/lib/crawl.rb
ADDED
@@ -0,0 +1,17 @@
# encoding: utf-8
require 'nokogiri'
require 'rest_client'
require 'ci/reporter/core'

require 'base64'
require 'set'
require 'fileutils'
require 'digest/sha1'
require 'json'
require 'tempfile'
require 'tmpdir'

require_relative "crawl/version"
require_relative "crawl/engine"
require_relative "crawl/string"
require_relative "crawl/failure"
metadata
ADDED
@@ -0,0 +1,90 @@
--- !ruby/object:Gem::Specification
name: crawl
version: !ruby/object:Gem::Version
  version: 0.0.1
prerelease:
platform: ruby
authors:
- Tor Erik Linnerud
autorequire:
bindir: bin
cert_chain: []
date: 2011-11-04 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: &70363418401240 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: *70363418401240
- !ruby/object:Gem::Dependency
  name: rest-client
  requirement: &70363418400700 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: *70363418400700
- !ruby/object:Gem::Dependency
  name: ci_reporter
  requirement: &70363418400280 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: *70363418400280
description: Crawl all pages on a domain, checking for errors
email:
- tor@alphasights.com
executables:
- crawl
extensions: []
extra_rdoc_files: []
files:
- .gitignore
- Gemfile
- Rakefile
- bin/crawl
- crawl.gemspec
- lib/crawl.rb
- lib/crawl/engine.rb
- lib/crawl/failure.rb
- lib/crawl/string.rb
- lib/crawl/version.rb
homepage: http://github.com/alphasights/crawl
licenses: []
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 1.8.11
signing_key:
specification_version: 3
summary: Exhaustively search pages within a domain, reporting any page that returns
  a bad response code
test_files: []