crawl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +3 -0
- data/Rakefile +2 -0
- data/bin/crawl +37 -0
- data/crawl.gemspec +20 -0
- data/lib/crawl/engine.rb +167 -0
- data/lib/crawl/failure.rb +30 -0
- data/lib/crawl/string.rb +8 -0
- data/lib/crawl/version.rb +4 -0
- data/lib/crawl.rb +17 -0
- metadata +90 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
data/bin/crawl
ADDED
@@ -0,0 +1,37 @@
+#!/usr/bin/env ruby
+require 'optparse'
+require_relative '../lib/crawl.rb'
+
+options = {}
+optparse = OptionParser.new do |opts|
+  opts.banner = "Exhaustive search pages within a domain, reporting any page that returns a bad response code\nUsage: crawl [options] domain"
+  opts.on('-s', '--start /home,/about', Array, 'Starting path(s), defaults to /') { |o| options[:start] = o }
+  opts.on('-u', '--username username', String, 'Basic auth username') { |o| options[:username] = o }
+  opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
+  opts.on('-c', '--ci', 'Output files for CI integration') { |o| options[:ci] = o }
+  opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
+  opts.on_tail("-h", "--help", "Show this message") { |o| puts opts; exit }
+end.parse!
+
+options.merge!(domain: optparse.first)
+
+unless options[:domain]
+  puts 'Must provide a domain'
+  exit -1
+end
+
+crawler = Crawl::Engine.new(options)
+
+trap("SIGINT") do
+  puts "\n\nAborting crawl.."
+  crawler.summarize
+  exit -1
+end
+
+crawler.run
+crawler.summarize
+
+unless crawler.errors.empty?
+  puts 'Errors during crawling'
+  exit -1
+end
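Judging by the OptionParser block above, the executable expects the option flags followed by the target domain as the remaining argument, so an invocation might look like `crawl --start /,/about --username admin --password secret --verbose http://example.com` (the domain, paths, and credentials here are illustrative, not values taken from the package).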
data/crawl.gemspec
ADDED
@@ -0,0 +1,20 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path('../lib/crawl/version', __FILE__)
+
+Gem::Specification.new do |gem|
+  gem.authors       = ["Tor Erik Linnerud"]
+  gem.email         = ["tor@alphasights.com"]
+  gem.description   = "Crawl all pages on a domain, checking for errors"
+  gem.summary       = "Exhaustive search pages within a domain, reporting any page that returns a bad response code"
+  gem.homepage      = "http://github.com/alphasights/crawl"
+
+  gem.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  gem.files         = `git ls-files`.split("\n")
+  gem.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  gem.name          = "crawl"
+  gem.require_paths = ["lib"]
+  gem.version       = Crawl::VERSION
+  gem.add_dependency('nokogiri')
+  gem.add_dependency('rest-client')
+  gem.add_dependency('ci_reporter')
+end
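Note that the gemspec builds its file lists by shelling out to `git ls-files`, so the gem is presumably packaged from a git checkout of the repository, e.g. with the standard `gem build crawl.gemspec` followed by `gem install crawl-0.0.1.gem`; the actual release workflow is not shown in this diff.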
data/lib/crawl/engine.rb
ADDED
@@ -0,0 +1,167 @@
+# encoding: utf-8
+class Crawl::Engine
+  DEFAULT_OPTIONS = {:domain => '',
+                     :start => ['/'],
+                     :username => '',
+                     :password => '',
+                     :verbose => false,
+                     :session_id => false}
+
+
+  IGNORE = [/#/, /mailto:/, /skype:/, /logout/, /javascript:/, %r(/xhr/), /https:/, /\.pdf$/, /^$/]
+  VALID_RESPONSE_CODES = [200, 302]
+  MAX_REDIRECTS = 3
+  LINE_WIDTH = 78
+
+  Result = Struct.new(:url, :object)
+
+  attr_reader :options, :errors
+
+
+  def initialize(caller_options = {})
+    @options = DEFAULT_OPTIONS.merge(caller_options)
+    @authorization = Base64.encode64("#{options[:username]}:#{options[:password]}")
+
+    @found_links = options[:start].to_set
+    @link_sources = {}
+    @found_links.each {|target| @link_sources[target] = 'Initial'}
+    @visited_links = Set[]
+    @visited_documents = Set[]
+    @invalid_links = Set[]
+    @broken_pages = []
+    @errors = []
+    @verbose = options[:verbose] || ENV['VERBOSE']
+    @number_of_dots = 0
+    @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
+  end
+
+  def run
+    until (links = @found_links - (@visited_links + @invalid_links)).empty? do
+      links.each do |link|
+        puts "\nChecking #{link}" if @verbose
+        next unless response = retrieve(link)
+        next unless response.headers[:content_type] =~ %r{text/html}
+        @visited_documents << link
+        @found_links += links = find_links(link, response.to_str)
+        # validate(link, response.body_str)
+      end
+    end
+  end
+
+
+
+  def summarize
+    if @errors.size > 0
+
+      @errors.each do |error|
+        puts "\n#{error.url}"
+        puts " Linked from #{linked_from(error.url)}"
+        puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
+      end
+
+      print(<<-SUM)
+
+Pages crawled: #{@visited_documents.size}
+Pages with errors: #{@errors.size - @invalid_links.size}
+Broken pages: #{@broken_pages.size}
+Invalid links: #{@invalid_links.size}
+
+I=Invalid P=Parse Error S=Status code bad
+
+      SUM
+      exit(@errors.size)
+    else
+      puts "\n\n#{@visited_documents.size} pages crawled"
+    end
+
+    puts
+  end
+
+  private
+
+  def validate(link, body)
+    puts " Validating..." if @verbose
+
+    json_response = RestClient.post 'http://validator.nu?out=json', body, :content_type => 'text/html; charset=utf-8'
+    messages = JSON.parse(json_response.body)['messages']
+    error_messages = messages.select { |message| message['type'] != 'info' }
+
+    if error_messages.empty?
+      handle_success
+      true
+    else
+      response = error_messages.map do |message|
+        type, message = message['type'], message['message']
+        type_color = type == 'error' ? 31 : 33
+        "\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
+      end.join("\n\n")
+
+      @errors << Result.new(link, response)
+      handle_error('I')
+      false
+    end
+  rescue RestClient::ServiceUnavailable
+    handle_error('U')
+    false
+  end
+
+  def retrieve(link)
+    test_suite = CI::Reporter::TestSuite.new(link)
+    test_case = CI::Reporter::TestCase.new(link)
+    test_suite.start
+    test_case.start
+    puts " Fetching.." if @verbose
+
+    headers = {}
+    #headers.merge!(Authorization: "Basic #{@authorization}") if options[:username]
+    headers.merge(user: options[:username], password: options[:password])
+    response = RestClient.get(options[:domain] + link, headers)
+    test_suite.name = link
+    test_case.name = link
+    test_case.finish
+    @visited_links << link
+    unless VALID_RESPONSE_CODES.include?(response.code)
+      @errors << Result.new(link, "Status code was #{response.code}")
+      @broken_pages << link
+      test_case.failures << Crawl::Failure.new(link, response.code, linked_from(link))
+      test_suite.testcases << test_case
+      test_suite.finish
+      @report_manager.write_report(test_suite) if options[:ci]
+      return nil
+    end
+    test_suite.testcases << test_case
+    test_suite.finish
+    @report_manager.write_report(test_suite) if options[:ci]
+    return response
+  rescue RestClient::InternalServerError => e
+    @errors << Result.new(link, "Error whilst retrieving page: #{e.message}")
+    @invalid_links << link
+    return nil
+  end
+
+  def linked_from(target)
+    @link_sources[target] # => source
+  end
+
+  def find_links(source_link, body)
+    puts " Finding links.." if @verbose
+    doc = Nokogiri::HTML(body)
+    anchors = doc.css('a').to_a
+    anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
+    anchors.reject!{|anchor| anchor['data-method'] =~ /put|post|delete/ }
+    anchors.reject!{|anchor| anchor['class'].to_s =~ /unobtrusive_/}
+    raw_links = anchors.map{|anchor| anchor['href']}
+    raw_links.compact!
+    raw_links.map!{|link| link.sub(options[:domain], '')}
+    raw_links.delete_if{|link| link =~ %r{^http://}}
+    raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
+    raw_links.each do |target_link|
+      unless @found_links.include?(target_link)
+        puts " Adding #{target_link} found on #{source_link}" if @verbose
+        @link_sources[target_link] = source_link
+      end
+    end
+
+    raw_links
+  end
+end
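For readers who want to drive the crawler from Ruby rather than through bin/crawl, the public surface shown above is just the options hash passed to the constructor, #run, #summarize, and the #errors reader. A minimal sketch, assuming the gem is installed; the domain and paths below are illustrative values, not part of the package:

    require 'crawl'

    engine = Crawl::Engine.new(
      domain:  'http://example.com',  # pages are fetched as domain + path
      start:   ['/', '/contact'],     # starting paths; DEFAULT_OPTIONS uses ['/']
      verbose: true                   # print each link as it is checked
    )

    engine.run                      # crawl until no unchecked links remain
    engine.errors.each do |result|  # Result structs with #url and #object
      warn "#{result.url}: #{result.object}"
    end
    engine.summarize                # prints details; exits non-zero when there are errors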
data/lib/crawl/failure.rb
ADDED
@@ -0,0 +1,30 @@
+# encoding: utf-8
+class Crawl::Failure
+  attr_reader :link, :code, :from
+
+  def initialize(link, code, from)
+    @link = link
+    @code = code
+    @from = from
+  end
+
+  def failure?
+    true
+  end
+
+  def error?
+    !failure?
+  end
+
+  def name
+    link
+  end
+
+  def message
+    "Status code was #{code}"
+  end
+
+  def location
+    "Linked from #{from}"
+  end
+end
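Crawl::Failure appears to exist so that broken pages can be reported through ci_reporter: engine.rb pushes instances onto test_case.failures, and the #failure?, #error?, #name, #message, and #location methods match the duck type ci_reporter's report writer reads when serializing a test case failure into its XML reports. (That reading is inferred from the code above, not stated anywhere in the package.)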
data/lib/crawl/string.rb
ADDED
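The body of data/lib/crawl/string.rb is not included in this diff (the summary above lists it at +8 -0). Since Engine#summarize calls String#word_wrap, the file presumably monkey-patches String with such a method. A purely hypothetical sketch of what it could look like; none of this is taken from the package:

    class String
      # Wrap long lines at roughly `width` characters (hypothetical helper).
      def word_wrap(width = 78)
        gsub(/(.{1,#{width}})(\s+|$)/, "\\1\n").strip
      end
    end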
data/lib/crawl.rb
ADDED
@@ -0,0 +1,17 @@
+# encoding: utf-8
+puts require('nokogiri')
+puts require('rest_client')
+require 'ci/reporter/core'
+
+require 'base64'
+require 'set'
+require 'fileutils'
+require 'digest/sha1'
+require 'json'
+require 'tempfile'
+require 'tmpdir'
+
+require_relative "crawl/version"
+require_relative "crawl/engine"
+require_relative "crawl/string"
+require_relative "crawl/failure"
metadata
ADDED
@@ -0,0 +1,90 @@
+--- !ruby/object:Gem::Specification
+name: crawl
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+  prerelease:
+platform: ruby
+authors:
+- Tor Erik Linnerud
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-11-04 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: &70363418401240 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *70363418401240
+- !ruby/object:Gem::Dependency
+  name: rest-client
+  requirement: &70363418400700 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *70363418400700
+- !ruby/object:Gem::Dependency
+  name: ci_reporter
+  requirement: &70363418400280 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *70363418400280
+description: Crawl all pages on a domain, checking for errors
+email:
+- tor@alphasights.com
+executables:
+- crawl
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- Rakefile
+- bin/crawl
+- crawl.gemspec
+- lib/crawl.rb
+- lib/crawl/engine.rb
+- lib/crawl/failure.rb
+- lib/crawl/string.rb
+- lib/crawl/version.rb
+homepage: http://github.com/alphasights/crawl
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.11
+signing_key:
+specification_version: 3
+summary: Exhaustive search pages within a domain, reporting any page that returns a
+  bad response code
+test_files: []