crawl 0.0.1

data/.gitignore ADDED
@@ -0,0 +1,17 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,3 @@
+ source 'http://rubygems.org'
+
+ gemspec
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ #!/usr/bin/env rake
+ require "bundler/gem_tasks"
data/bin/crawl ADDED
@@ -0,0 +1,37 @@
+ #!/usr/bin/env ruby
+ require 'optparse'
+ require_relative '../lib/crawl.rb'
+
+ options = {}
+ arguments = OptionParser.new do |opts|
+   opts.banner = "Exhaustively search pages within a domain, reporting any page that returns a bad response code\nUsage: crawl [options] domain"
+   opts.on('-s', '--start /home,/about', Array, 'Starting path(s), defaults to /') { |o| options[:start] = o }
+   opts.on('-u', '--username username', String, 'Basic auth username') { |o| options[:username] = o }
+   opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
+   opts.on('-c', '--ci', 'Output files for CI integration') { |o| options[:ci] = o }
+   opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
+   opts.on_tail("-h", "--help", "Show this message") { |o| puts opts; exit }
+ end.parse!
+
+ options.merge!(domain: arguments.first)
+
+ unless options[:domain]
+   puts 'Must provide a domain'
+   exit -1
+ end
+
+ crawler = Crawl::Engine.new(options)
+
+ trap("SIGINT") do
+   puts "\n\nAborting crawl.."
+   crawler.summarize
+   exit -1
+ end
+
+ crawler.run
+ crawler.summarize
+
+ unless crawler.errors.empty?
+   puts 'Errors during crawling'
+   exit -1
+ end
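The executable exits non-zero whenever any page fails, which makes it usable as a CI gate. A hypothetical invocation (host, paths, and credentials are made up):

  crawl --start /,/about --username admin --password secret --verbose http://staging.example.com

The positional argument becomes options[:domain] and is prepended to every crawled path, so it should include the scheme.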
data/crawl.gemspec ADDED
@@ -0,0 +1,20 @@
+ # -*- encoding: utf-8 -*-
+ require File.expand_path('../lib/crawl/version', __FILE__)
+
+ Gem::Specification.new do |gem|
+   gem.authors = ["Tor Erik Linnerud"]
+   gem.email = ["tor@alphasights.com"]
+   gem.description = "Crawl all pages on a domain, checking for errors"
+   gem.summary = "Exhaustively search pages within a domain, reporting any page that returns a bad response code"
+   gem.homepage = "http://github.com/alphasights/crawl"
+
+   gem.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+   gem.files = `git ls-files`.split("\n")
+   gem.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+   gem.name = "crawl"
+   gem.require_paths = ["lib"]
+   gem.version = Crawl::VERSION
+   gem.add_dependency('nokogiri')
+   gem.add_dependency('rest-client')
+   gem.add_dependency('ci_reporter')
+ end
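Note that executables, files, and test_files are computed by shelling out to git ls-files, so the gem must be packaged from inside a git checkout; built from an exported tarball those lists would come back empty. Building follows the usual flow:

  gem build crawl.gemspec     # produces crawl-0.0.1.gem
  gem install crawl-0.0.1.gem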
data/lib/crawl/engine.rb ADDED
@@ -0,0 +1,167 @@
+ # encoding: utf-8
+ class Crawl::Engine
+   DEFAULT_OPTIONS = {:domain => '',
+                      :start => ['/'],
+                      :username => '',
+                      :password => '',
+                      :verbose => false,
+                      :session_id => false}
+
+
+   IGNORE = [/#/, /mailto:/, /skype:/, /logout/, /javascript:/, %r(/xhr/), /https:/, /\.pdf$/, /^$/]
+   VALID_RESPONSE_CODES = [200, 302]
+   MAX_REDIRECTS = 3
+   LINE_WIDTH = 78
+
+   Result = Struct.new(:url, :object)
+
+   attr_reader :options, :errors
+
+
+   def initialize(caller_options = {})
+     @options = DEFAULT_OPTIONS.merge(caller_options)
+     @authorization = Base64.encode64("#{options[:username]}:#{options[:password]}")
+
+     @found_links = options[:start].to_set
+     @link_sources = {}
+     @found_links.each {|target| @link_sources[target] = 'Initial'}
+     @visited_links = Set[]
+     @visited_documents = Set[]
+     @invalid_links = Set[]
+     @broken_pages = []
+     @errors = []
+     @verbose = options[:verbose] || ENV['VERBOSE']
+     @number_of_dots = 0
+     @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
+   end
+
+   def run
+     until (links = @found_links - (@visited_links + @invalid_links)).empty? do
+       links.each do |link|
+         puts "\nChecking #{link}" if @verbose
+         next unless response = retrieve(link)
+         next unless response.headers[:content_type] =~ %r{text/html}
+         @visited_documents << link
+         @found_links += find_links(link, response.to_str)
+         # validate(link, response.body_str)
+       end
+     end
+   end
+
+
+
+   def summarize
+     if @errors.size > 0
+
+       @errors.each do |error|
+         puts "\n#{error.url}"
+         puts " Linked from #{linked_from(error.url)}"
+         puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
+       end
+
+       print(<<-SUM)
+
+ Pages crawled: #{@visited_documents.size}
+ Pages with errors: #{@errors.size - @invalid_links.size}
+ Broken pages: #{@broken_pages.size}
+ Invalid links: #{@invalid_links.size}
+
+ I=Invalid P=Parse Error S=Status code bad
+
+       SUM
+       exit(@errors.size)
+     else
+       puts "\n\n#{@visited_documents.size} pages crawled"
+     end
+
+     puts
+   end
+
+   private
+
+   def validate(link, body)
+     puts " Validating..." if @verbose
+
+     json_response = RestClient.post 'http://validator.nu?out=json', body, :content_type => 'text/html; charset=utf-8'
+     messages = JSON.parse(json_response.body)['messages']
+     error_messages = messages.select { |message| message['type'] != 'info' }
+
+     if error_messages.empty?
+       handle_success
+       true
+     else
+       response = error_messages.map do |message|
+         type, message = message['type'], message['message']
+         type_color = type == 'error' ? 31 : 33
+         "\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
+       end.join("\n\n")
+
+       @errors << Result.new(link, response)
+       handle_error('I')
+       false
+     end
+   rescue RestClient::ServiceUnavailable
+     handle_error('U')
+     false
+   end
+
+   def retrieve(link)
+     test_suite = CI::Reporter::TestSuite.new(link)
+     test_case = CI::Reporter::TestCase.new(link)
+     test_suite.start
+     test_case.start
+     puts " Fetching.." if @verbose
+
+     headers = {}
+     #headers.merge!(Authorization: "Basic #{@authorization}") if options[:username]
+     headers.merge!(user: options[:username], password: options[:password])
+     response = RestClient.get(options[:domain] + link, headers)
+     test_suite.name = link
+     test_case.name = link
+     test_case.finish
+     @visited_links << link
+     unless VALID_RESPONSE_CODES.include?(response.code)
+       @errors << Result.new(link, "Status code was #{response.code}")
+       @broken_pages << link
+       test_case.failures << Crawl::Failure.new(link, response.code, linked_from(link))
+       test_suite.testcases << test_case
+       test_suite.finish
+       @report_manager.write_report(test_suite) if options[:ci]
+       return nil
+     end
+     test_suite.testcases << test_case
+     test_suite.finish
+     @report_manager.write_report(test_suite) if options[:ci]
+     return response
+   rescue RestClient::InternalServerError => e
+     @errors << Result.new(link, "Error whilst retrieving page: #{e.message}")
+     @invalid_links << link
+     return nil
+   end
+
+   def linked_from(target)
+     @link_sources[target]
+   end
+
+   def find_links(source_link, body)
+     puts " Finding links.." if @verbose
+     doc = Nokogiri::HTML(body)
+     anchors = doc.css('a').to_a
+     anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
+     anchors.reject!{|anchor| anchor['data-method'] =~ /put|post|delete/ }
+     anchors.reject!{|anchor| anchor['class'].to_s =~ /unobtrusive_/}
+     raw_links = anchors.map{|anchor| anchor['href']}
+     raw_links.compact!
+     raw_links.map!{|link| link.sub(options[:domain], '')}
+     raw_links.delete_if{|link| link =~ %r{^http://}}
+     raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
+     raw_links.each do |target_link|
+       unless @found_links.include?(target_link)
+         puts " Adding #{target_link} found on #{source_link}" if @verbose
+         @link_sources[target_link] = source_link
+       end
+     end
+
+     raw_links
+   end
+ end
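Two things worth noting about the class above: validate is never reached in this version (its call in run is commented out), and it refers to handle_success/handle_error helpers that are not defined in the file, so only status-code checking is active. The engine can also be driven directly from Ruby rather than through bin/crawl; a minimal sketch, with a made-up domain, paths, and credentials:

  require 'crawl'

  # Any option omitted here falls back to Crawl::Engine::DEFAULT_OPTIONS.
  crawler = Crawl::Engine.new(
    :domain   => 'http://staging.example.com', # hypothetical host, scheme included
    :start    => ['/', '/about'],
    :username => 'admin',                      # hypothetical basic auth pair
    :password => 'secret',
    :verbose  => true
  )

  crawler.run        # loops until every discovered link is visited or invalid
  crawler.summarize  # prints per-error details, then totals
  abort 'Errors during crawling' unless crawler.errors.empty?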
data/lib/crawl/failure.rb ADDED
@@ -0,0 +1,30 @@
+ # encoding: utf-8
+ class Crawl::Failure
+   attr_reader :link, :code, :from
+
+   def initialize(link, code, from)
+     @link = link
+     @code = code
+     @from = from
+   end
+
+   def failure?
+     true
+   end
+
+   def error?
+     !failure?
+   end
+
+   def name
+     link
+   end
+
+   def message
+     "Status code was #{code}"
+   end
+
+   def location
+     "Linked from #{from}"
+   end
+ end
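Crawl::Failure is the object Engine#retrieve appends to test_case.failures; failure?, error?, name, message, and location mirror the interface CI::Reporter expects of entries in that array when it serializes a failed test case. A quick illustration with made-up values:

  failure = Crawl::Failure.new('/pricing', 500, '/index')
  failure.failure?  # => true
  failure.message   # => "Status code was 500"
  failure.location  # => "Linked from /index"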
data/lib/crawl/string.rb ADDED
@@ -0,0 +1,8 @@
+ # encoding: utf-8
+ class String
+   def word_wrap(line_width = 80)
+     self.split("\n").collect do |line|
+       line.length > line_width ? line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip : line
+     end * "\n"
+   end
+ end
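String#word_wrap is monkey-patched in so Engine#summarize can wrap long error messages before indenting them. For example:

  ('lorem ipsum ' * 12).word_wrap(40)
  # => the 144-character string comes back as four lines,
  #    each broken at a space and at most 40 characters long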
data/lib/crawl/version.rb ADDED
@@ -0,0 +1,4 @@
+ # encoding: utf-8
+ module Crawl
+   VERSION = "0.0.1"
+ end
data/lib/crawl.rb ADDED
@@ -0,0 +1,17 @@
+ # encoding: utf-8
+ require 'nokogiri'
+ require 'rest_client'
+ require 'ci/reporter/core'
+
+ require 'base64'
+ require 'set'
+ require 'fileutils'
+ require 'digest/sha1'
+ require 'json'
+ require 'tempfile'
+ require 'tmpdir'
+
+ require_relative "crawl/version"
+ require_relative "crawl/engine"
+ require_relative "crawl/string"
+ require_relative "crawl/failure"
metadata ADDED
@@ -0,0 +1,90 @@
+ --- !ruby/object:Gem::Specification
+ name: crawl
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ prerelease:
+ platform: ruby
+ authors:
+ - Tor Erik Linnerud
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2011-11-04 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: &70363418401240 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: *70363418401240
+ - !ruby/object:Gem::Dependency
+   name: rest-client
+   requirement: &70363418400700 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: *70363418400700
+ - !ruby/object:Gem::Dependency
+   name: ci_reporter
+   requirement: &70363418400280 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: *70363418400280
+ description: Crawl all pages on a domain, checking for errors
+ email:
+ - tor@alphasights.com
+ executables:
+ - crawl
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - Rakefile
+ - bin/crawl
+ - crawl.gemspec
+ - lib/crawl.rb
+ - lib/crawl/engine.rb
+ - lib/crawl/failure.rb
+ - lib/crawl/string.rb
+ - lib/crawl/version.rb
+ homepage: http://github.com/alphasights/crawl
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.11
+ signing_key:
+ specification_version: 3
+ summary: Exhaustively search pages within a domain, reporting any page that returns
+   a bad response code
+ test_files: []