crawl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,3 @@
+ source 'http://rubygems.org'
+
+ gemspec
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ #!/usr/bin/env rake
+ require "bundler/gem_tasks"
data/bin/crawl ADDED
@@ -0,0 +1,37 @@
+ #!/usr/bin/env ruby
+ require 'optparse'
+ require_relative '../lib/crawl.rb'
+
+ options = {}
+ optparse = OptionParser.new do |opts|
+   opts.banner = "Exhaustively search pages within a domain, reporting any page that returns a bad response code\nUsage: crawl [options] domain"
+   opts.on('-s', '--start /home,/about', Array, 'Starting path(s), defaults to /') { |o| options[:start] = o }
+   opts.on('-u', '--username username', String, 'Basic auth username') { |o| options[:username] = o }
+   opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
+   opts.on('-c', '--ci', 'Output files for CI integration') { |o| options[:ci] = o }
+   opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
+   opts.on_tail("-h", "--help", "Show this message") { |o| puts opts; exit }
+ end.parse!
+
+ options.merge!(domain: optparse.first)
+
+ unless options[:domain]
+   puts 'Must provide a domain'
+   exit -1
+ end
+
+ crawler = Crawl::Engine.new(options)
+
+ trap("SIGINT") do
+   puts "\n\nAborting crawl.."
+   crawler.summarize
+   exit -1
+ end
+
+ crawler.run
+ crawler.summarize
+
+ unless crawler.errors.empty?
+   puts 'Errors during crawling'
+   exit -1
+ end
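For illustration, a hypothetical invocation of the executable above (host, paths, and credentials are placeholders):

  crawl -s /,/about -u admin -p secret -v http://staging.example.com

Because the engine builds request URLs by concatenating the domain argument with each path, the domain should include the scheme and carry no trailing slash. The process exits non-zero when any crawled page fails, which is what makes the --ci report mode usable as a build step.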
data/crawl.gemspec ADDED
@@ -0,0 +1,20 @@
+ # -*- encoding: utf-8 -*-
+ require File.expand_path('../lib/crawl/version', __FILE__)
+
+ Gem::Specification.new do |gem|
+   gem.authors       = ["Tor Erik Linnerud"]
+   gem.email         = ["tor@alphasights.com"]
+   gem.description   = "Crawl all pages on a domain, checking for errors"
+   gem.summary       = "Exhaustively search pages within a domain, reporting any page that returns a bad response code"
+   gem.homepage      = "http://github.com/alphasights/crawl"
+
+   gem.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+   gem.files         = `git ls-files`.split("\n")
+   gem.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+   gem.name          = "crawl"
+   gem.require_paths = ["lib"]
+   gem.version       = Crawl::VERSION
+   gem.add_dependency('nokogiri')
+   gem.add_dependency('rest-client')
+   gem.add_dependency('ci_reporter')
+ end
data/lib/crawl/engine.rb ADDED
@@ -0,0 +1,167 @@
+ # encoding: utf-8
+ class Crawl::Engine
+   DEFAULT_OPTIONS = {:domain => '',
+                      :start => ['/'],
+                      :username => '',
+                      :password => '',
+                      :verbose => false,
+                      :session_id => false}
+
+
+   IGNORE = [/#/, /mailto:/, /skype:/, /logout/, /javascript:/, %r(/xhr/), /https:/, /\.pdf$/, /^$/]
+   VALID_RESPONSE_CODES = [200, 302]
+   MAX_REDIRECTS = 3
+   LINE_WIDTH = 78
+
+   Result = Struct.new(:url, :object)
+
+   attr_reader :options, :errors
+
+
+   def initialize(caller_options = {})
+     @options = DEFAULT_OPTIONS.merge(caller_options)
+     @authorization = Base64.encode64("#{options[:username]}:#{options[:password]}")
+
+     @found_links = options[:start].to_set
+     @link_sources = {}
+     @found_links.each {|target| @link_sources[target] = 'Initial'}
+     @visited_links = Set[]
+     @visited_documents = Set[]
+     @invalid_links = Set[]
+     @broken_pages = []
+     @errors = []
+     @verbose = options[:verbose] || ENV['VERBOSE']
+     @number_of_dots = 0
+     @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
+   end
+
+   def run
+     until (links = @found_links - (@visited_links + @invalid_links)).empty? do
+       links.each do |link|
+         puts "\nChecking #{link}" if @verbose
+         next unless response = retrieve(link)
+         next unless response.headers[:content_type] =~ %r{text/html}
+         @visited_documents << link
+         @found_links += find_links(link, response.to_str)
+         # validate(link, response.body_str)
+       end
+     end
+   end
+
+
+
+   def summarize
+     if @errors.size > 0
+
+       @errors.each do |error|
+         puts "\n#{error.url}"
+         puts " Linked from #{linked_from(error.url)}"
+         puts error.object.to_s.word_wrap.split("\n").map{|line| ' ' + line}
+       end
+
+       print(<<-SUM)
+
+ Pages crawled: #{@visited_documents.size}
+ Pages with errors: #{@errors.size - @invalid_links.size}
+ Broken pages: #{@broken_pages.size}
+ Invalid links: #{@invalid_links.size}
+
+ I=Invalid P=Parse Error S=Status code bad
+
+       SUM
+       exit(@errors.size)
+     else
+       puts "\n\n#{@visited_documents.size} pages crawled"
+     end
+
+     puts
+   end
+
+   private
+
+   def validate(link, body)
+     puts " Validating..." if @verbose
+
+     json_response = RestClient.post 'http://validator.nu?out=json', body, :content_type => 'text/html; charset=utf-8'
+     messages = JSON.parse(json_response.body)['messages']
+     error_messages = messages.select { |message| message['type'] != 'info' }
+
+     if error_messages.empty?
+       handle_success
+       true
+     else
+       response = error_messages.map do |message|
+         type, message = message['type'], message['message']
+         type_color = type == 'error' ? 31 : 33
+         "\e[#{type_color};1m" + type.capitalize + "\e[0m: " + message
+       end.join("\n\n")
+
+       @errors << Result.new(link, response)
+       handle_error('I')
+       false
+     end
+   rescue RestClient::ServiceUnavailable
+     handle_error('U')
+     false
+   end
+
+   def retrieve(link)
+     test_suite = CI::Reporter::TestSuite.new(link)
+     test_case = CI::Reporter::TestCase.new(link)
+     test_suite.start
+     test_case.start
+     puts " Fetching.." if @verbose
+
+     headers = {}
+     #headers.merge!(Authorization: "Basic #{@authorization}") if options[:username]
+     headers.merge!(user: options[:username], password: options[:password])
+     response = RestClient.get(options[:domain] + link, headers)
+     test_suite.name = link
+     test_case.name = link
+     test_case.finish
+     @visited_links << link
+     unless VALID_RESPONSE_CODES.include?(response.code)
+       @errors << Result.new(link, "Status code was #{response.code}")
+       @broken_pages << link
+       test_case.failures << Crawl::Failure.new(link, response.code, linked_from(link))
+       test_suite.testcases << test_case
+       test_suite.finish
+       @report_manager.write_report(test_suite) if options[:ci]
+       return nil
+     end
+     test_suite.testcases << test_case
+     test_suite.finish
+     @report_manager.write_report(test_suite) if options[:ci]
+     return response
+   rescue RestClient::InternalServerError => e
+     @errors << Result.new(link, "Error whilst retrieving page: #{e.message}")
+     @invalid_links << link
+     return nil
+   end
+
+   def linked_from(target)
+     @link_sources[target] # => source
+   end
+
+   def find_links(source_link, body)
+     puts " Finding links.." if @verbose
+     doc = Nokogiri::HTML(body)
+     anchors = doc.css('a').to_a
+     anchors.reject!{|anchor| anchor['onclick'].to_s =~ /f.method = 'POST'/}
+     anchors.reject!{|anchor| anchor['data-method'] =~ /put|post|delete/ }
+     anchors.reject!{|anchor| anchor['class'].to_s =~ /unobtrusive_/}
+     raw_links = anchors.map{|anchor| anchor['href']}
+     raw_links.compact!
+     raw_links.map!{|link| link.sub(options[:domain], '')}
+     raw_links.delete_if{|link| link =~ %r{^http://}}
+     raw_links.delete_if{|link| IGNORE.any?{|pattern| link =~ pattern}}
+     raw_links.each do |target_link|
+       unless @found_links.include?(target_link)
+         puts " Adding #{target_link} found on #{source_link}" if @verbose
+         @link_sources[target_link] = source_link
+       end
+     end
+
+     raw_links
+   end
+ end
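The same engine can be driven from Ruby rather than through bin/crawl; a minimal sketch, assuming the gem is installed and using a placeholder host and paths:

  require 'crawl'

  # Placeholder domain and paths; Engine builds URLs as options[:domain] + link,
  # so include the scheme and omit the trailing slash.
  engine = Crawl::Engine.new(domain: 'http://staging.example.com',
                             start: ['/', '/about'],
                             verbose: true)
  engine.run
  engine.errors.each { |result| puts "#{result.url}: #{result.object}" }
  engine.summarize  # prints the report and exits non-zero when errors were recorded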
data/lib/crawl/failure.rb ADDED
@@ -0,0 +1,30 @@
+ # encoding: utf-8
+ class Crawl::Failure
+   attr_reader :link, :code, :from
+
+   def initialize(link, code, from)
+     @link = link
+     @code = code
+     @from = from
+   end
+
+   def failure?
+     true
+   end
+
+   def error?
+     !failure?
+   end
+
+   def name
+     link
+   end
+
+   def message
+     "Status code was #{code}"
+   end
+
+   def location
+     "Linked from #{from}"
+   end
+ end
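Crawl::Failure is the value object that Engine#retrieve appends to a ci_reporter test case, and its readers (failure?, error?, name, message, location) appear to be the interface ci_reporter reads when writing its XML report. A quick illustration with made-up values:

  failure = Crawl::Failure.new('/pricing', 500, '/index')
  failure.message   # => "Status code was 500"
  failure.location  # => "Linked from /index"
  failure.failure?  # => true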
data/lib/crawl/string.rb ADDED
@@ -0,0 +1,8 @@
+ # encoding: utf-8
+ class String
+   def word_wrap(line_width = 80)
+     self.split("\n").collect do |line|
+       line.length > line_width ? line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip : line
+     end * "\n"
+   end
+ end
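This String#word_wrap monkey patch is used by Engine#summarize to wrap error output before indenting it; a small usage sketch with an arbitrary width:

  text = "lorem ipsum dolor sit amet " * 5
  puts text.word_wrap(40)         # re-breaks the line at whitespace, roughly 40 characters per line
  puts "already short".word_wrap  # lines under the limit pass through unchanged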
data/lib/crawl/version.rb ADDED
@@ -0,0 +1,4 @@
+ # encoding: utf-8
+ module Crawl
+   VERSION = "0.0.1"
+ end
data/lib/crawl.rb ADDED
@@ -0,0 +1,17 @@
+ # encoding: utf-8
+ require 'nokogiri'
+ require 'rest_client'
+ require 'ci/reporter/core'
+
+ require 'base64'
+ require 'set'
+ require 'fileutils'
+ require 'digest/sha1'
+ require 'json'
+ require 'tempfile'
+ require 'tmpdir'
+
+ require_relative "crawl/version"
+ require_relative "crawl/engine"
+ require_relative "crawl/string"
+ require_relative "crawl/failure"
metadata ADDED
@@ -0,0 +1,90 @@
+ --- !ruby/object:Gem::Specification
+ name: crawl
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+   prerelease:
+ platform: ruby
+ authors:
+ - Tor Erik Linnerud
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2011-11-04 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: &70363418401240 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: *70363418401240
+ - !ruby/object:Gem::Dependency
+   name: rest-client
+   requirement: &70363418400700 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: *70363418400700
+ - !ruby/object:Gem::Dependency
+   name: ci_reporter
+   requirement: &70363418400280 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: *70363418400280
+ description: Crawl all pages on a domain, checking for errors
+ email:
+ - tor@alphasights.com
+ executables:
+ - crawl
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - Rakefile
+ - bin/crawl
+ - crawl.gemspec
+ - lib/crawl.rb
+ - lib/crawl/engine.rb
+ - lib/crawl/failure.rb
+ - lib/crawl/string.rb
+ - lib/crawl/version.rb
+ homepage: http://github.com/alphasights/crawl
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.11
+ signing_key:
+ specification_version: 3
+ summary: Exhaustively search pages within a domain, reporting any page that returns
+   a bad response code
+ test_files: []