crawl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
 - data/Gemfile +3 -0
 - data/Rakefile +2 -0
 - data/bin/crawl +37 -0
 - data/crawl.gemspec +20 -0
 - data/lib/crawl/engine.rb +167 -0
 - data/lib/crawl/failure.rb +30 -0
 - data/lib/crawl/string.rb +8 -0
 - data/lib/crawl/version.rb +4 -0
 - data/lib/crawl.rb +17 -0
 - metadata +90 -0
 
    
        data/.gitignore
    ADDED
    
    
    
        data/Gemfile
    ADDED
    
    
    
        data/Rakefile
    ADDED
    
    
    
        data/bin/crawl
    ADDED
    
    | 
         @@ -0,0 +1,37 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #!/usr/bin/env ruby
            # Command-line entry point for the crawl gem.
            #
            # Parses the command-line switches, builds a Crawl::Engine for the given
            # domain, runs the crawl and prints a summary. Exits with a non-zero status
            # when no domain is given, when the crawl is interrupted, or when errors
            # were recorded.
            require 'optparse'
            require_relative '../lib/crawl.rb'

            options = {}

            # OptionParser#parse! removes the recognised switches from ARGV and returns
            # what is left, so this local holds the positional arguments (the domain),
            # not the parser itself.
            remaining_args = OptionParser.new do |opts|
              opts.banner = "Exhaustive search pages within a domain, reporting any page that returns a bad response code\nUsage: crawl [options] domain"
              opts.on('-s', '--start /home,/about', Array, 'Starting path(s), defaults to /') { |o| options[:start] = o }
              opts.on('-u', '--username username', String, 'Basic auth username') { |o| options[:username] = o }
              opts.on('-p', '--password password', String, 'Basic auth password') { |o| options[:password] = o }
              opts.on('-c', '--ci', 'Output files for CI integration') { |o| options[:ci] = o }
              opts.on('-v', '--verbose', 'Give details when crawling') { |o| options[:verbose] = o }
              opts.on_tail("-h", "--help", "Show this message") { puts opts; exit }
            end.parse!

            options[:domain] = remaining_args.first

            unless options[:domain]
              puts 'Must provide a domain'
              exit(-1)
            end

            crawler = Crawl::Engine.new(options)

            # On Ctrl-C, report whatever was collected so far before giving up.
            trap("SIGINT") do
              puts "\n\nAborting crawl.."
              crawler.summarize
              exit(-1)
            end

            crawler.run
            crawler.summarize

            # Fallback: make sure a crawl that recorded errors never exits with status 0.
            unless crawler.errors.empty?
              puts 'Errors during crawling'
              exit(-1)
            end
         
     | 
    
        data/crawl.gemspec
    ADDED
    
    | 
         @@ -0,0 +1,20 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # -*- encoding: utf-8 -*-
            require File.expand_path('../lib/crawl/version', __FILE__)

            # Gem specification for crawl.
            #
            # The file, executable and test-file lists are produced by `git ls-files`,
            # so this specification has to be evaluated inside a git checkout.
            Gem::Specification.new do |gem|
              gem.name          = "crawl"
              gem.version       = Crawl::VERSION
              gem.authors       = ["Tor Erik Linnerud"]
              gem.email         = ["tor@alphasights.com"]
              gem.description   = "Crawl all pages on a domain, checking for errors"
              gem.summary       = "Exhaustive search pages within a domain, reporting any page that returns a bad response code"
              gem.homepage      = "http://github.com/alphasights/crawl"

              gem.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
              gem.files         = `git ls-files`.split("\n")
              gem.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
              gem.require_paths = ["lib"]

              # Runtime dependencies (no version constraints are declared).
              gem.add_dependency('nokogiri')
              gem.add_dependency('rest-client')
              gem.add_dependency('ci_reporter')
            end
         
     | 
    
        data/lib/crawl/engine.rb
    ADDED
    
    | 
         @@ -0,0 +1,167 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            # Crawls the pages of one domain, following the anchors found on each HTML
            # page and recording every link that cannot be fetched or that answers
            # with an unexpected status code.
            class Crawl::Engine
              # Defaults; the options passed to #initialize are merged over these.
              DEFAULT_OPTIONS = {:domain => '',
                                 :start => ['/'],
                                 :username => '',
                                 :password => '',
                                 :verbose => false,
                                 :session_id => false}

              # A link matching any of these patterns is never followed.
              IGNORE = [/#/, /mailto:/, /skype:/, /logout/, /javascript:/, %r(/xhr/), /https:/, /\.pdf$/, /^$/]
              # Status codes that count as a healthy page.
              VALID_RESPONSE_CODES = [200, 302]
              # MAX_REDIRECTS and LINE_WIDTH are not referenced inside this class body.
              MAX_REDIRECTS = 3
              LINE_WIDTH = 78
              # Largest value a process exit status can carry; larger values wrap
              # around (256 would be reported as 0, i.e. success).
              MAX_EXIT_STATUS = 255

              # One recorded problem: the link it concerns and a description of it.
              Result = Struct.new(:url, :object)

              attr_reader :options, :errors

              # @param caller_options [Hash] overrides for DEFAULT_OPTIONS
              #   (:domain, :start, :username, :password, :verbose, :ci, ...)
              def initialize(caller_options = {})
                @options = DEFAULT_OPTIONS.merge(caller_options)
                # strict_encode64 emits no newlines; encode64 appends one, which is
                # not allowed inside an HTTP header value.
                @authorization = Base64.strict_encode64("#{options[:username]}:#{options[:password]}")

                @found_links = options[:start].to_set
                # Maps each discovered link to the page it was first seen on.
                @link_sources = {}
                @found_links.each { |target| @link_sources[target] = 'Initial' }
                @visited_links = Set[]
                @visited_documents = Set[]
                @invalid_links = Set[]
                @broken_pages = []
                @errors = []
                @verbose = options[:verbose] || ENV['VERBOSE']
                @number_of_dots = 0
                @report_manager = CI::Reporter::ReportManager.new("crawler") if options[:ci]
              end

              # Fetches every known link that has not been visited or marked invalid,
              # collecting new links from each HTML response, until none are left.
              def run
                until (pending = @found_links - (@visited_links + @invalid_links)).empty?
                  pending.each do |link|
                    puts "\nChecking #{link}" if @verbose
                    response = retrieve(link)
                    next unless response
                    next unless response.headers[:content_type] =~ %r{text/html}
                    @visited_documents << link
                    @found_links += find_links(link, response.to_str)
                    # HTML validation through #validate is currently switched off.
                  end
                end
              end

              # Prints the outcome of the crawl. When errors were recorded, each one
              # is listed, totals are printed, and the process exits with the number
              # of errors (capped at MAX_EXIT_STATUS) as its status.
              def summarize
                if @errors.empty?
                  puts "\n\n#{@visited_documents.size} pages crawled"
                  puts
                  return
                end

                @errors.each do |error|
                  puts "\n#{error.url}"
                  puts "  Linked from #{linked_from(error.url)}"
                  puts error.object.to_s.word_wrap.split("\n").map { |line| '  ' + line }
                end

                puts '',
                     "Pages crawled: #{@visited_documents.size}",
                     "Pages with errors: #{@errors.size - @invalid_links.size}",
                     "Broken pages: #{@broken_pages.size}",
                     "Invalid links: #{@invalid_links.size}",
                     '',
                     'I=Invalid P=Parse Error S=Status code bad',
                     ''

                # Cap the status so that a multiple of 256 errors cannot wrap to 0.
                exit([@errors.size, MAX_EXIT_STATUS].min)
              end

            private

              # Posts a page body to validator.nu and records every non-info message
              # as an error for +link+.
              #
              # NOTE(review): handle_success and handle_error are not defined in this
              # class body, and the only call to this method (in #run) is commented
              # out - confirm where they come from before re-enabling validation.
              #
              # @return [Boolean] true when the validator reported no problems
              def validate(link, body)
                puts "  Validating..." if @verbose

                json_response = RestClient.post 'http://validator.nu?out=json', body, :content_type => 'text/html; charset=utf-8'
                messages = JSON.parse(json_response.body)['messages']
                error_messages = messages.select { |message| message['type'] != 'info' }

                if error_messages.empty?
                  handle_success
                  true
                else
                  report = error_messages.map do |entry|
                    type = entry['type']
                    # ANSI colour code 31 for errors, 33 for everything else.
                    type_color = type == 'error' ? 31 : 33
                    "\e[#{type_color};1m" + type.capitalize + "\e[0m: " + entry['message']
                  end.join("\n\n")

                  @errors << Result.new(link, report)
                  handle_error('I')
                  false
                end
              rescue RestClient::ServiceUnavailable
                handle_error('U')
                false
              end

              # Fetches +link+ (a path relative to the configured domain).
              #
              # A response with an unexpected status code is recorded as a broken
              # page; a request that raises is recorded as an invalid link. When the
              # :ci option is set a CI report is written for each completed request.
              #
              # @return [Object, nil] the RestClient response, or nil on any failure
              def retrieve(link)
                test_suite = CI::Reporter::TestSuite.new(link)
                test_case  = CI::Reporter::TestCase.new(link)
                test_suite.start
                test_case.start
                puts "  Fetching.." if @verbose

                response = RestClient.get(options[:domain] + link, request_headers)
                test_suite.name = link
                test_case.name = link
                test_case.finish
                @visited_links << link

                valid = VALID_RESPONSE_CODES.include?(response.code)
                unless valid
                  @errors << Result.new(link, "Status code was #{response.code}")
                  @broken_pages << link
                  test_case.failures << Crawl::Failure.new(link, response.code, linked_from(link))
                end

                test_suite.testcases << test_case
                test_suite.finish
                @report_manager.write_report(test_suite) if options[:ci]
                valid ? response : nil
              rescue RestClient::Exception => e
                # Rescuing the RestClient base class (not just InternalServerError)
                # means any failure rest-client raises is recorded instead of
                # aborting the whole crawl.
                @errors << Result.new(link, "Error whilst retrieving page: #{e.message}")
                @invalid_links << link
                nil
              end

              # Request headers for #retrieve: a Basic Authorization header when a
              # username was configured, otherwise none.
              def request_headers
                return {} if options[:username].to_s.empty?

                { :authorization => "Basic #{@authorization}" }
              end

              # @return [String, nil] the page on which +target+ was first found
              def linked_from(target)
                @link_sources[target]
              end

              # Extracts the followable links from an HTML document and remembers,
              # for each link not seen before, that it was found on +source_link+.
              #
              # @return [Array<String>] links with the configured domain stripped
              def find_links(source_link, body)
                puts "  Finding links.." if @verbose

                # Skip anchors that trigger non-GET requests or unobtrusive scripts.
                anchors = Nokogiri::HTML(body).css('a').reject do |anchor|
                  anchor['onclick'].to_s =~ /f.method = 'POST'/ ||
                    anchor['data-method'].to_s =~ /put|post|delete/ ||
                    anchor['class'].to_s =~ /unobtrusive_/
                end

                raw_links = anchors.map { |anchor| anchor['href'] }.compact
                raw_links.map! { |href| href.sub(options[:domain], '') }
                # Whatever still starts with http:// points at another site.
                raw_links.reject! do |href|
                  href =~ %r{^http://} || IGNORE.any? { |pattern| href =~ pattern }
                end

                raw_links.each do |target_link|
                  next if @found_links.include?(target_link)

                  puts "    Adding #{target_link} found on #{source_link}" if @verbose
                  @link_sources[target_link] = source_link
                end

                raw_links
              end
            end
         
     | 
| 
         @@ -0,0 +1,30 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            # Describes one link that answered with an unexpected status code: the
            # link itself, the status code received and the page that linked to it.
            class Crawl::Failure
              attr_reader :link, :code, :from

              def initialize(link, code, from)
                @link, @code, @from = link, code, from
              end

              # Human-readable description of what went wrong.
              def message
                "Status code was #{code}"
              end

              # Human-readable description of where the link was found.
              def location
                "Linked from #{from}"
              end

              # The link doubles as the name of this failure.
              def name
                link
              end

              # Always reported as a failure...
              def failure?
                true
              end

              # ...and therefore never as an error.
              def error?
                !failure?
              end
            end
         
     | 
    
        data/lib/crawl/string.rb
    ADDED
    
    
    
        data/lib/crawl.rb
    ADDED
    
    | 
         @@ -0,0 +1,17 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
            # Loads the third-party gems, the standard-library modules and the gem's
            # own files. Nothing is printed while loading.
            require 'nokogiri'
            require 'rest_client'
            require 'ci/reporter/core'

            require 'base64'
            require 'set'
            require 'fileutils'
            require 'digest/sha1'
            require 'json'
            require 'tempfile'
            require 'tmpdir'

            require_relative "crawl/version"
            require_relative "crawl/engine"
            require_relative "crawl/string"
            require_relative "crawl/failure"
         
     | 
    
        metadata
    ADDED
    
    | 
         @@ -0,0 +1,90 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            --- !ruby/object:Gem::Specification
         
     | 
| 
      
 2 
     | 
    
         
            +
            name: crawl
         
     | 
| 
      
 3 
     | 
    
         
            +
            version: !ruby/object:Gem::Version
         
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.1
         
     | 
| 
      
 5 
     | 
    
         
            +
              prerelease: 
         
     | 
| 
      
 6 
     | 
    
         
            +
            platform: ruby
         
     | 
| 
      
 7 
     | 
    
         
            +
            authors:
         
     | 
| 
      
 8 
     | 
    
         
            +
            - Tor Erik Linnerud
         
     | 
| 
      
 9 
     | 
    
         
            +
            autorequire: 
         
     | 
| 
      
 10 
     | 
    
         
            +
            bindir: bin
         
     | 
| 
      
 11 
     | 
    
         
            +
            cert_chain: []
         
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2011-11-04 00:00:00.000000000 Z
         
     | 
| 
      
 13 
     | 
    
         
            +
            dependencies:
         
     | 
| 
      
 14 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 15 
     | 
    
         
            +
              name: nokogiri
         
     | 
| 
      
 16 
     | 
    
         
            +
              requirement: &70363418401240 !ruby/object:Gem::Requirement
         
     | 
| 
      
 17 
     | 
    
         
            +
                none: false
         
     | 
| 
      
 18 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 19 
     | 
    
         
            +
                - - ! '>='
         
     | 
| 
      
 20 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 21 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 22 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 23 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 24 
     | 
    
         
            +
              version_requirements: *70363418401240
         
     | 
| 
      
 25 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 26 
     | 
    
         
            +
              name: rest-client
         
     | 
| 
      
 27 
     | 
    
         
            +
              requirement: &70363418400700 !ruby/object:Gem::Requirement
         
     | 
| 
      
 28 
     | 
    
         
            +
                none: false
         
     | 
| 
      
 29 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 30 
     | 
    
         
            +
                - - ! '>='
         
     | 
| 
      
 31 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 32 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 33 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 34 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 35 
     | 
    
         
            +
              version_requirements: *70363418400700
         
     | 
| 
      
 36 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 37 
     | 
    
         
            +
              name: ci_reporter
         
     | 
| 
      
 38 
     | 
    
         
            +
              requirement: &70363418400280 !ruby/object:Gem::Requirement
         
     | 
| 
      
 39 
     | 
    
         
            +
                none: false
         
     | 
| 
      
 40 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 41 
     | 
    
         
            +
                - - ! '>='
         
     | 
| 
      
 42 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 43 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 44 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 45 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 46 
     | 
    
         
            +
              version_requirements: *70363418400280
         
     | 
| 
      
 47 
     | 
    
         
            +
            description: Crawl all pages on a domain, checking for errors
         
     | 
| 
      
 48 
     | 
    
         
            +
            email:
         
     | 
| 
      
 49 
     | 
    
         
            +
            - tor@alphasights.com
         
     | 
| 
      
 50 
     | 
    
         
            +
            executables:
         
     | 
| 
      
 51 
     | 
    
         
            +
            - crawl
         
     | 
| 
      
 52 
     | 
    
         
            +
            extensions: []
         
     | 
| 
      
 53 
     | 
    
         
            +
            extra_rdoc_files: []
         
     | 
| 
      
 54 
     | 
    
         
            +
            files:
         
     | 
| 
      
 55 
     | 
    
         
            +
            - .gitignore
         
     | 
| 
      
 56 
     | 
    
         
            +
            - Gemfile
         
     | 
| 
      
 57 
     | 
    
         
            +
            - Rakefile
         
     | 
| 
      
 58 
     | 
    
         
            +
            - bin/crawl
         
     | 
| 
      
 59 
     | 
    
         
            +
            - crawl.gemspec
         
     | 
| 
      
 60 
     | 
    
         
            +
            - lib/crawl.rb
         
     | 
| 
      
 61 
     | 
    
         
            +
            - lib/crawl/engine.rb
         
     | 
| 
      
 62 
     | 
    
         
            +
            - lib/crawl/failure.rb
         
     | 
| 
      
 63 
     | 
    
         
            +
            - lib/crawl/string.rb
         
     | 
| 
      
 64 
     | 
    
         
            +
            - lib/crawl/version.rb
         
     | 
| 
      
 65 
     | 
    
         
            +
            homepage: http://github.com/alphasights/crawl
         
     | 
| 
      
 66 
     | 
    
         
            +
            licenses: []
         
     | 
| 
      
 67 
     | 
    
         
            +
            post_install_message: 
         
     | 
| 
      
 68 
     | 
    
         
            +
            rdoc_options: []
         
     | 
| 
      
 69 
     | 
    
         
            +
            require_paths:
         
     | 
| 
      
 70 
     | 
    
         
            +
            - lib
         
     | 
| 
      
 71 
     | 
    
         
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 72 
     | 
    
         
            +
              none: false
         
     | 
| 
      
 73 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 74 
     | 
    
         
            +
              - - ! '>='
         
     | 
| 
      
 75 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 76 
     | 
    
         
            +
                  version: '0'
         
     | 
| 
      
 77 
     | 
    
         
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 78 
     | 
    
         
            +
              none: false
         
     | 
| 
      
 79 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 80 
     | 
    
         
            +
              - - ! '>='
         
     | 
| 
      
 81 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 82 
     | 
    
         
            +
                  version: '0'
         
     | 
| 
      
 83 
     | 
    
         
            +
            requirements: []
         
     | 
| 
      
 84 
     | 
    
         
            +
            rubyforge_project: 
         
     | 
| 
      
 85 
     | 
    
         
            +
            rubygems_version: 1.8.11
         
     | 
| 
      
 86 
     | 
    
         
            +
            signing_key: 
         
     | 
| 
      
 87 
     | 
    
         
            +
            specification_version: 3
         
     | 
| 
      
 88 
     | 
    
         
            +
            summary: Exhaustive search pages within a domain, reporting any page that returns a
         
     | 
| 
      
 89 
     | 
    
         
            +
              bad response code
         
     | 
| 
      
 90 
     | 
    
         
            +
            test_files: []
         
     |