shelob 0.1.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/Guardfile +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +8 -0
- data/bin/shelob +38 -0
- data/lib/extractor.rb +23 -0
- data/lib/link_result.rb +13 -0
- data/lib/resolver.rb +16 -0
- data/lib/shelob/version.rb +3 -0
- data/lib/shelob.rb +64 -0
- data/shelob.gemspec +28 -0
- data/test/test_extractor.rb +37 -0
- data/test/test_link_result.rb +29 -0
- data/test/test_resolver.rb +31 -0
- data/test/test_shelob.rb +97 -0
- metadata +165 -0
    
        checksums.yaml
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            SHA1:
         | 
| 3 | 
            +
              metadata.gz: 4fd39cfeaa059a074821a0d60ee93b464ff48819
         | 
| 4 | 
            +
              data.tar.gz: fc04fd19c13c1a970abfed9c126854da9ad6eaeb
         | 
| 5 | 
            +
            SHA512:
         | 
| 6 | 
            +
              metadata.gz: 72b0645723887dfb1942108c93e5cfda5f50de550d5a240f5da0119763fbd02745fdce0f88038b42849b863e54a81cf214db6285badab74eed6dbb082debd8ab
         | 
| 7 | 
            +
              data.tar.gz: e53ddc74da61a78b19acde7d9e87859c9682dc48bd044f736f51942d1ee51d79b573b868ee5c72e3c87e9ab1e4e922f6c3760357947a2497492f1b1ce9ecd693
         | 
    
        data/.gitignore
    ADDED
    
    
    
        data/Gemfile
    ADDED
    
    
    
        data/Guardfile
    ADDED
    
    
    
        data/LICENSE.txt
    ADDED
    
    | @@ -0,0 +1,22 @@ | |
| 1 | 
            +
            Copyright (c) 2013 Benjamin Nicholas
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            MIT License
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining
         | 
| 6 | 
            +
            a copy of this software and associated documentation files (the
         | 
| 7 | 
            +
            "Software"), to deal in the Software without restriction, including
         | 
| 8 | 
            +
            without limitation the rights to use, copy, modify, merge, publish,
         | 
| 9 | 
            +
            distribute, sublicense, and/or sell copies of the Software, and to
         | 
| 10 | 
            +
            permit persons to whom the Software is furnished to do so, subject to
         | 
| 11 | 
            +
            the following conditions:
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            The above copyright notice and this permission notice shall be
         | 
| 14 | 
            +
            included in all copies or substantial portions of the Software.
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         | 
| 17 | 
            +
            EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         | 
| 18 | 
            +
            MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         | 
| 19 | 
            +
            NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
         | 
| 20 | 
            +
            LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
         | 
| 21 | 
            +
            OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
         | 
| 22 | 
            +
            WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
         | 
    
        data/README.md
    ADDED
    
    | @@ -0,0 +1,29 @@ | |
| 1 | 
            +
            # Shelob
         | 
| 2 | 
            +
             | 
| 3 | 
            +
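            A giant spider that starts on a given page, finds all the links on it, ensures they resolve, and recurses into any link underneath the starting URL.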
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            ## Installation
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            Add this line to your application's Gemfile:
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                gem 'shelob'
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            And then execute:
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                $ bundle
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            Or install it yourself as:
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                $ gem install shelob
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            ## Usage
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            Run `shelob [options] root_url`; pass `-v` for simple progress output or `-r` for per-URL output, and `-h` to list all options.
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            ## Contributing
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            1. Fork it
         | 
| 26 | 
            +
            2. Create your feature branch (`git checkout -b my-new-feature`)
         | 
| 27 | 
            +
            3. Commit your changes (`git commit -am 'Add some feature'`)
         | 
| 28 | 
            +
            4. Push to the branch (`git push origin my-new-feature`)
         | 
| 29 | 
            +
            5. Create new Pull Request
         | 
    
        data/Rakefile
    ADDED
    
    
    
        data/bin/shelob
    ADDED
    
    | @@ -0,0 +1,38 @@ | |
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require 'optparse'
         | 
| 4 | 
            +
            require 'shelob'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            # Crawl the site rooted at args[0] and print every failed link.
            #
            # args    - positional command-line arguments; args[0] is the root URL.
            # options - option hash; only :verbose (0, 1 or 2) is read. It is passed
            #           in explicitly because top-level locals are not visible inside
            #           a method body.
            #
            # Returns a process exit status: 0 when no link failed, 1 otherwise.
            def main args, options = {verbose: 0}
              failures = Shelob::Spider.new(args[0], verbose: options[:verbose]).check
              puts failures
              failures.empty? ? 0 : 1
            end

            options = {verbose: 0}
            optparse = OptionParser.new do |opts|
              opts.banner = "Usage: shelob [options] root_url"

              # The block argument is false for the --no- form, which switches
              # the output back off instead of enabling it.
              opts.on('-v', "--[no-]verbose", "Print simple information(overrides -r)") do |flag|
                options[:verbose] = flag ? 1 : 0
              end

              opts.on('-r', '--[no-]really-verbose', "Print lots of information(overrides -v)") do |flag|
                options[:verbose] = flag ? 2 : 0
              end

              opts.on_tail('-h', '--help', 'Show this message') do
                puts opts
                exit
              end
            end

            # Parse in a separate statement: parse! returns the leftover arguments,
            # so chaining it onto the block would bind `optparse` to an Array and
            # the usage text below could never be printed.
            optparse.parse!

            if ARGV.empty?
              puts optparse
              exit 1
            end

            begin
              # exit raises SystemExit, which the StandardError rescue below does
              # not intercept.
              exit main(ARGV, options)
            rescue => ex
              STDERR.puts ex.message
              # Report the failure to the calling shell instead of exiting 0.
              exit 1
            end
         | 
| 38 | 
            +
             | 
    
        data/lib/extractor.rb
    ADDED
    
    | @@ -0,0 +1,23 @@ | |
| 1 | 
            +
            require 'rubygems'
         | 
| 2 | 
            +
            require 'nokogiri'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module Shelob
              # Pulls the anchor targets out of a fetched page.
              class Extractor
                # fetched - the fetch result to scan; it must respond to #body
                #           (HTML text) and #url (the address it was fetched from).
                def initialize fetched
                  @fetched = fetched
                end

                # Collect the href of every <a> element in the page body.
                #
                # Anchors without an href attribute are skipped. Root-relative links
                # ("/path") are prefixed with the scheme, host and non-default port
                # of the page's own URL; protocol-relative links ("//host/path")
                # receive only the page's scheme. Every other link is returned
                # untouched.
                #
                # Returns an Array of Strings in document order.
                def extract
                  content = Nokogiri::HTML(@fetched.body)
                  # anchor['href'] is nil for anchors such as <a name="top">
                  hrefs = content.css('a').map { |anchor| anchor['href'] }.compact

                  base = nil # parsed lazily, only when a relative link shows up
                  hrefs.map do |link|
                    if link.start_with? '//'
                      base ||= URI(@fetched.url)
                      "#{base.scheme}:#{link}"
                    elsif link.start_with? '/'
                      base ||= URI(@fetched.url)
                      "#{origin_of(base)}#{link}"
                    else
                      link
                    end
                  end
                end

                private

                # Build "scheme://host[:port]" for uri, leaving the port out when it
                # is absent or is the default for the scheme.
                def origin_of uri
                  explicit_port = uri.port && uri.port != uri.default_port
                  suffix = explicit_port ? ":#{uri.port}" : ''
                  "#{uri.scheme}://#{uri.host}#{suffix}"
                end
              end
            end
         | 
    
        data/lib/link_result.rb
    ADDED
    
    
    
        data/lib/resolver.rb
    ADDED
    
    | @@ -0,0 +1,16 @@ | |
| 1 | 
            +
            require 'link_result'
         | 
| 2 | 
            +
            require 'net/http'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module Shelob
              # Fetches one URL and packages the outcome as a LinkResult.
              class Resolver
                # url - the address to fetch, as a String.
                def initialize url
                  @uri = URI(url)
                end

                # Issue a GET request for the stored URI.
                #
                # Returns a LinkResult built from the URI string, the numeric
                # response code and the response body.
                def resolve
                  response = Net::HTTP.get_response(@uri)
                  status = response.code.to_i
                  LinkResult.new(@uri.to_s, status, response.body)
                end
              end
            end
         | 
    
        data/lib/shelob.rb
    ADDED
    
    | @@ -0,0 +1,64 @@ | |
| 1 | 
            +
            require "shelob/version"
         | 
| 2 | 
            +
            require "resolver"
         | 
| 3 | 
            +
            require "extractor"
         | 
| 4 | 
            +
            require "set"
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            module Shelob
              # Crawls outward from a starting URL in first-in, first-out order,
              # following only links that begin with that URL, and records every
              # fetch whose status is 400 or above.
              class Spider
                attr_accessor :hostname

                # hostname - the starting URL; also the prefix a link must have
                #            to be followed.
                # options  - Hash; :verbose of 1 prints a dot per request and
                #            :verbose of 2 prints each URL as it is checked.
                def initialize hostname, options = {}
                  @hostname = hostname
                  @queue = [ hostname ]
                  @urls = Set.new @queue
                  # Everything ever queued. Kept apart from @urls so a link that
                  # turns up on several pages before its first fetch is still
                  # queued, and therefore requested, only once.
                  @seen = Set.new @queue
                  @failures = []
                  @verbose = options[:verbose] == 1
                  @chatty = options[:verbose] == 2
                end

                # Drain the queue, fetching each URL and queueing the new links
                # found on it.
                #
                # Returns the Array of fetch results whose status was >= 400.
                def check
                  until @queue.empty?
                    url = @queue.shift
                    @urls << url

                    print '.' if @verbose
                    print "#{url}... " if @chatty

                    fetch = Resolver.new(url).resolve

                    @failures << fetch if fetch.status >= 400

                    Extractor.new(fetch).extract.each do |link|
                      next unless link.start_with? @hostname
                      # Set#add? returns nil when the link was already recorded
                      @queue.push(link) if @seen.add?(link)
                    end

                    puts "checked!" if @chatty
                  end

                  # Finish the row of progress dots so later output starts on
                  # a fresh line.
                  puts if @verbose

                  @failures
                end

                # Number of URLs still waiting in the queue.
                def remaining
                  @queue.size
                end

                # Number of distinct URLs taken off the queue so far; the start
                # URL is counted from construction.
                def requests
                  @urls.size
                end

                # The Set of URLs counted by #requests.
                def fetched
                  @urls
                end
              end
            end
         | 
    
        data/shelob.gemspec
    ADDED
    
    | @@ -0,0 +1,28 @@ | |
| 1 | 
            +
            # coding: utf-8
         | 
| 2 | 
            +
            lib = File.expand_path('../lib', __FILE__)
         | 
| 3 | 
            +
            $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
         | 
| 4 | 
            +
            require 'shelob/version'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            # Package definition for the shelob gem.
            Gem::Specification.new do |gem|
              # Identity
              gem.name        = "shelob"
              gem.version     = Shelob::VERSION
              gem.authors     = ["Benjamin Nicholas"]
              gem.email       = ["bnicholas@brandnetworksinc.com"]
              gem.description = %q{A giant spider that starts on a given page, finds all links on the page, ensure they resolve, and recurses if the link is underneath the starting url}
              gem.summary     = %q{Spider a site and check links}
              gem.license     = "MIT"

              # Contents: everything tracked by git, with executables and test
              # files picked out of that list by path prefix.
              gem.files         = `git ls-files`.split($/)
              gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
              gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
              gem.require_paths = ["lib"]

              # Development-only tooling; only bundler carries a version constraint.
              gem.add_development_dependency "bundler", "~> 1.3"
              %w[rake minitest webmock guard guard-minitest].each do |tool|
                gem.add_development_dependency tool
              end

              # Needed at runtime for HTML parsing.
              gem.add_runtime_dependency "nokogiri"
            end
         | 
| @@ -0,0 +1,37 @@ | |
| 1 | 
            +
            require 'minitest/autorun'
         | 
| 2 | 
            +
            require 'extractor'
         | 
| 3 | 
            +
            require 'link_result'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            # Specs for Shelob::Extractor: construction from a LinkResult and
            # extraction of absolute and root-relative anchors.
            describe Shelob::Extractor, "Link extracting module" do

              describe "when created" do
                it "should be created with a LinkResult" do
                  result = LinkResult.new("http://google.com", 200, '<html><head><title>resume</title></head><body><a href="http://bmnick.com">home</a><a href="http://bmnick.com/resume/resume.pdf">pdf</a></body></html>')
                  # Build the Extractor itself; asserting on the LinkResult alone
                  # never touched the class under test.
                  Shelob::Extractor.new(result).wont_be_nil
                end
              end

              describe "when used" do
                before do
                  # One page with absolute links, one with a root-relative link
                  @result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
                  @result2 = LinkResult.new("http://google.com/something", 200, '<html><head><title>hi</title></head><body><a href="/about">about</a></body></html>')
                  @le = Shelob::Extractor.new(@result)
                  @le2 = Shelob::Extractor.new(@result2)
                end

                it "should return a list of the links in the page" do
                  extracts = @le.extract
                  extracts.must_be_kind_of Array
                  extracts.must_equal ["http://bing.com", "http://yahoo.com"]
                end

                it "should transform relative links to absolute" do
                  extracts = @le2.extract
                  extracts.must_be_kind_of Array
                  extracts.must_equal ["http://google.com/about"]
                end

              end # describe

            end # describe
         | 
| 37 | 
            +
             | 
| @@ -0,0 +1,29 @@ | |
| 1 | 
            +
            require 'minitest/autorun'
         | 
| 2 | 
            +
            require 'link_result'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            describe LinkResult, "Link fetch result" do
         | 
| 5 | 
            +
              before do
         | 
| 6 | 
            +
                @result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
         | 
| 7 | 
            +
              end
         | 
| 8 | 
            +
             | 
| 9 | 
            +
              describe "when created" do
         | 
| 10 | 
            +
                it "should take three arguments" do
         | 
| 11 | 
            +
                  @result.wont_be_nil
         | 
| 12 | 
            +
                end
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                it "should save arguments" do
         | 
| 15 | 
            +
                  @result.url.must_equal "http://google.com"
         | 
| 16 | 
            +
                  @result.status.must_equal 200
         | 
| 17 | 
            +
                  @result.body.must_equal '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>'
         | 
| 18 | 
            +
                end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                it "should be immutable" do
         | 
| 21 | 
            +
                  proc { @result.status = 404 }.must_raise NoMethodError
         | 
| 22 | 
            +
                end
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                it "should have a clean string rep" do
         | 
| 25 | 
            +
                  @result.to_s.must_equal "200: http://google.com"
         | 
| 26 | 
            +
                end
         | 
| 27 | 
            +
              end
         | 
| 28 | 
            +
            end
         | 
| 29 | 
            +
             | 
| @@ -0,0 +1,31 @@ | |
| 1 | 
            +
            require 'minitest/autorun'
         | 
| 2 | 
            +
            require 'webmock/minitest'
         | 
| 3 | 
            +
            require 'resolver'
         | 
| 4 | 
            +
            require 'link_result'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            # Specs for Shelob::Resolver. The request is stubbed, so the suite
            # neither depends on a live website nor enables real network access
            # for other specs loaded into the same process.
            describe Shelob::Resolver, "Link fetching module" do
              before do
                stub_request(:get, "http://bmnick.com/ruby-c-extensions").to_return(status: 200, body: '<html><head><title>CExt</title></head><body>CExt</body></html>')
              end

              describe "when created" do
                it "should be created with a url" do
                  Shelob::Resolver.new("http://bmnick.com/ruby-c-extensions").wont_be_nil
                end
              end

              describe "when used" do
                before do
                  @resolver = Shelob::Resolver.new("http://bmnick.com/ruby-c-extensions")
                  @result = @resolver.resolve
                end

                it "should return a LinkResult" do
                  @result.must_be_kind_of LinkResult
                end

                it "should return live result" do
                  @result.body.must_match(/CExt/)
                end

                it "should carry the url and numeric status" do
                  @result.url.must_equal "http://bmnick.com/ruby-c-extensions"
                  @result.status.must_equal 200
                end

              end

            end
         | 
    
        data/test/test_shelob.rb
    ADDED
    
    | @@ -0,0 +1,97 @@ | |
| 1 | 
            +
            require 'minitest/autorun'
         | 
| 2 | 
            +
            require 'webmock/minitest'
         | 
| 3 | 
            +
            require 'shelob'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            # Stub out requests
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            describe Shelob, "Link checking module" do
         | 
| 8 | 
            +
              describe "when created" do
         | 
| 9 | 
            +
                it "should exist" do
         | 
| 10 | 
            +
                  Shelob.wont_be_nil
         | 
| 11 | 
            +
                end
         | 
| 12 | 
            +
              end
         | 
| 13 | 
            +
            end
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            describe Shelob::Spider, "Link checking spider" do
         | 
| 16 | 
            +
              before do
         | 
| 17 | 
            +
                stub_request(:any, 'http://bmnick.com/resume').to_return(body: '<html><head><title>resume</title></head><body><a href="http://bmnick.com">home</a><a href="http://bmnick.com/resume/resume.pdf">pdf</a><a href="http://bmnick.com/resume/secret"</body></html>')
         | 
| 18 | 
            +
                stub_request(:any, 'http://bmnick.com/').to_return(status: 200, body: '<html><head><title>pdf</title></head><body><a href="http://bmnick.com/resume/">resume</a><a href="http://bmnick.com/">home</a><a href="http://bmnick.com/resume/secret">no touchy!</a></body></html>')
         | 
| 19 | 
            +
                stub_request(:any, 'http://bmnick.com/resume/secret').to_return(body: '<html><head><title>secrets</title></head><body><a href="http://bmnick.com/resume/boring">boredom</a><a href="http://bmnick.com/resume">resume</a><a href="/resume/relative">relative</a></body></html>"')
         | 
| 20 | 
            +
                stub_request(:any, 'http://bmnick.com/resume/resume.pdf').to_return(status: 404)
         | 
| 21 | 
            +
                stub_request(:any, 'http://bmnick.com/resume/boring').to_return(status: 500)
         | 
| 22 | 
            +
                stub_request(:any, 'http://bmnick.com/resume/relative').to_return(status: 204)
         | 
| 23 | 
            +
              end
         | 
| 24 | 
            +
              describe "when created" do
         | 
| 25 | 
            +
                it "should exist" do
         | 
| 26 | 
            +
                  Shelob::Spider.wont_be_nil
         | 
| 27 | 
            +
                end
         | 
| 28 | 
            +
                it "should store the initial url" do
         | 
| 29 | 
            +
                  spider = Shelob::Spider.new("https://openforum.com")
         | 
| 30 | 
            +
                  spider.wont_be_nil
         | 
| 31 | 
            +
                  spider.hostname.must_equal "https://openforum.com"
         | 
| 32 | 
            +
                end
         | 
| 33 | 
            +
              end
         | 
| 34 | 
            +
              describe "when checking links" do
         | 
| 35 | 
            +
                before do
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                  @spider = Shelob::Spider.new("http://bmnick.com/resume")
         | 
| 38 | 
            +
                  @results = @spider.check
         | 
| 39 | 
            +
                end
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                it "should return an array from check" do 
         | 
| 42 | 
            +
                  @results.must_be_kind_of Array
         | 
| 43 | 
            +
                end
         | 
| 44 | 
            +
                it "should return only error links" do
         | 
| 45 | 
            +
                  @results.select{|r| r.status == 200}.must_be_empty
         | 
| 46 | 
            +
                end
         | 
| 47 | 
            +
                it "should provide remaining counts" do
         | 
| 48 | 
            +
                  @spider.remaining.must_equal 0
         | 
| 49 | 
            +
                end
         | 
| 50 | 
            +
                it "should fetch the original url" do
         | 
| 51 | 
            +
                  @spider.fetched.must_include "http://bmnick.com/resume"
         | 
| 52 | 
            +
                end
         | 
| 53 | 
            +
                it "should provide a number of urls fetched" do
         | 
| 54 | 
            +
                  # http://bmnick.com/resume
         | 
| 55 | 
            +
                  # http://bmnick.com/resume/resume.pdf
         | 
| 56 | 
            +
                  # http://bmnick.com/resume/secret
         | 
| 57 | 
            +
                  # http://bmnick.com/resume/boring
         | 
| 58 | 
            +
                  # http://bmnick.com/resume/relative
         | 
| 59 | 
            +
                  @spider.requests.must_equal 5
         | 
| 60 | 
            +
                end
         | 
| 61 | 
            +
                it "should make a web request for the original url" do
         | 
| 62 | 
            +
                  assert_requested :get, "http://bmnick.com/resume"
         | 
| 63 | 
            +
                end
         | 
| 64 | 
            +
                it "should make a web request for child urls" do
         | 
| 65 | 
            +
                  # 404
         | 
| 66 | 
            +
                  assert_requested :get, "http://bmnick.com/resume/resume.pdf"
         | 
| 67 | 
            +
                  @spider.fetched.must_include "http://bmnick.com/resume/resume.pdf"
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                  # successful
         | 
| 70 | 
            +
                  assert_requested :get, "http://bmnick.com/resume/secret"
         | 
| 71 | 
            +
                  @spider.fetched.must_include "http://bmnick.com/resume/secret"
         | 
| 72 | 
            +
                end
         | 
| 73 | 
            +
                it "should return the failed request" do
         | 
| 74 | 
            +
                  # http://bmnick.com/resume/resume.pdf => 404
         | 
| 75 | 
            +
                  # http://bmnick.com/resume/boring => 500
         | 
| 76 | 
            +
                  @results.count.must_equal 2
         | 
| 77 | 
            +
                end
         | 
| 78 | 
            +
                it "shouldn't request pages without the prefix" do
         | 
| 79 | 
            +
                  assert_not_requested :get, "http://bmnick.com"
         | 
| 80 | 
            +
                end
         | 
| 81 | 
            +
                it "shouldn't request pages multiple times" do
         | 
| 82 | 
            +
                  assert_requested :get, "http://bmnick.com/resume", times: 1
         | 
| 83 | 
            +
                end
         | 
| 84 | 
            +
                it "should continue to spider down the page" do
         | 
| 85 | 
            +
                  assert_requested :get, "http://bmnick.com/resume/boring"
         | 
| 86 | 
            +
                  @spider.fetched.must_include "http://bmnick.com/resume/boring"
         | 
| 87 | 
            +
                end
         | 
| 88 | 
            +
                it "should support relative links" do
         | 
| 89 | 
            +
                  assert_requested :get, "http://bmnick.com/resume/relative"
         | 
| 90 | 
            +
                  @spider.fetched.must_include "http://bmnick.com/resume/relative"
         | 
| 91 | 
            +
                end
         | 
| 92 | 
            +
                it "should format a string cleanly" do
         | 
| 93 | 
            +
                  @results.map{|r|r.to_s}.join("\n").must_equal "404: http://bmnick.com/resume/resume.pdf
         | 
| 94 | 
            +
            500: http://bmnick.com/resume/boring"
         | 
| 95 | 
            +
                end
         | 
| 96 | 
            +
              end
         | 
| 97 | 
            +
            end
         | 
    
        metadata
    ADDED
    
    | @@ -0,0 +1,165 @@ | |
| 1 | 
            +
            --- !ruby/object:Gem::Specification
         | 
| 2 | 
            +
            name: shelob
         | 
| 3 | 
            +
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            +
              version: 0.1.0.beta1
         | 
| 5 | 
            +
            platform: ruby
         | 
| 6 | 
            +
            authors:
         | 
| 7 | 
            +
            - Benjamin Nicholas
         | 
| 8 | 
            +
            autorequire: 
         | 
| 9 | 
            +
            bindir: bin
         | 
| 10 | 
            +
            cert_chain: []
         | 
| 11 | 
            +
            date: 2013-12-30 00:00:00.000000000 Z
         | 
| 12 | 
            +
            dependencies:
         | 
| 13 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 14 | 
            +
              name: bundler
         | 
| 15 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 16 | 
            +
                requirements:
         | 
| 17 | 
            +
                - - ~>
         | 
| 18 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 19 | 
            +
                    version: '1.3'
         | 
| 20 | 
            +
              type: :development
         | 
| 21 | 
            +
              prerelease: false
         | 
| 22 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 23 | 
            +
                requirements:
         | 
| 24 | 
            +
                - - ~>
         | 
| 25 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 26 | 
            +
                    version: '1.3'
         | 
| 27 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 28 | 
            +
              name: rake
         | 
| 29 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 30 | 
            +
                requirements:
         | 
| 31 | 
            +
                - - '>='
         | 
| 32 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 33 | 
            +
                    version: '0'
         | 
| 34 | 
            +
              type: :development
         | 
| 35 | 
            +
              prerelease: false
         | 
| 36 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 37 | 
            +
                requirements:
         | 
| 38 | 
            +
                - - '>='
         | 
| 39 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 40 | 
            +
                    version: '0'
         | 
| 41 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 42 | 
            +
              name: minitest
         | 
| 43 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 44 | 
            +
                requirements:
         | 
| 45 | 
            +
                - - '>='
         | 
| 46 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 47 | 
            +
                    version: '0'
         | 
| 48 | 
            +
              type: :development
         | 
| 49 | 
            +
              prerelease: false
         | 
| 50 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 51 | 
            +
                requirements:
         | 
| 52 | 
            +
                - - '>='
         | 
| 53 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 54 | 
            +
                    version: '0'
         | 
| 55 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 56 | 
            +
              name: webmock
         | 
| 57 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 58 | 
            +
                requirements:
         | 
| 59 | 
            +
                - - '>='
         | 
| 60 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 61 | 
            +
                    version: '0'
         | 
| 62 | 
            +
              type: :development
         | 
| 63 | 
            +
              prerelease: false
         | 
| 64 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 65 | 
            +
                requirements:
         | 
| 66 | 
            +
                - - '>='
         | 
| 67 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 68 | 
            +
                    version: '0'
         | 
| 69 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 70 | 
            +
              name: guard
         | 
| 71 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 72 | 
            +
                requirements:
         | 
| 73 | 
            +
                - - '>='
         | 
| 74 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 75 | 
            +
                    version: '0'
         | 
| 76 | 
            +
              type: :development
         | 
| 77 | 
            +
              prerelease: false
         | 
| 78 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 79 | 
            +
                requirements:
         | 
| 80 | 
            +
                - - '>='
         | 
| 81 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 82 | 
            +
                    version: '0'
         | 
| 83 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 84 | 
            +
              name: guard-minitest
         | 
| 85 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 86 | 
            +
                requirements:
         | 
| 87 | 
            +
                - - '>='
         | 
| 88 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 89 | 
            +
                    version: '0'
         | 
| 90 | 
            +
              type: :development
         | 
| 91 | 
            +
              prerelease: false
         | 
| 92 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 93 | 
            +
                requirements:
         | 
| 94 | 
            +
                - - '>='
         | 
| 95 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 96 | 
            +
                    version: '0'
         | 
| 97 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 98 | 
            +
              name: nokogiri
         | 
| 99 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 100 | 
            +
                requirements:
         | 
| 101 | 
            +
                - - '>='
         | 
| 102 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 103 | 
            +
                    version: '0'
         | 
| 104 | 
            +
              type: :runtime
         | 
| 105 | 
            +
              prerelease: false
         | 
| 106 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 107 | 
            +
                requirements:
         | 
| 108 | 
            +
                - - '>='
         | 
| 109 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 110 | 
            +
                    version: '0'
         | 
| 111 | 
            +
            description: A giant spider that starts on a given page, finds all links on the page,
         | 
| 112 | 
            +
              ensures they resolve, and recurses if the link is underneath the starting URL
         | 
| 113 | 
            +
            email:
         | 
| 114 | 
            +
            - bnicholas@brandnetworksinc.com
         | 
| 115 | 
            +
            executables:
         | 
| 116 | 
            +
            - shelob
         | 
| 117 | 
            +
            extensions: []
         | 
| 118 | 
            +
            extra_rdoc_files: []
         | 
| 119 | 
            +
            files:
         | 
| 120 | 
            +
            - .gitignore
         | 
| 121 | 
            +
            - Gemfile
         | 
| 122 | 
            +
            - Guardfile
         | 
| 123 | 
            +
            - LICENSE.txt
         | 
| 124 | 
            +
            - README.md
         | 
| 125 | 
            +
            - Rakefile
         | 
| 126 | 
            +
            - bin/shelob
         | 
| 127 | 
            +
            - lib/extractor.rb
         | 
| 128 | 
            +
            - lib/link_result.rb
         | 
| 129 | 
            +
            - lib/resolver.rb
         | 
| 130 | 
            +
            - lib/shelob.rb
         | 
| 131 | 
            +
            - lib/shelob/version.rb
         | 
| 132 | 
            +
            - shelob.gemspec
         | 
| 133 | 
            +
            - test/test_extractor.rb
         | 
| 134 | 
            +
            - test/test_link_result.rb
         | 
| 135 | 
            +
            - test/test_resolver.rb
         | 
| 136 | 
            +
            - test/test_shelob.rb
         | 
| 137 | 
            +
            homepage: 
         | 
| 138 | 
            +
            licenses:
         | 
| 139 | 
            +
            - MIT
         | 
| 140 | 
            +
            metadata: {}
         | 
| 141 | 
            +
            post_install_message: 
         | 
| 142 | 
            +
            rdoc_options: []
         | 
| 143 | 
            +
            require_paths:
         | 
| 144 | 
            +
            - lib
         | 
| 145 | 
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 146 | 
            +
              requirements:
         | 
| 147 | 
            +
              - - '>='
         | 
| 148 | 
            +
                - !ruby/object:Gem::Version
         | 
| 149 | 
            +
                  version: '0'
         | 
| 150 | 
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 151 | 
            +
              requirements:
         | 
| 152 | 
            +
              - - '>'
         | 
| 153 | 
            +
                - !ruby/object:Gem::Version
         | 
| 154 | 
            +
                  version: 1.3.1
         | 
| 155 | 
            +
            requirements: []
         | 
| 156 | 
            +
            rubyforge_project: 
         | 
| 157 | 
            +
            rubygems_version: 2.0.3
         | 
| 158 | 
            +
            signing_key: 
         | 
| 159 | 
            +
            specification_version: 4
         | 
| 160 | 
            +
            summary: Spider a site and check links
         | 
| 161 | 
            +
            test_files:
         | 
| 162 | 
            +
            - test/test_extractor.rb
         | 
| 163 | 
            +
            - test/test_link_result.rb
         | 
| 164 | 
            +
            - test/test_resolver.rb
         | 
| 165 | 
            +
            - test/test_shelob.rb
         |