RubyGems - shelob - Versions diffs - 0.1.0.beta2 → 0.1.0.beta3 - Mend

shelob 0.1.0.beta2 → 0.1.0.beta3

Files changed (9) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6169a929fd11d18cbd5a14c6901ff233c27a56a0
-  data.tar.gz: b14b82eebbaf401303286b5e1c5744157c471a34
+  metadata.gz: 7554cee96efb6430592a84c0954da5df9cb7efc2
+  data.tar.gz: 1cf0cfad3ed2f1505f88cd45c22394b09d55c27a
 SHA512:
-  metadata.gz: c8f0f6363eb626baceab44365fea723a41dc4859a574aa881c87e3aed3bee96cbf3d24e00b6d795bdf83d535a521da23cc8d07b14a3cb91dd9333e624b09bc77
-  data.tar.gz: 80aa2da0b5596a9f4a294ddb0a6669728db1ca23e8e6980f9ac530dc761836b79fde792c62de623d41f0477e72014568e48728a70cd788167b4f03bd6913b102
+  metadata.gz: 94d3264022e2e80736a54eb5fe1d0e68e2252be4e4008fc2855df6e841dbd141a073009c97fe10e8121ab06ccae9bf21aa558a26f0067bc0e7dc9985bc836bfd
+  data.tar.gz: 8f2d4e39cae612176646421eb29d1441a24cfacf2ae64c00c6d08fa15e341d72d89d21c1106ed462d2a2d4cbd345bf7296d9a8da54ba30bb0d3b3afe03b245aa

data/README.md CHANGED Viewed

@@ -1,12 +1,21 @@
-# LinkChecker
+# Shelob
-TODO: Write a gem description
+Shelob is a giant spider that starts on a given page, finds all links on the page, ensures they resolve, and recurses if the link is underneath the starting url. It is intended primarily for double-checking that your site has no horrible error pages that a user could reach by clicking on a link.
+## Usage
+    shelob [-r|v] root_url
+    -r: really verbose, will print each url it checks
+    -v: verbose, will just print a progress indicator for each url so you don't think it just stopped
+You can also use the link resolver, extractor, or the spider itself programmatically. Check the tests for usage until I can write up some good documentation...
 ## Installation
 Add this line to your application's Gemfile:
-    gem 'link_checker'
+    gem 'shelob'
 And then execute:
@@ -14,16 +23,13 @@ And then execute:
 Or install it yourself as:
-    $ gem install link_checker
-## Usage
-TODO: Write usage instructions here
+    $ gem install shelob
 ## Contributing
 1. Fork it
 2. Create your feature branch (`git checkout -b my-new-feature`)
 3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Make sure you have tests, and they pass! (`rake`)
 5. Push to the branch (`git push origin my-new-feature`)
 6. Create new Pull Request

data/lib/link_result.rb CHANGED Viewed

@@ -10,4 +10,8 @@ class LinkResult
   def to_s
     "#{@status}: #{@url}"
   end
+  def failed
+    @status.to_i >= 400
+  end
 end

data/lib/shelob/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Shelob
-  VERSION = "0.1.0.beta2"
+  VERSION = "0.1.0.beta3"
 end

data/lib/shelob.rb CHANGED Viewed

@@ -4,59 +4,151 @@ require "extractor"
 require "set"
 module Shelob
+  # This is the central workhorse class of Shelob. It takes
+  # a url, fetches it, and then spiders through any
+  # children of that url and fetches them as well.
   class Spider
+    # The root url which this Spider instance is working
+    # underneath
     attr_accessor :hostname
+    # Create a new spider with the given hostname and
+    # options
+    #
+    # Valid options:
+    # * :verbose: 0 for no output, 1 for progress output, 2
+    # for verbose output
     def initialize hostname, options = {}
+      # Data
       @hostname = hostname
-      @queue = [ hostname ]
-      @urls = Set.new @queue
-      @failures = []
+      # Options
       @verbose = options[:verbose] == 1 ? true : false
       @chatty = options[:verbose] == 2 ? true : false
+      # Internal
+      @queue = [ hostname ]
     end
-    def check
-      while not @queue.empty?
-        url = @queue.shift
-        @urls << url
+    # Notify that a url is about to be processed. Currently
+    # only used to print status
+    def pre_process_notify url
+      print "#{url}... " if @chatty
+    end
+    # Notify that a url has just been processed. Currently
+    # only used to print status
+    def post_process_notify url
+      print '.' if @verbose
+      puts "checked!" if @chatty
+    end
+    # Load a page from the internet, appending it to the
+    # failures array if the fetch encountered an error.
+    #
+    # Returns a LinkResult with the results of fetching the
+    # page.
+    def fetch url
+      page = Resolver.new(url).resolve
+      @failures << page if page.failed
+      page
+    end
+    # Extract links from the given url.
+    #
+    # Returns an array of all link targets on the page.
+    def extract url
+      page = fetch url
+      Extractor.new(page).extract
+    end
+    # Filter links to ensure they are children of the root
+    # url, and remove duplicates
+    def filter links
+      links.select do |link|
+        link.start_with? @hostname
+      end.uniq
+    end
+    # Add the given links to our internal queue to ensure
+    # they are checked.
+    def enqueue links
+      children = filter links
+      @queue.push(*children)
+    end
+    # Signal that processing is done on a given url, so
+    # that it won't be checked again
+    def finish url
+      @urls << url
+    end
-        if @verbose
-          print '.'
-        end
+    # Given a url, fetch it, extract all links, and enqueue
+    # those links for later processing.
+    def process url
+      links = extract url
-        if @chatty
-          print "#{url}... "
-        end
+      enqueue links
-        fetch = Resolver.new(url).resolve
+      finish url
+    end
-        @failures << fetch if fetch.status >= 400
+    # Internal helper method to kick off the spider once
+    # everything has been properly configured.
+    def run_spider
+      while not @queue.empty?
+        url = @queue.shift
-        links = Extractor.new(fetch).extract
+        next if @urls.include? url
-        filtered = links.select do |link|
-          link.start_with? @hostname and !@urls.include? link
-        end
+        pre_process_notify url
-        if @chatty
-          puts "checked!"
-        end
+        process url
-        @queue.push(*filtered)
+        post_process_notify url
       end
+    end
+    # Entry point to the main spider process. This is the
+    # main API point, and will return once the site has
+    # been completely spidered.
+    #
+    # Returns a list of all failed urls, and their
+    # particular error code (404, 500, etc.)
+    def check
+      # set up variables
+      @urls ||= Set.new
+      @failures ||= []
+      # kick the spider off
+      run_spider
       @failures
     end
+    # Returns a count of the remaining urls to parse - this
+    # number is only a view of the current state, as more
+    # urls are constantly being added as other urls
+    # resolve.
+    #
+    # This would only be useful to call from another thread
+    # at this time, as check is a blocking call
     def remaining
       return @queue.count
     end
+    # Return the total number of urls that were fetched in
+    # the spidering process.
     def requests
       return @urls.count
     end
+    # Return an array of all urls that were fetched in the
+    # process of spidering the site.
     def fetched
       return @urls
     end

data/shelob.gemspec CHANGED Viewed

@@ -6,6 +6,7 @@ require 'shelob/version'
 Gem::Specification.new do |spec|
   spec.name          = "shelob"
   spec.version       = Shelob::VERSION
+  spec.homepage      = 'https://github.com/bmnick/shelob'
   spec.authors       = ["Benjamin Nicholas"]
   spec.email         = ["bnicholas@brandnetworksinc.com"]
   spec.description   = %q{A giant spider that starts on a given page, finds all links on the page, ensure they resolve, and recurses if the link is underneath the starting url}

data/test/test_link_result.rb CHANGED Viewed

@@ -4,6 +4,7 @@ require 'link_result'
 describe LinkResult, "Link fetch result" do
   before do
     @result = LinkResult.new("http://google.com", 200, '<html><head><title>hi</title></head><body><a href="http://bing.com">bing</a><a href="http://yahoo.com">yahoo</a></body></html>')
+    @failed = LinkResult.new("http://google.com", 404, 'Not found')
   end
   describe "when created" do
@@ -24,6 +25,11 @@ describe LinkResult, "Link fetch result" do
     it "should have a clean string rep" do
       @result.to_s.must_equal "200: http://google.com"
     end
+    it "should determine if a request is failed" do
+      @result.failed.must_equal false
+      @failed.failed.must_equal true
+    end
   end
 end

data/test/test_shelob.rb CHANGED Viewed

@@ -14,7 +14,7 @@ end
 describe Shelob::Spider, "Link checking spider" do
   before do
-    stub_request(:any, 'http://bmnick.com/resume').to_return(body: '<html><head><title>resume</title></head><body><a href="http://bmnick.com">home</a><a href="http://bmnick.com/resume/resume.pdf">pdf</a><a href="http://bmnick.com/resume/secret"</body></html>')
+    stub_request(:any, 'http://bmnick.com/resume').to_return(body: '<html><head><title>resume</title></head><body><a href="http://bmnick.com">home</a><a href="http://bmnick.com/resume/resume.pdf">pdf</a><a href="http://bmnick.com/resume/secret"</body></html>').times(1).then.to_return(status: 514)
     stub_request(:any, 'http://bmnick.com/').to_return(status: 200, body: '<html><head><title>pdf</title></head><body><a href="http://bmnick.com/resume/">resume</a><a href="http://bmnick.com/">home</a><a href="http://bmnick.com/resume/secret">no touchy!</a></body></html>')
     stub_request(:any, 'http://bmnick.com/resume/secret').to_return(body: '<html><head><title>secrets</title></head><body><a href="http://bmnick.com/resume/boring">boredom</a><a href="http://bmnick.com/resume">resume</a><a href="/resume/relative">relative</a></body></html>"')
     stub_request(:any, 'http://bmnick.com/resume/resume.pdf').to_return(status: 404)
@@ -26,9 +26,9 @@ describe Shelob::Spider, "Link checking spider" do
       Shelob::Spider.wont_be_nil
     end
     it "should store the initial url" do
-      spider = Shelob::Spider.new("https://openforum.com")
+      spider = Shelob::Spider.new("http://bmnick.com")
       spider.wont_be_nil
-      spider.hostname.must_equal "https://openforum.com"
+      spider.hostname.must_equal "http://bmnick.com"
     end
   end
   describe "when checking links" do

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: shelob
 version: !ruby/object:Gem::Version
-  version: 0.1.0.beta2
+  version: 0.1.0.beta3
 platform: ruby
 authors:
 - Benjamin Nicholas
@@ -134,7 +134,7 @@ files:
 - test/test_link_result.rb
 - test/test_resolver.rb
 - test/test_shelob.rb
-homepage:
+homepage: https://github.com/bmnick/shelob
 licenses:
 - MIT
 metadata: {}