cobweb 0.0.54 → 0.0.55
- data/README.textile +4 -1
- data/lib/cobweb.rb +24 -35
- data/lib/cobweb_crawler.rb +5 -0
- data/lib/cobweb_finished_job.rb +2 -0
- data/lib/cobweb_links.rb +8 -1
- data/lib/cobweb_process_job.rb +2 -0
- data/lib/cobweb_version.rb +4 -1
- data/lib/content_link_parser.rb +9 -2
- data/lib/crawl_job.rb +17 -13
- data/lib/encoding_safe_process_job.rb +2 -0
- data/lib/hash_util.rb +16 -0
- data/lib/redirect_error.rb +1 -0
- data/lib/robots.rb +2 -0
- data/lib/server.rb +7 -14
- data/lib/stats.rb +42 -29
- data/lib/uri_helper.rb +2 -0
- data/spec/spec_helper.rb +1 -0
- metadata +22 -21
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
 
-h1. Cobweb v0.0.
+h1. Cobweb v0.0.55
 
 h2. Intro
 
@@ -14,6 +14,7 @@ h3. Standalone
 CobwebCrawler takes the same options as cobweb itself, so you can use any of the options available for that. An example is listed below.
 
 bq. crawler = CobwebCrawler.new(:cache => 600);
+
 bq. stats = crawler.crawl("http://www.pepsico.com")
 
 While the crawler is running, you can view statistics on http://localhost:4567
@@ -111,6 +112,8 @@ h2. Todo
 * Add ability to start and stop crawls from web interface
 * Allow crawler to start as web interface only (ie not run crawls at start)
 * Fix content encoding issue requiring separate process job
+* DRY the cobweb get/head calls, its got a lot of duplication
+* Investigate using event machine for single threaded crawling
 
 h3. Big changes
 
data/lib/cobweb.rb
CHANGED
@@ -10,19 +10,15 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
   require file
 end
 
+# Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
 class Cobweb
-  ## TASKS
-
-  # redesign to have a resque stack and a single threaded stack
-  # dry the code below, its got a lot of duplication
-  # detect the end of the crawl (queued == 0 ?)
-  # on end of crawl, return statistic hash (could call specified method ?) if single threaded or enqueue to a specified queue the stat hash
-  # investigate using event machine for single threaded crawling
 
+  # retrieves current version
   def self.version
     CobwebVersion.version
   end
 
+  # used for setting default options
   def method_missing(method_sym, *arguments, &block)
     if method_sym.to_s =~ /^default_(.*)_to$/
       tag_name = method_sym.to_s.split("_")[1..-2].join("_").to_sym
@@ -32,6 +28,7 @@ class Cobweb
     end
   end
 
+  # See readme for more information on options available
   def initialize(options = {})
     @options = options
     default_use_encoding_safe_process_job_to false
@@ -49,6 +46,7 @@ class Cobweb
 
   end
 
+  # This method starts the resque based crawl and enqueues the base_url
   def start(base_url)
     raise ":base_url is required" unless base_url
     request = {
@@ -75,7 +73,20 @@ class Cobweb
 
     Resque.enqueue(CrawlJob, request)
   end
+
+  # Returns array of cookies from content
+  def get_cookies(response)
+    all_cookies = response.get_fields('set-cookie')
+    unless all_cookies.nil?
+      cookies_array = Array.new
+      all_cookies.each { |cookie|
+        cookies_array.push(cookie.split('; ')[0])
+      }
+      cookies = cookies_array.join('; ')
+    end
+  end
 
+  # Performs a HTTP GET request to the specified url applying the options supplied
   def get(url, options = @options)
     raise "url cannot be nil" if url.nil?
     uri = Addressable::URI.parse(url)
@@ -103,7 +114,7 @@ class Cobweb
     # check if it has already been cached
     if redis.get(unique_id) and @options[:cache]
       puts "Cache hit for #{url}" unless @options[:quiet]
-      content = deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
+      content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
     else
       # retrieve data
       unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -173,7 +184,7 @@ class Cobweb
         content[:body] = Base64.encode64(response.body)
       end
       content[:location] = response["location"]
-      content[:headers] = deep_symbolize_keys(response.to_hash)
+      content[:headers] = HashUtil.deep_symbolize_keys(response.to_hash)
       # parse data for links
       link_parser = ContentLinkParser.new(content[:url], content[:body])
       content[:links] = link_parser.link_data
@@ -233,17 +244,7 @@ class Cobweb
     content
   end
 
-
-    all_cookies = response.get_fields('set-cookie')
-    unless all_cookies.nil?
-      cookies_array = Array.new
-      all_cookies.each { |cookie|
-        cookies_array.push(cookie.split('; ')[0])
-      }
-      cookies = cookies_array.join('; ')
-    end
-  end
-
+  # Performs a HTTP HEAD request to the specified url applying the options supplied
   def head(url, options = @options)
     raise "url cannot be nil" if url.nil?
     uri = Addressable::URI.parse(url)
@@ -271,7 +272,7 @@ class Cobweb
     # check if it has already been cached
     if redis.get("head-#{unique_id}") and @options[:cache]
       puts "Cache hit for #{url}" unless @options[:quiet]
-      content = deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
+      content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
     else
       # retrieve data
       unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -379,18 +380,6 @@ class Cobweb
 
     content
   end
-
-
-  def deep_symbolize_keys(hash)
-    hash.keys.each do |key|
-      value = hash[key]
-      hash.delete(key)
-      hash[key.to_sym] = value
-      if hash[key.to_sym].instance_of? Hash
-        hash[key.to_sym] = deep_symbolize_keys(hash[key.to_sym])
-      end
-    end
-    hash
-  end
-
+
+  end
 end
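The cobweb.rb changes above pull the cookie handling into a named get_cookies method and route all key symbolizing through the new HashUtil class. A minimal sketch of the documented get/head interface; the URL and option values are illustrative, and only content keys visible in this diff are shown:

    require 'cobweb'

    cobweb = Cobweb.new(:cache => 600, :quiet => true)

    content = cobweb.get("http://www.pepsico.com")
    content[:location]   # redirect target, if any
    content[:headers]    # response headers, symbolized via HashUtil.deep_symbolize_keys
    content[:links]      # links grouped by ContentLinkParser

    # head takes the same options but never downloads the body
    headers_only = cobweb.head("http://www.pepsico.com")
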
data/lib/cobweb_crawler.rb
CHANGED
@@ -3,8 +3,10 @@ require 'date'
 require 'ap'
 #require 'namespaced_redis'
 
+# CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
 class CobwebCrawler
 
+  # See README for more information on options available
   def initialize(options={})
     @options = options
 
@@ -31,6 +33,7 @@ class CobwebCrawler
     @cobweb = Cobweb.new(@options)
   end
 
+  # Initiates a crawl starting at the base_url and applying the options supplied. Can also take a block that is executed and passed content hash and statistic hash'
   def crawl(base_url, crawl_options = {}, &block)
     @options[:base_url] = base_url unless @options.has_key? :base_url
 
@@ -107,7 +110,9 @@ class CobwebCrawler
 
 end
 
+# Monkey patch into String a starts_with method
 class String
+  # Monkey patch into String a starts_with method
   def cobweb_starts_with?(val)
     if self.length >= val.length
       self[0..val.length-1] == val
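The new comment on crawl documents that it also accepts a block, which receives the content hash and the statistics hash for each page. A short sketch of that form, reusing the README's example URL:

    crawler = CobwebCrawler.new(:cache => 600)

    # The block runs for each crawled page; crawl returns the final
    # statistics hash when the crawl completes.
    statistics = crawler.crawl("http://www.pepsico.com") do |content, stats|
      puts "crawled #{content[:url]}"
    end
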
data/lib/cobweb_finished_job.rb
CHANGED
data/lib/cobweb_links.rb
CHANGED
@@ -1,6 +1,8 @@
+
+# CobwebLinks processes links to determine whether they are internal or external links
 class CobwebLinks
 
-  #
+  # Initalise's internal and external patterns and sets up regular expressions
   def initialize(options={})
     @options = options
 
@@ -15,6 +17,7 @@ class CobwebLinks
 
   end
 
+  # Returns true if the link is matched to an internal_url and not matched to an external_url
   def internal?(link)
     if @options[:debug]
       puts "--------------------------------"
@@ -27,6 +30,7 @@ class CobwebLinks
     !@internal_patterns.select{|pattern| link.match(pattern)}.empty? && @external_patterns.select{|pattern| link.match(pattern)}.empty?
   end
 
+  # Returns true if the link is matched to an external_url or not matched to an internal_url
   def external?(link)
     if @options[:debug]
       puts "--------------------------------"
@@ -40,6 +44,7 @@ class CobwebLinks
   end
 
   private
+  # escapes characters with meaning in regular expressions and adds wildcard expression
   def escape_pattern_for_regex(pattern)
     pattern = pattern.gsub(".", "\\.")
     pattern = pattern.gsub("?", "\\?")
@@ -49,8 +54,10 @@ class CobwebLinks
   end
 end
 
+# Exception raised for :internal_urls missing from CobwebLinks
 class InternalUrlsMissingError < Exception
 end
+# Exception raised for :internal_urls being invalid from CobwebLinks
 class InvalidUrlsError < Exception
 end
 
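As the new comments state, a link counts as internal only when it matches an internal pattern and no external pattern. The sketch below assumes glob-style patterns (the escape_pattern_for_regex comment says a wildcard expression is added); the option keys match those passed around by CrawlJob:

    links = CobwebLinks.new(
      :internal_urls => ["http://www.example.com/*"],
      :external_urls => ["http://www.example.com/blog/*"])

    links.internal?("http://www.example.com/about")   # => true
    links.internal?("http://www.example.com/blog/1")  # => false, external pattern wins
    links.external?("http://www.example.com/blog/1")  # => true
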
data/lib/cobweb_process_job.rb
CHANGED
@@ -1,8 +1,10 @@
+# Dummy resque process job that is ran if none are specified
 class CobwebProcessJob
   require "ap"
 
   @queue = :cobweb_process_job
 
+  # Resque perform method
   def self.perform(content)
     content = HashHelper.symbolize_keys(content)
     puts "Dummy Processing for #{content[:url]}"
data/lib/cobweb_version.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
@@ -1,8 +1,10 @@
+require "nokogiri"
 
+# ContentLinkParser extracts links from HTML content and assigns them to a hash based on the location the link was found. The has contents can be configured in options, however, defaults to a pretty sensible default.
+# Links can also be returned regardless of the location they were located and can be filtered by the scheme
 class ContentLinkParser
 
-
-
+  # Parses the content and absolutizes the urls based on url. Options can be setup to determine the links that are extracted.
   def initialize(url, content, options = {})
     @options = options
     @url = url
@@ -29,6 +31,7 @@ class ContentLinkParser
 
   end
 
+  # Returns a hash with arrays of links
   def link_data
     data = {}
     @options[:tags].keys.each do |key|
@@ -37,6 +40,7 @@ class ContentLinkParser
     data
   end
 
+  # Returns an array of all absolutized links, specify :valid_schemes in options to limit to certain schemes. Also filters repeating folders (ie if the crawler got in a link loop situation)
   def all_links(options = {})
     options[:valid_schemes] = [:http, :https] unless options.has_key? :valid_schemes
     data = link_data
@@ -47,6 +51,7 @@ class ContentLinkParser
     links
   end
 
+  # Returns the type of links as a method rather than using the hash e.g. 'content_link_parser.images'
   def method_missing(m)
     if @options[:tags].keys.include?(m)
       links = []
@@ -60,6 +65,8 @@ class ContentLinkParser
     end
   end
 
+  private
+  # Processes the content to find links based on options[:tags]
   def find_matches(array, selector, attribute)
     if attribute.kind_of? String or attribute.kind_of? Symbol
       @doc.css(selector).each do |tag|
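Taken together, the new comments describe three ways to read links out of a parsed page; a brief sketch (the html variable stands in for a fetched page body):

    parser = ContentLinkParser.new("http://www.example.com/", html)

    parser.link_data                              # hash of link arrays keyed by where they were found
    parser.all_links(:valid_schemes => [:http])   # flat, absolutized, scheme-filtered
    parser.images                                 # tag groups exposed via method_missing
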
data/lib/crawl_job.rb
CHANGED
@@ -1,3 +1,5 @@
+
+# CrawlJob defines a resque job to perform the crawl
 class CrawlJob
 
   require "net/https"
@@ -7,10 +9,11 @@ class CrawlJob
 
   @queue = :cobweb_crawl_job
 
+  # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
   def self.perform(content_request)
 
     # change all hash keys to symbols
-    content_request =
+    content_request = HashUtil.deep_symbolize_keys(content_request)
 
     content_request[:redis_options] = {} unless content_request.has_key? :redis_options
     @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
@@ -81,12 +84,14 @@
 
   end
 
+  # Sets the crawl status to 'Crawl Stopped' and enqueues the crawl finished job
   def self.finished(content_request)
     # finished
     @stats.end_crawl(content_request)
     Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
   end
-
+
+  # Enqueues the content to the processing queue setup in options
   def self.send_to_processing_queue(content, content_request)
     content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
     if content_request[:use_encoding_safe_process_job]
@@ -102,14 +107,17 @@
 
   private
 
+  # Returns true if the crawl count is within limits
   def self.within_crawl_limits?(crawl_limit)
     crawl_limit.nil? or @crawl_counter < crawl_limit.to_i
   end
 
+  # Returns true if the queue count is calculated to be still within limits when complete
  def self.within_queue_limits?(crawl_limit)
    within_crawl_limits?(crawl_limit) and (crawl_limit.nil? or (@queue_counter + @crawl_counter) < crawl_limit.to_i)
  end
 
+  # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
   def self.set_base_url(redis, content, content_request)
     if redis.get("base_url").nil?
       unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
@@ -120,6 +128,7 @@
     end
   end
 
+  # Enqueues content to the crawl_job queue
   def self.enqueue_content(content_request, link)
     new_request = content_request.clone
     new_request[:url] = link
@@ -129,37 +138,32 @@
     increment_queue_counter
   end
 
+  # Increments the queue counter and refreshes crawl counters
   def self.increment_queue_counter
     @redis.incr "queue-counter"
     refresh_counters
   end
+  # Increments the crawl counter and refreshes crawl counters
   def self.increment_crawl_counter
     @redis.incr "crawl-counter"
     refresh_counters
   end
+  # Decrements the queue counter and refreshes crawl counters
   def self.decrement_queue_counter
     @redis.decr "queue-counter"
     refresh_counters
   end
+  # Refreshes the crawl counters
   def self.refresh_counters
     @crawl_counter = @redis.get("crawl-counter").to_i
     @queue_counter = @redis.get("queue-counter").to_i
   end
+  # Sets the crawl counters based on the crawled and queued queues
   def self.reset_counters
     @redis.set("crawl-counter", @redis.smembers("crawled").count)
     @redis.set("queue-counter", @redis.smembers("queued").count)
     @crawl_counter = @redis.get("crawl-counter").to_i
     @queue_counter = @redis.get("queue-counter").to_i
   end
-
-    hash.keys.each do |key|
-      value = hash[key]
-      hash.delete(key)
-      hash[key.to_sym] = value
-      if hash[key.to_sym].instance_of? Hash
-        hash[key.to_sym] = self.deep_symbolize_keys(hash[key.to_sym])
-      end
-    end
-    hash
-  end
+
 end
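The two limit predicates above read most easily with the counters inlined; a worked example under an assumed crawl_limit of 100:

    crawl_limit   = 100
    crawl_counter = 60   # pages already crawled
    queue_counter = 30   # links currently queued

    # within_crawl_limits?
    crawl_limit.nil? or crawl_counter < crawl_limit.to_i
    # => true (60 < 100)

    # within_queue_limits? additionally charges the queue against the limit
    (queue_counter + crawl_counter) < crawl_limit.to_i
    # => true (90 < 100), so ten more links may still be enqueued
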
data/lib/encoding_safe_process_job.rb
CHANGED
@@ -1,7 +1,9 @@
+# Process Job to resolve encoding issue
 class EncodingSafeProcessJob
 
   @queue = :encoding_safe_process_job
 
+  # Resque perform method
   def self.perform(content)
     clazz = const_get(content["processing_queue"])
     content["body"] = Base64.decode64(content["body"])
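This is the decode half of the round trip: cobweb.rb Base64-encodes the body (see content[:body] = Base64.encode64(response.body) above) so Resque's JSON payload never carries a non-UTF-8 string, and this job decodes it again before processing. A minimal illustration:

    require 'base64'

    body = "\xFF\xFEnot valid UTF-8".force_encoding("BINARY")  # example payload
    safe = Base64.encode64(body)    # plain ASCII, safe to serialize
    Base64.decode64(safe) == body   # => true
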
data/lib/hash_util.rb
ADDED
@@ -0,0 +1,16 @@
+# Collection of utility methods for the Hash object
+class HashUtil
+
+  # Returns a hash with the keys converted to symbols
+  def self.deep_symbolize_keys(hash)
+    hash.keys.each do |key|
+      value = hash[key]
+      hash.delete(key)
+      hash[key.to_sym] = value
+      if hash[key.to_sym].instance_of? Hash
+        hash[key.to_sym] = HashUtil.deep_symbolize_keys(hash[key.to_sym])
+      end
+    end
+    hash
+  end
+end
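The new helper replaces the private copies of deep_symbolize_keys removed from cobweb.rb, crawl_job.rb and server.rb. Note that it mutates the hash it is given and recurses into nested hashes, but not into hashes held inside arrays:

    raw = { "url" => "http://example.com",
            "headers" => { "content-type" => ["text/html"] } }

    HashUtil.deep_symbolize_keys(raw)
    # => {:url=>"http://example.com", :headers=>{:"content-type"=>["text/html"]}}
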
data/lib/redirect_error.rb
CHANGED
data/lib/robots.rb
CHANGED
data/lib/server.rb
CHANGED
@@ -1,6 +1,7 @@
 require 'sinatra'
 require 'haml'
 
+# Sinatra server to host the statistics for the CobwebCrawler
 class Server < Sinatra::Base
 
   set :views, settings.root + '/../views'
@@ -8,6 +9,7 @@ class Server < Sinatra::Base
   set :public_folder, settings.root + '/../public'
   enable :static
 
+  # Sinatra Dashboard
   get '/' do
     @full_redis = Redis.new
 
@@ -27,6 +29,7 @@ class Server < Sinatra::Base
     haml :home
   end
 
+  # Sinatra Crawl Detail
   get '/statistics/:crawl_id' do
     redis = NamespacedRedis.new({}, "cobweb-#{params[:crawl_id]}")
 
@@ -58,6 +61,7 @@ class Server < Sinatra::Base
     haml :statistics
   end
 
+  # Starts the Sinatra server, and kills the processes when shutdown
   def self.start
     unless Server.running?
       thread = Thread.new do
@@ -72,21 +76,10 @@ class Server < Sinatra::Base
 
 end
 
-class
-  def self.deep_symbolize_keys(hash)
-    hash.keys.each do |key|
-      value = hash[key]
-      hash.delete(key)
-      hash[key.to_sym] = value
-      if hash[key.to_sym].instance_of? Hash
-        hash[key.to_sym] = HashUtil.deep_symbolize_keys(hash[key.to_sym])
-      end
-    end
-    hash
-  end
-end
-
+# Monkey Patch of the Numeric class
 class Numeric
+
+  #Returns a human readable format for a number representing a data size
   def to_human
     units = %w{B KB MB GB TB}
     ap self
data/lib/stats.rb
CHANGED
@@ -1,10 +1,14 @@
+
+# Stats class is the main statisitics hub for monitoring crawls. Either can be viewed through the Sinatra interface, or returned from the CobwebCrawler.crawl method or block
 class Stats
 
+  # Sets up redis usage for statistics
   def initialize(options)
     @full_redis = Redis.new(options[:redis_options])
     @redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{options[:crawl_id]}")
   end
 
+  # Sets up the crawl in statistics
   def start_crawl(options)
     unless @full_redis.sismember "cobweb_crawls", options[:crawl_id]
       @full_redis.sadd "cobweb_crawls", options[:crawl_id]
@@ -15,12 +19,14 @@ class Stats
     @redis.hset "statistics", "current_status", "Crawl Starting..."
   end
 
+  # Removes the crawl from the running crawls and updates status
   def end_crawl(options)
     @full_redis.srem "cobweb_crawls", options[:crawl_id]
     @redis.hset "statistics", "current_status", "Crawl Stopped"
     @redis.del "crawl_details"
   end
 
+  # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
   def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
 
     @statistics = get_statistics
@@ -125,6 +131,41 @@ class Stats
     @statistics
   end
 
+  # Returns the statistics hash
+  def get_statistics
+
+    @statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
+    if @statistics[:status_counts].nil?
+      @statistics[:status_counts]
+    else
+      @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
+    end
+    if @statistics[:mime_counts].nil?
+      @statistics[:mime_counts]
+    else
+      @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
+    end
+    @statistics
+  end
+
+  # Sets the current status of the crawl
+  def update_status(status)
+    @redis.hset "statistics", "current_status", status
+  end
+
+  # Returns the current status of the crawl
+  def get_status
+    @redis.hget "statistics", "current_status"
+  end
+
+  # Sets totals for the end of the crawl (Not Used)
+  def set_totals
+    stats = get_statistics
+    stats[:crawled] = @redis.smembers "crawled"
+  end
+
+  private
+  # Records a time based statistic
   def record_time_stat(stat_name, value, type="minute", duration=60)
     key = DateTime.now.strftime("%Y-%m-%d %H:%M")
     if type == "hour"
@@ -142,6 +183,7 @@ class Stats
     end
   end
 
+  # Increments a time based statistic (eg pages per minute)
   def increment_time_stat(stat_name, type="minute", duration=60)
     key = DateTime.now.strftime("%Y-%m-%d %H:%M")
     if type == "hour"
@@ -162,35 +204,6 @@ class Stats
     end
   end
 
-  def get_statistics
-
-    @statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
-    if @statistics[:status_counts].nil?
-      @statistics[:status_counts]
-    else
-      @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
-    end
-    if @statistics[:mime_counts].nil?
-      @statistics[:mime_counts]
-    else
-      @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
-    end
-    @statistics
-  end
-
-  def update_status(status)
-    @redis.hset "statistics", "current_status", status
-  end
-
-  def get_status
-    @redis.hget "statistics", "current_status"
-  end
-
-  def set_totals
-    stats = get_statistics
-    stats[:crawled] = @redis.smembers "crawled"
-  end
-
 end
 
 
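get_statistics, update_status, get_status and set_totals now sit above the private marker, ahead of the time-stat helpers, so they can be called directly. A hedged sketch of direct use, mirroring the options CrawlJob passes to Stats.new:

    stats = Stats.new(:redis_options => {}, :crawl_id => "example-crawl")

    stats.update_status("Crawl Starting...")
    stats.get_status        # => "Crawl Starting..."
    stats.get_statistics    # symbolized hash; status_counts/mime_counts JSON-parsed
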
data/lib/uri_helper.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.54
+  version: 0.0.55
 prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ date: 2012-05-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &
+  requirement: &70166180363640 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180363640
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &
+  requirement: &70166180362480 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180362480
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &
+  requirement: &70166180361540 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180361540
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &
+  requirement: &70166180359680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180359680
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &
+  requirement: &70166180357240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180357240
 - !ruby/object:Gem::Dependency
   name: awesome_print
-  requirement: &
+  requirement: &70166180380020 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180380020
 - !ruby/object:Gem::Dependency
   name: sinatra
-  requirement: &
+  requirement: &70166180377140 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180377140
 - !ruby/object:Gem::Dependency
   name: thin
-  requirement: &
+  requirement: &70166180389240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180389240
 - !ruby/object:Gem::Dependency
   name: haml
-  requirement: &
+  requirement: &70166180388200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180388200
 - !ruby/object:Gem::Dependency
   name: namespaced_redis
-  requirement: &
+  requirement: &70166180387040 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
         version: 1.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180387040
 description: Web Crawler that uses resque background job engine to allow you to cluster
   your crawl.
 email: stewart@rockwellcottage.com
@@ -146,6 +146,7 @@ files:
 - lib/content_link_parser.rb
 - lib/crawl_job.rb
 - lib/encoding_safe_process_job.rb
+- lib/hash_util.rb
 - lib/redirect_error.rb
 - lib/robots.rb
 - lib/server.rb