cobweb 0.0.54 → 0.0.55
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +4 -1
- data/lib/cobweb.rb +24 -35
- data/lib/cobweb_crawler.rb +5 -0
- data/lib/cobweb_finished_job.rb +2 -0
- data/lib/cobweb_links.rb +8 -1
- data/lib/cobweb_process_job.rb +2 -0
- data/lib/cobweb_version.rb +4 -1
- data/lib/content_link_parser.rb +9 -2
- data/lib/crawl_job.rb +17 -13
- data/lib/encoding_safe_process_job.rb +2 -0
- data/lib/hash_util.rb +16 -0
- data/lib/redirect_error.rb +1 -0
- data/lib/robots.rb +2 -0
- data/lib/server.rb +7 -14
- data/lib/stats.rb +42 -29
- data/lib/uri_helper.rb +2 -0
- data/spec/spec_helper.rb +1 -0
- metadata +22 -21
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
 
-h1. Cobweb v0.0.54
+h1. Cobweb v0.0.55
 
 h2. Intro
 
@@ -14,6 +14,7 @@ h3. Standalone
 CobwebCrawler takes the same options as cobweb itself, so you can use any of the options available for that. An example is listed below.
 
 bq. crawler = CobwebCrawler.new(:cache => 600);
+
 bq. stats = crawler.crawl("http://www.pepsico.com")
 
 While the crawler is running, you can view statistics on http://localhost:4567
@@ -111,6 +112,8 @@ h2. Todo
 * Add ability to start and stop crawls from web interface
 * Allow crawler to start as web interface only (ie not run crawls at start)
 * Fix content encoding issue requiring separate process job
+* DRY the cobweb get/head calls, its got a lot of duplication
+* Investigate using event machine for single threaded crawling
 
 h3. Big changes
 
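For readers skimming this diff, the README usage above condenses to the following standalone sketch. It assumes the cobweb gem is installed and a local Redis instance is available; the :quiet option is taken from the library hunks further down rather than from the README itself:

    require 'cobweb'

    # standalone crawl with response caching for 600 seconds
    crawler = CobwebCrawler.new(:cache => 600, :quiet => true)
    stats = crawler.crawl("http://www.pepsico.com")

    # stats is the statistics hash; the Sinatra monitor runs on
    # http://localhost:4567 while the crawl is in progress
    puts stats.inspect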
data/lib/cobweb.rb
CHANGED
@@ -10,19 +10,15 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
   require file
 end
 
+# Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
 class Cobweb
-  ## TASKS
-
-  # redesign to have a resque stack and a single threaded stack
-  # dry the code below, its got a lot of duplication
-  # detect the end of the crawl (queued == 0 ?)
-  # on end of crawl, return statistic hash (could call specified method ?) if single threaded or enqueue to a specified queue the stat hash
-  # investigate using event machine for single threaded crawling
 
+  # retrieves current version
   def self.version
     CobwebVersion.version
   end
 
+  # used for setting default options
   def method_missing(method_sym, *arguments, &block)
     if method_sym.to_s =~ /^default_(.*)_to$/
       tag_name = method_sym.to_s.split("_")[1..-2].join("_").to_sym
@@ -32,6 +28,7 @@ class Cobweb
     end
   end
 
+  # See readme for more information on options available
   def initialize(options = {})
     @options = options
     default_use_encoding_safe_process_job_to false
@@ -49,6 +46,7 @@ class Cobweb
 
   end
 
+  # This method starts the resque based crawl and enqueues the base_url
   def start(base_url)
     raise ":base_url is required" unless base_url
     request = {
@@ -75,7 +73,20 @@ class Cobweb
 
     Resque.enqueue(CrawlJob, request)
   end
+
+  # Returns array of cookies from content
+  def get_cookies(response)
+    all_cookies = response.get_fields('set-cookie')
+    unless all_cookies.nil?
+      cookies_array = Array.new
+      all_cookies.each { |cookie|
+        cookies_array.push(cookie.split('; ')[0])
+      }
+      cookies = cookies_array.join('; ')
+    end
+  end
 
+  # Performs a HTTP GET request to the specified url applying the options supplied
   def get(url, options = @options)
     raise "url cannot be nil" if url.nil?
     uri = Addressable::URI.parse(url)
@@ -103,7 +114,7 @@ class Cobweb
     # check if it has already been cached
     if redis.get(unique_id) and @options[:cache]
       puts "Cache hit for #{url}" unless @options[:quiet]
-      content = deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
+      content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
     else
       # retrieve data
       unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -173,7 +184,7 @@ class Cobweb
       content[:body] = Base64.encode64(response.body)
       end
       content[:location] = response["location"]
-      content[:headers] = deep_symbolize_keys(response.to_hash)
+      content[:headers] = HashUtil.deep_symbolize_keys(response.to_hash)
       # parse data for links
       link_parser = ContentLinkParser.new(content[:url], content[:body])
       content[:links] = link_parser.link_data
@@ -233,17 +244,7 @@ class Cobweb
     content
   end
 
-
-    all_cookies = response.get_fields('set-cookie')
-    unless all_cookies.nil?
-      cookies_array = Array.new
-      all_cookies.each { |cookie|
-        cookies_array.push(cookie.split('; ')[0])
-      }
-      cookies = cookies_array.join('; ')
-    end
-  end
-
+  # Performs a HTTP HEAD request to the specified url applying the options supplied
   def head(url, options = @options)
     raise "url cannot be nil" if url.nil?
     uri = Addressable::URI.parse(url)
@@ -271,7 +272,7 @@ class Cobweb
     # check if it has already been cached
     if redis.get("head-#{unique_id}") and @options[:cache]
       puts "Cache hit for #{url}" unless @options[:quiet]
-      content = deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
+      content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
     else
       # retrieve data
       unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -379,18 +380,6 @@ class Cobweb
 
     content
   end
-
-
-  def deep_symbolize_keys(hash)
-    hash.keys.each do |key|
-      value = hash[key]
-      hash.delete(key)
-      hash[key.to_sym] = value
-      if hash[key.to_sym].instance_of? Hash
-        hash[key.to_sym] = deep_symbolize_keys(hash[key.to_sym])
-      end
-    end
-    hash
-  end
-
+
+
 end
data/lib/cobweb_crawler.rb
CHANGED
@@ -3,8 +3,10 @@ require 'date'
 require 'ap'
 #require 'namespaced_redis'
 
+# CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
 class CobwebCrawler
 
+  # See README for more information on options available
   def initialize(options={})
     @options = options
 
@@ -31,6 +33,7 @@ class CobwebCrawler
     @cobweb = Cobweb.new(@options)
   end
 
+  # Initiates a crawl starting at the base_url and applying the options supplied. Can also take a block that is executed and passed content hash and statistic hash'
   def crawl(base_url, crawl_options = {}, &block)
     @options[:base_url] = base_url unless @options.has_key? :base_url
 
@@ -107,7 +110,9 @@ class CobwebCrawler
 
 end
 
+# Monkey patch into String a starts_with method
 class String
+  # Monkey patch into String a starts_with method
   def cobweb_starts_with?(val)
     if self.length >= val.length
       self[0..val.length-1] == val
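The new comment on crawl documents a block form that receives the content hash and the statistics hash. A hedged sketch of that form (only the :url key of the content hash is confirmed elsewhere in this diff; everything else is illustrative):

    require 'cobweb'

    crawler = CobwebCrawler.new(:cache => 600)

    # the block runs as pages are processed; both arguments are hashes
    statistics = crawler.crawl("http://www.pepsico.com") do |content, stats|
      puts "crawled #{content[:url]}"
      puts "statistics so far: #{stats.inspect}"
    end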
data/lib/cobweb_finished_job.rb
CHANGED
data/lib/cobweb_links.rb
CHANGED
@@ -1,6 +1,8 @@
+
+# CobwebLinks processes links to determine whether they are internal or external links
 class CobwebLinks
 
-  #
+  # Initalise's internal and external patterns and sets up regular expressions
   def initialize(options={})
     @options = options
 
@@ -15,6 +17,7 @@ class CobwebLinks
 
   end
 
+  # Returns true if the link is matched to an internal_url and not matched to an external_url
   def internal?(link)
     if @options[:debug]
       puts "--------------------------------"
@@ -27,6 +30,7 @@ class CobwebLinks
     !@internal_patterns.select{|pattern| link.match(pattern)}.empty? && @external_patterns.select{|pattern| link.match(pattern)}.empty?
   end
 
+  # Returns true if the link is matched to an external_url or not matched to an internal_url
   def external?(link)
     if @options[:debug]
       puts "--------------------------------"
@@ -40,6 +44,7 @@ class CobwebLinks
   end
 
   private
+  # escapes characters with meaning in regular expressions and adds wildcard expression
   def escape_pattern_for_regex(pattern)
     pattern = pattern.gsub(".", "\\.")
     pattern = pattern.gsub("?", "\\?")
@@ -49,8 +54,10 @@ class CobwebLinks
   end
 end
 
+# Exception raised for :internal_urls missing from CobwebLinks
 class InternalUrlsMissingError < Exception
 end
+# Exception raised for :internal_urls being invalid from CobwebLinks
 class InvalidUrlsError < Exception
 end
 
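Based on the comments and exception names added above, CobwebLinks is driven by :internal_urls and :external_urls patterns; the wildcard style below is an assumption inferred from escape_pattern_for_regex, so treat the exact pattern syntax as illustrative:

    require 'cobweb'

    links = CobwebLinks.new(
      :internal_urls => ["http://www.pepsico.com/*"],
      :external_urls => ["http://www.pepsico.com/piwik/*"]
    )

    links.internal?("http://www.pepsico.com/brands")   # expected true
    links.external?("http://www.google.com/")          # expected true
    # CobwebLinks.new({}) would raise InternalUrlsMissingError, per the new comment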
data/lib/cobweb_process_job.rb
CHANGED
@@ -1,8 +1,10 @@
+# Dummy resque process job that is ran if none are specified
 class CobwebProcessJob
   require "ap"
 
   @queue = :cobweb_process_job
 
+  # Resque perform method
   def self.perform(content)
     content = HashHelper.symbolize_keys(content)
     puts "Dummy Processing for #{content[:url]}"
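CobwebProcessJob is only the fallback; the usual pattern with Resque-style jobs like this is to point the crawl at your own class exposing self.perform(content). A hedged sketch, where the class name, queue name and hash keys are assumptions modelled on the dummy job above:

    # hypothetical custom processing job, modelled on CobwebProcessJob
    class MyContentProcessJob
      @queue = :my_content_process_job

      def self.perform(content)
        # content arrives with string keys after the Resque round trip,
        # which is why the dummy job symbolizes them first
        puts "Processing #{content['url']}"
      end
    end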
data/lib/cobweb_version.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
@@ -1,8 +1,10 @@
+require "nokogiri"
 
+# ContentLinkParser extracts links from HTML content and assigns them to a hash based on the location the link was found. The has contents can be configured in options, however, defaults to a pretty sensible default.
+# Links can also be returned regardless of the location they were located and can be filtered by the scheme
 class ContentLinkParser
 
-
-
+  # Parses the content and absolutizes the urls based on url. Options can be setup to determine the links that are extracted.
   def initialize(url, content, options = {})
     @options = options
     @url = url
@@ -29,6 +31,7 @@ class ContentLinkParser
 
   end
 
+  # Returns a hash with arrays of links
   def link_data
     data = {}
     @options[:tags].keys.each do |key|
@@ -37,6 +40,7 @@ class ContentLinkParser
     data
   end
 
+  # Returns an array of all absolutized links, specify :valid_schemes in options to limit to certain schemes. Also filters repeating folders (ie if the crawler got in a link loop situation)
   def all_links(options = {})
     options[:valid_schemes] = [:http, :https] unless options.has_key? :valid_schemes
     data = link_data
@@ -47,6 +51,7 @@ class ContentLinkParser
     links
   end
 
+  # Returns the type of links as a method rather than using the hash e.g. 'content_link_parser.images'
   def method_missing(m)
     if @options[:tags].keys.include?(m)
       links = []
@@ -60,6 +65,8 @@ class ContentLinkParser
     end
   end
 
+  private
+  # Processes the content to find links based on options[:tags]
   def find_matches(array, selector, attribute)
     if attribute.kind_of? String or attribute.kind_of? Symbol
       @doc.css(selector).each do |tag|
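A short sketch of ContentLinkParser as described by the new comments; the images accessor follows the 'content_link_parser.images' example in the method_missing comment, and the group names ultimately depend on the default :tags configuration, which is not shown in this diff:

    require 'cobweb'

    html   = "<html><body><a href='/about'>About</a><img src='/logo.png'/></body></html>"
    parser = ContentLinkParser.new("http://www.example.com/", html)

    parser.link_data   # hash of link arrays keyed by where each link was found
    parser.all_links   # flat array, restricted to http/https unless :valid_schemes is overridden
    parser.images      # per-group accessor provided through method_missing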
data/lib/crawl_job.rb
CHANGED
@@ -1,3 +1,5 @@
+
+# CrawlJob defines a resque job to perform the crawl
 class CrawlJob
 
   require "net/https"
@@ -7,10 +9,11 @@ class CrawlJob
 
   @queue = :cobweb_crawl_job
 
+  # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
   def self.perform(content_request)
 
     # change all hash keys to symbols
-    content_request =
+    content_request = HashUtil.deep_symbolize_keys(content_request)
 
     content_request[:redis_options] = {} unless content_request.has_key? :redis_options
     @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
@@ -81,12 +84,14 @@ class CrawlJob
 
   end
 
+  # Sets the crawl status to 'Crawl Stopped' and enqueues the crawl finished job
   def self.finished(content_request)
     # finished
     @stats.end_crawl(content_request)
     Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
   end
-
+
+  # Enqueues the content to the processing queue setup in options
   def self.send_to_processing_queue(content, content_request)
     content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
     if content_request[:use_encoding_safe_process_job]
@@ -102,14 +107,17 @@ class CrawlJob
 
   private
 
+  # Returns true if the crawl count is within limits
   def self.within_crawl_limits?(crawl_limit)
     crawl_limit.nil? or @crawl_counter < crawl_limit.to_i
   end
 
+  # Returns true if the queue count is calculated to be still within limits when complete
   def self.within_queue_limits?(crawl_limit)
     within_crawl_limits?(crawl_limit) and (crawl_limit.nil? or (@queue_counter + @crawl_counter) < crawl_limit.to_i)
   end
 
+  # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
   def self.set_base_url(redis, content, content_request)
     if redis.get("base_url").nil?
       unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
@@ -120,6 +128,7 @@ class CrawlJob
     end
   end
 
+  # Enqueues content to the crawl_job queue
   def self.enqueue_content(content_request, link)
     new_request = content_request.clone
     new_request[:url] = link
@@ -129,37 +138,32 @@ class CrawlJob
     increment_queue_counter
   end
 
+  # Increments the queue counter and refreshes crawl counters
  def self.increment_queue_counter
     @redis.incr "queue-counter"
     refresh_counters
   end
+  # Increments the crawl counter and refreshes crawl counters
   def self.increment_crawl_counter
     @redis.incr "crawl-counter"
     refresh_counters
   end
+  # Decrements the queue counter and refreshes crawl counters
   def self.decrement_queue_counter
     @redis.decr "queue-counter"
     refresh_counters
   end
+  # Refreshes the crawl counters
   def self.refresh_counters
     @crawl_counter = @redis.get("crawl-counter").to_i
     @queue_counter = @redis.get("queue-counter").to_i
   end
+  # Sets the crawl counters based on the crawled and queued queues
   def self.reset_counters
     @redis.set("crawl-counter", @redis.smembers("crawled").count)
     @redis.set("queue-counter", @redis.smembers("queued").count)
     @crawl_counter = @redis.get("crawl-counter").to_i
     @queue_counter = @redis.get("queue-counter").to_i
   end
-
-    hash.keys.each do |key|
-      value = hash[key]
-      hash.delete(key)
-      hash[key.to_sym] = value
-      if hash[key.to_sym].instance_of? Hash
-        hash[key.to_sym] = self.deep_symbolize_keys(hash[key.to_sym])
-      end
-    end
-    hash
-  end
+
 end
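The two limit predicates documented above read as simple counter checks against an optional crawl_limit. A standalone illustration of the same arithmetic, outside Redis and Resque, with made-up counter values:

    crawl_limit   = 100
    crawl_counter = 60   # pages already crawled
    queue_counter = 45   # links currently queued

    within_crawl_limits = crawl_limit.nil? || crawl_counter < crawl_limit.to_i
    # => true, because 60 < 100

    within_queue_limits = within_crawl_limits &&
      (crawl_limit.nil? || (queue_counter + crawl_counter) < crawl_limit.to_i)
    # => false, because 45 + 60 = 105 is not below 100, so the job would not queue further links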
data/lib/encoding_safe_process_job.rb
CHANGED
@@ -1,7 +1,9 @@
+# Process Job to resolve encoding issue
 class EncodingSafeProcessJob
 
   @queue = :encoding_safe_process_job
 
+  # Resque perform method
   def self.perform(content)
     clazz = const_get(content["processing_queue"])
     content["body"] = Base64.decode64(content["body"])
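This job pairs with the Base64.encode64 call visible in the cobweb.rb hunks: the body is shipped through the queue as plain ASCII and decoded here before the real processing class receives it. A tiny round-trip illustration in plain Ruby:

    require 'base64'

    body    = "caf\xC3\xA9".force_encoding("BINARY")   # arbitrary bytes standing in for a response body
    encoded = Base64.encode64(body)                    # safe to serialise into the Resque payload
    decoded = Base64.decode64(encoded)
    decoded == body                                    # => true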
data/lib/hash_util.rb
ADDED
@@ -0,0 +1,16 @@
+# Collection of utility methods for the Hash object
+class HashUtil
+
+  # Returns a hash with the keys converted to symbols
+  def self.deep_symbolize_keys(hash)
+    hash.keys.each do |key|
+      value = hash[key]
+      hash.delete(key)
+      hash[key.to_sym] = value
+      if hash[key.to_sym].instance_of? Hash
+        hash[key.to_sym] = HashUtil.deep_symbolize_keys(hash[key.to_sym])
+      end
+    end
+    hash
+  end
+end
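Since hash_util.rb is new in this release and now backs the cache and header handling in cobweb.rb, crawl_job.rb and stats.rb, a quick usage sketch (plain Ruby, no dependencies beyond the class above):

    h = { "url" => "http://example.com", "headers" => { "content-type" => ["text/html"] } }
    HashUtil.deep_symbolize_keys(h)
    # => {:url=>"http://example.com", :headers=>{:"content-type"=>["text/html"]}}

Note that the conversion mutates the hash it is given and only recurses into values that are themselves Hash instances.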
data/lib/redirect_error.rb
CHANGED
data/lib/robots.rb
CHANGED
data/lib/server.rb
CHANGED
@@ -1,6 +1,7 @@
 require 'sinatra'
 require 'haml'
 
+# Sinatra server to host the statistics for the CobwebCrawler
 class Server < Sinatra::Base
 
   set :views, settings.root + '/../views'
@@ -8,6 +9,7 @@ class Server < Sinatra::Base
   set :public_folder, settings.root + '/../public'
   enable :static
 
+  # Sinatra Dashboard
   get '/' do
     @full_redis = Redis.new
 
@@ -27,6 +29,7 @@ class Server < Sinatra::Base
     haml :home
   end
 
+  # Sinatra Crawl Detail
   get '/statistics/:crawl_id' do
     redis = NamespacedRedis.new({}, "cobweb-#{params[:crawl_id]}")
 
@@ -58,6 +61,7 @@ class Server < Sinatra::Base
     haml :statistics
   end
 
+  # Starts the Sinatra server, and kills the processes when shutdown
   def self.start
     unless Server.running?
       thread = Thread.new do
@@ -72,21 +76,10 @@ class Server < Sinatra::Base
 
   end
 
-class
-  def self.deep_symbolize_keys(hash)
-    hash.keys.each do |key|
-      value = hash[key]
-      hash.delete(key)
-      hash[key.to_sym] = value
-      if hash[key.to_sym].instance_of? Hash
-        hash[key.to_sym] = HashUtil.deep_symbolize_keys(hash[key.to_sym])
-      end
-    end
-    hash
-  end
-end
-
+# Monkey Patch of the Numeric class
 class Numeric
+
+  #Returns a human readable format for a number representing a data size
   def to_human
     units = %w{B KB MB GB TB}
     ap self
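The Numeric#to_human monkey patch above formats byte counts for the dashboard using the B/KB/MB/GB/TB units shown. The exact output string is not visible in this hunk, so the results below are only indicative:

    1024.to_human        # roughly "1.0 KB" (exact formatting not shown in the diff)
    3_500_000.to_human   # roughly "3.3 MB"

The ap self call in the hunk also means each formatted number is echoed with awesome_print while the server renders.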
data/lib/stats.rb
CHANGED
@@ -1,10 +1,14 @@
+
+# Stats class is the main statisitics hub for monitoring crawls. Either can be viewed through the Sinatra interface, or returned from the CobwebCrawler.crawl method or block
 class Stats
 
+  # Sets up redis usage for statistics
   def initialize(options)
     @full_redis = Redis.new(options[:redis_options])
     @redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{options[:crawl_id]}")
   end
 
+  # Sets up the crawl in statistics
   def start_crawl(options)
     unless @full_redis.sismember "cobweb_crawls", options[:crawl_id]
       @full_redis.sadd "cobweb_crawls", options[:crawl_id]
@@ -15,12 +19,14 @@ class Stats
       @redis.hset "statistics", "current_status", "Crawl Starting..."
     end
 
+  # Removes the crawl from the running crawls and updates status
   def end_crawl(options)
     @full_redis.srem "cobweb_crawls", options[:crawl_id]
     @redis.hset "statistics", "current_status", "Crawl Stopped"
     @redis.del "crawl_details"
   end
 
+  # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
   def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
 
     @statistics = get_statistics
@@ -125,6 +131,41 @@ class Stats
     @statistics
   end
 
+  # Returns the statistics hash
+  def get_statistics
+
+    @statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
+    if @statistics[:status_counts].nil?
+      @statistics[:status_counts]
+    else
+      @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
+    end
+    if @statistics[:mime_counts].nil?
+      @statistics[:mime_counts]
+    else
+      @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
+    end
+    @statistics
+  end
+
+  # Sets the current status of the crawl
+  def update_status(status)
+    @redis.hset "statistics", "current_status", status
+  end
+
+  # Returns the current status of the crawl
+  def get_status
+    @redis.hget "statistics", "current_status"
+  end
+
+  # Sets totals for the end of the crawl (Not Used)
+  def set_totals
+    stats = get_statistics
+    stats[:crawled] = @redis.smembers "crawled"
+  end
+
+  private
+  # Records a time based statistic
   def record_time_stat(stat_name, value, type="minute", duration=60)
     key = DateTime.now.strftime("%Y-%m-%d %H:%M")
     if type == "hour"
@@ -142,6 +183,7 @@ class Stats
     end
   end
 
+  # Increments a time based statistic (eg pages per minute)
   def increment_time_stat(stat_name, type="minute", duration=60)
     key = DateTime.now.strftime("%Y-%m-%d %H:%M")
     if type == "hour"
@@ -162,35 +204,6 @@ class Stats
     end
   end
 
-  def get_statistics
-
-    @statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
-    if @statistics[:status_counts].nil?
-      @statistics[:status_counts]
-    else
-      @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
-    end
-    if @statistics[:mime_counts].nil?
-      @statistics[:mime_counts]
-    else
-      @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
-    end
-    @statistics
-  end
-
-  def update_status(status)
-    @redis.hset "statistics", "current_status", status
-  end
-
-  def get_status
-    @redis.hget "statistics", "current_status"
-  end
-
-  def set_totals
-    stats = get_statistics
-    stats[:crawled] = @redis.smembers "crawled"
-  end
-
 end
 
 
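The Stats accessors that now sit above the private marker (get_statistics, update_status, get_status) give a direct way to read a crawl's state outside the Sinatra dashboard. A hedged usage sketch, assuming the gem is installed, Redis is running, and a crawl_id from a current crawl:

    require 'cobweb'

    stats = Stats.new(:redis_options => {}, :crawl_id => "example-crawl-id")

    stats.get_status          # current status string, e.g. "Crawl Starting..." or "Crawl Stopped"
    summary = stats.get_statistics
    summary[:status_counts]   # parsed back from JSON as shown above (nil until first recorded)
    summary[:mime_counts]     # likewise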
data/lib/uri_helper.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.54
+  version: 0.0.55
 prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ date: 2012-05-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &
+  requirement: &70166180363640 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180363640
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &
+  requirement: &70166180362480 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180362480
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &
+  requirement: &70166180361540 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180361540
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &
+  requirement: &70166180359680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180359680
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &
+  requirement: &70166180357240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180357240
 - !ruby/object:Gem::Dependency
   name: awesome_print
-  requirement: &
+  requirement: &70166180380020 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180380020
 - !ruby/object:Gem::Dependency
   name: sinatra
-  requirement: &
+  requirement: &70166180377140 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180377140
 - !ruby/object:Gem::Dependency
   name: thin
-  requirement: &
+  requirement: &70166180389240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180389240
 - !ruby/object:Gem::Dependency
   name: haml
-  requirement: &
+  requirement: &70166180388200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180388200
 - !ruby/object:Gem::Dependency
   name: namespaced_redis
-  requirement: &
+  requirement: &70166180387040 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
         version: 1.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180387040
 description: Web Crawler that uses resque background job engine to allow you to cluster
   your crawl.
 email: stewart@rockwellcottage.com
@@ -146,6 +146,7 @@ files:
 - lib/content_link_parser.rb
 - lib/crawl_job.rb
 - lib/encoding_safe_process_job.rb
+- lib/hash_util.rb
 - lib/redirect_error.rb
 - lib/robots.rb
 - lib/server.rb