cobweb 0.0.22 → 0.0.24
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries, and is provided for informational purposes only.
- data/README.textile +13 -9
- data/lib/cobweb.rb +56 -22
- data/lib/cobweb_process_job.rb +1 -1
- data/lib/content_link_parser.rb +0 -1
- data/lib/crawl_job.rb +102 -123
- data/lib/stats.rb +53 -1
- data/spec/cobweb/cobweb_spec.rb +20 -1
- metadata +30 -32
- data/lib/cobweb/version.rb +0 -1
- data/lib/hash.rb +0 -22
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
 
-h1. Cobweb v0.0.
+h1. Cobweb v0.0.23
 
 h2. Intro
 
@@ -54,14 +54,16 @@ Creates a new crawler object based on a base_url
 
 * options - Options are passed in as a hash,
 
-** :follow_redirects
-** :redirect_limit
-** :processing_queue
-** :debug
-** :quiet
-** :cache
-** :timeout
-** :redis_options
+** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
+** :redirect_limit - sets the limit to be used for concurrent redirects (Default: 10)
+** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
+** :debug - enables debug output (Default: false)
+** :quiet - hides default output (Default: false)
+** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
+** :timeout - http timeout for requests (Default: 10)
+** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}
+** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*'])
+** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com)
 
 bq. crawler = CobWeb.new(:follow_redirects => false)
 
@@ -70,6 +72,8 @@ h4. start(base_url)
 Starts a crawl through resque. Requires the :processing_queue to be set to a valid class for the resque job to work with the data retrieved.
 
 * base_url - the url to start the crawl from
+
+Once the crawler starts, if the first page is redirected (eg from http://www.test.com to http://test.com) then the endpoint scheme, host and domain is added to the internal_urls automatically.
 
 bq. crawler.start("http://www.google.com/")
 
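
Note: the two options introduced in this release, :internal_urls and :first_page_redirect_internal, control which discovered links count as internal. A minimal usage sketch; the hostnames and option values are illustrative, not taken from the gem's documentation:

    # Sketch of configuring the new options documented above.
    # Hostnames and values are illustrative.
    crawler = Cobweb.new(
      :internal_urls => ["http://test.com/*", "http://blog.test.com/*"],
      :first_page_redirect_internal => true,
      :cache => nil  # disable page caching
    )
    crawler.start("http://test.com/")
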
data/lib/cobweb.rb
CHANGED
@@ -19,20 +19,33 @@ class Cobweb
   # investigate using event machine for single threaded crawling
 
   def self.version
-    "0.0.
+    "0.0.24"
+  end
+
+  def method_missing(method_sym, *arguments, &block)
+    if method_sym.to_s =~ /^default_(.*)_to$/
+      tag_name = method_sym.to_s.split("_")[1..-2].join("_").to_sym
+      @options[tag_name] = arguments[0] unless @options.has_key?(tag_name)
+    else
+      super
+    end
   end
 
   def initialize(options = {})
     @options = options
-
-
-
-
-
-
-
-
-
+
+    default_follow_redirects_to true
+    default_redirect_limit_to 10
+    default_processing_queue_to CobwebProcessJob
+    default_crawl_finished_queue_to CobwebFinishedJob
+    default_quiet_to true
+    default_debug_to false
+    default_cache_to 300
+    default_timeout_to 10
+    default_redis_options_to Hash.new
+    default_internal_urls_to []
+    default_first_page_redirect_internal_to true
+
   end
 
   def start(base_url)
@@ -42,9 +55,20 @@ class Cobweb
       :url => base_url
     }
 
+    if @options[:internal_urls].empty?
+      uri = Addressable::URI.parse(base_url)
+      @options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
+    end
+
     request.merge!(@options)
     @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
     @redis.hset "statistics", "queued_at", DateTime.now
+    @redis.set("crawl-counter", 0)
+    @redis.set("queue-counter", 1)
+
+
+    # add internal_urls into redis
+    @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
 
     Resque.enqueue(CrawlJob, request)
   end
@@ -70,7 +94,7 @@ class Cobweb
       redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
     end
 
-    content = {}
+    content = {:base_url => url}
 
     # check if it has already been cached
     if redis.get(unique_id) and @options[:cache]
@@ -96,7 +120,7 @@ class Cobweb
      begin
        print "Retrieving #{url }... " unless @options[:quiet]
        request = Net::HTTP::Get.new uri.request_uri
-
+
        response = @http.request request
 
        if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
@@ -125,7 +149,7 @@ class Cobweb
        content[:response_time] = Time.now.to_f - request_time
 
        puts "Retrieved." unless @options[:quiet]
-
+
        # create the content container
        content[:url] = uri.to_s
        content[:status_code] = response.code.to_i
@@ -138,12 +162,16 @@ class Cobweb
        end
        content[:length] = response.content_length
        if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
-
-
+          if response["Content-Encoding"]=="gzip"
+            content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
+          else
+            content[:body] = response.body
+          end
+        else
          content[:body] = Base64.encode64(response.body)
        end
        content[:location] = response["location"]
-        content[:headers] = response.to_hash.
+        content[:headers] = response.to_hash.deep_symbolize_keys
        # parse data for links
        link_parser = ContentLinkParser.new(content[:url], content[:body])
        content[:links] = link_parser.link_data
@@ -170,7 +198,7 @@ class Cobweb
        content[:links] = {}
 
      rescue SocketError => e
-        puts "ERROR: #{e.message}"
+        puts "ERROR: SocketError#{e.message}"
 
        ## generate a blank content
        content = {}
@@ -185,7 +213,7 @@ class Cobweb
        content[:links] = {}
 
      rescue Timeout::Error => e
-        puts "ERROR: #{e.message}"
+        puts "ERROR Timeout::Error: #{e.message}"
 
        ## generate a blank content
        content = {}
@@ -207,10 +235,14 @@ class Cobweb
    raise "url cannot be nil" if url.nil?
 
    absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
-
+
    # get the unique id for this request
    unique_id = Digest::SHA1.hexdigest(url)
-    redirect_limit
+    if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
+      redirect_limit = options[:redirect_limit].to_i
+    else
+      redirect_limit = 10
+    end
 
    # connect to redis
    if options.has_key? :crawl_id
@@ -224,7 +256,7 @@ class Cobweb
    # check if it has already been cached
    if redis.get("head-#{unique_id}") and @options[:cache]
      puts "Cache hit for #{url}" unless @options[:quiet]
-      Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
+      content = Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
    else
      print "Retrieving #{url }... " unless @options[:quiet]
      uri = Addressable::URI.parse(url.strip)
@@ -247,7 +279,9 @@ class Cobweb
        puts "redirected... " unless @options[:quiet]
        url = absolutize.url(response['location']).to_s
        redirect_limit = redirect_limit - 1
-
+        options = options.clone
+        options[:redirect_limit]=redirect_limit
+        content = head(url, options)
        content[:url] = uri.to_s
        content[:redirect_through] = [] if content[:redirect_through].nil?
        content[:redirect_through].insert(0, url)
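
Note: the default_*_to calls added to initialize above are not real methods; the new method_missing intercepts anything matching /^default_(.*)_to$/ and writes the value into @options only when the caller did not supply that key. A standalone sketch of the same pattern; the Settings class and its option names are invented for illustration, and it extracts the key with $1 rather than the split/join the gem uses, which gives the same result here:

    # Standalone sketch of the default_*_to pattern used in Cobweb#initialize.
    # "Settings" and its option names are illustrative, not part of the gem.
    class Settings
      attr_reader :options

      def initialize(options = {})
        @options = options
        default_timeout_to 10      # kept only if :timeout was not passed in
        default_retry_limit_to 3
      end

      def method_missing(method_sym, *arguments, &block)
        if method_sym.to_s =~ /^default_(.*)_to$/
          key = $1.to_sym
          @options[key] = arguments[0] unless @options.has_key?(key)
        else
          super
        end
      end
    end

    Settings.new(:timeout => 30).options  # => {:timeout => 30, :retry_limit => 3}

This keeps caller-supplied options authoritative while letting the defaults read like declarations.
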
data/lib/cobweb_process_job.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
data/lib/crawl_job.rb
CHANGED
@@ -6,163 +6,142 @@ class CrawlJob
 
   @queue = :cobweb_crawl_job
 
-  ## redis params used
-  #
-  # crawl-counter
-  # crawled
-  # queue-counter
-  # statistics[:average_response_time]
-  # statistics[:maximum_response_time]
-  # statistics[:minimum_response_time]
-  # statistics[:average_length]
-  # statistics[:maximum_length]
-  # statistics[:minimum_length]
-  # statistics[:queued_at]
-  # statistics[:started_at]
-  # statistics]:finished_at]
-  # total_pages
-  # total_assets
-  # statistics[:mime_counts]["mime_type"]
-  # statistics[:status_counts][xxx]
-
   def self.perform(content_request)
-
-
-
+
+    # change all hash keys to symbols
+    content_request = content_request.deep_symbolize_keys
+
+    @redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
 
     @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
-
+    @debug = content_request[:debug]
+
+    refresh_counters
+
     # check we haven't crawled this url before
-
-    queue_counter = redis.get("queue-counter").to_i
-    unless redis.sismember "crawled", content_request[:url]
+    unless @redis.sismember "crawled", content_request[:url]
 
-      #
-
-      crawl_counter += 1
-      if crawl_counter <= content_request[:crawl_limit].to_i
+      # if there is no limit or we're still under it lets get the url
+      if content_request[:crawl_limit].nil? or @crawl_counter <= content_request[:crawl_limit].to_i
        content = Cobweb.new(content_request).get(content_request[:url], content_request)
-
+
        ## update statistics
-
-
-
-
-
-
-
-        if
-
-
-
+        Stats.set_statistics_in_redis(@redis, content)
+
+        # set the base url if this is the first page
+        set_base_url @redis, content, content_request
+
+        internal_links = all_links_from_content(content).map{|link| link.to_s}
+
+        # reject the link if we've crawled it or queued it
+        internal_links.reject!{|link| @redis.sismember("crawled", link)}
+        internal_links.reject!{|link| @redis.sismember("queued", link)}
+
+        # select the link if its internal
+        internal_links.select!{|link| internal_link?(link)}
+
+        internal_links.each do |link|
+          enqueue_content(content_request, link)
        end
-        redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
-        redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
 
-
-
-
-
-
-
-        mime_counts = {}
-        if redis.hexists "statistics", "mime_counts"
-          mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
-          if mime_counts.has_key? content[:mime_type]
-            mime_counts[content[:mime_type]] += 1
-          else
-            mime_counts[content[:mime_type]] = 1
-          end
-        else
-          mime_counts = {content[:mime_type] => 1}
-        end
-        redis.hset "statistics", "mime_counts", mime_counts.to_json
-
-        status_counts = {}
-        if redis.hexists "statistics", "status_counts"
-          status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
-          if status_counts.has_key? content[:status_code].to_i
-            status_counts[content[:status_code].to_i] += 1
-          else
-            status_counts[content[:status_code].to_i] = 1
-          end
-        else
-          status_counts = {content[:status_code].to_i => 1}
-        end
-        redis.hset "statistics", "status_counts", status_counts.to_json
-
-        redis.srem "queued", content_request[:url]
-        redis.sadd "crawled", content_request[:url]
-        set_base_url redis, content, content_request[:base_url]
-        content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
-          link = link.to_s
-          unless redis.sismember "crawled", link
-            puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
-            if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
-              puts "Matched as #{link} as internal" if content_request[:debug]
-              unless redis.sismember("crawled", link) or redis.sismember("queued", link)
-                if queue_counter <= content_request[:crawl_limit].to_i
-                  new_request = content_request.clone
-                  new_request[:url] = link
-                  new_request[:parent] = content_request[:url]
-                  Resque.enqueue(CrawlJob, new_request)
-                  redis.sadd "queued", link
-                  redis.incr "queue-counter"
-                  queue_counter += 1
-                end
-              end
-            end
-          end
-        end
+        # now that we're done, lets update the queues
+        @redis.srem "queued", content_request[:url]
+        decrement_queue_counter
+        @redis.sadd "crawled", content_request[:url]
+        increment_crawl_counter
 
        # enqueue to processing queue
        Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
        puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
-        puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
-
-
+        puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
+
      else
-        puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
+        puts "Crawl Limit Exceeded by #{@crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
      end
    else
      puts "Already crawled #{content_request[:url]}" if content_request[:debug]
    end
 
-    #
-
-    if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
+    # if the'res nothing left queued or the crawled limit has been reached
+    if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
 
-      puts "queue_counter: #{queue_counter}"
-      puts "crawl_counter: #{crawl_counter}"
+      puts "queue_counter: #{@queue_counter}"
+      puts "crawl_counter: #{@crawl_counter}"
      puts "crawl_limit: #{content_request[:crawl_limit]}"
 
      # finished
      puts "FINISHED"
-      stats = redis.hgetall "statistics"
-      stats[:total_pages] = redis.get "total_pages"
-      stats[:total_assets] = redis.get "total_assets"
-      stats[:crawl_counter] = redis.get "crawl_counter"
-      stats[:queue_counter] = redis.get "queue_counter"
-      stats[:crawled] = redis.smembers "crawled"
+      stats = @redis.hgetall "statistics"
+      stats[:total_pages] = @redis.get "total_pages"
+      stats[:total_assets] = @redis.get "total_assets"
+      stats[:crawl_counter] = @redis.get "crawl_counter"
+      stats[:queue_counter] = @redis.get "queue_counter"
+      stats[:crawled] = @redis.smembers "crawled"
 
-      Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:source_id => content_request[:source_id]}))
+      Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
 
-      ap stats
    end
  end
 
  private
-  def self.set_base_url(redis, content,
+  def self.set_base_url(redis, content, content_request)
    if redis.get("base_url").nil?
-
-
-      redis.
-
-
-
+      unless content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
+        uri = Addressable::URI.parse(content[:redirect_through].last)
+        redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
+      end
+      redis.set("base_url", content[:url])
+    end
+  end
+
+  def self.internal_link?(link)
+    puts "Checking for internal link for: #{link}" if @debug
+    @internal_patterns ||= @redis.smembers("internal_urls").map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}
+    valid_link = true
+    @internal_patterns.each do |pattern|
+      puts "Matching against #{pattern.source}" if @debug
+      if link.match(pattern)
+        puts "Matched as internal" if @debug
+        return true
      end
    end
+    puts "Didn't match any pattern so marked as not internal" if @debug
+    false
  end
 
+  def self.all_links_from_content(content)
+    content[:links].keys.map{|key| content[:links][key]}.flatten
+  end
 
+  def self.enqueue_content(content_request, link)
+    new_request = content_request.clone
+    new_request[:url] = link
+    new_request[:parent] = content_request[:url]
+    Resque.enqueue(CrawlJob, new_request)
+    @redis.sadd "queued", link
+    increment_queue_counter
+  end
+
+  def self.increment_queue_counter
+    @redis.incr "queue-counter"
+    refresh_counters
+  end
+  def self.increment_crawl_counter
+    @redis.incr "crawl-counter"
+    refresh_counters
+  end
+  def self.decrement_queue_counter
+    @redis.decr "queue-counter"
+    refresh_counters
+  end
+  def self.refresh_counters
+    @crawl_counter = @redis.get("crawl-counter").to_i
+    @queue_counter = @redis.get("queue-counter").to_i
+  end
+  def self.reset_counters
+    @redis.set("crawl-counter", @redis.smembers("crawled").count)
+    @redis.set("queue-counter", @redis.smembers("queued").count)
+    @crawl_counter = @redis.get("crawl-counter").to_i
+    @queue_counter = @redis.get("queue-counter").to_i
+  end
 end
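
Note: the new internal_link? helper compiles the internal_urls wildcards into regexps with pattern.gsub("*", ".*?"). A sketch of that matching in isolation; the patterns and urls are illustrative:

    # Sketch of the wildcard matching performed by CrawlJob.internal_link?,
    # using the same gsub-based compilation. Patterns and urls are illustrative.
    patterns = ["http://test.com/*", "http://blog.test.com/*"].map do |pattern|
      Regexp.new("^#{pattern.gsub("*", ".*?")}")
    end

    ["http://test.com/about", "http://other.com/"].each do |url|
      puts "#{url} internal? #{patterns.any? { |pattern| url.match(pattern) }}"
    end
    # => http://test.com/about internal? true
    # => http://other.com/ internal? false

Since the dots in each pattern are not escaped, the compiled regexps are slightly looser than glob semantics.
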
data/lib/stats.rb
CHANGED
@@ -11,6 +11,59 @@ class Stats < Sinatra::Base
     @@status = status
   end
 
+  def self.set_statistics_in_redis(redis, content)
+    crawl_counter = redis.get("crawl-counter").to_i
+    queue_counter = redis.get("queue-counter").to_i
+
+    if redis.hexists "statistics", "average_response_time"
+      redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
+    else
+      redis.hset("statistics", "average_response_time", content[:response_time].to_f)
+    end
+    redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
+    redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
+    if redis.hexists "statistics", "average_length"
+      redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
+    else
+      redis.hset("statistics", "average_length", content[:length].to_i)
+    end
+    redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
+    redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
+
+    if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+      redis.incr "total_pages"
+    else
+      redis.incr "total_assets"
+    end
+
+    mime_counts = {}
+    if redis.hexists "statistics", "mime_counts"
+      mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
+      if mime_counts.has_key? content[:mime_type]
+        mime_counts[content[:mime_type]] += 1
+      else
+        mime_counts[content[:mime_type]] = 1
+      end
+    else
+      mime_counts = {content[:mime_type] => 1}
+    end
+    redis.hset "statistics", "mime_counts", mime_counts.to_json
+
+    status_counts = {}
+    if redis.hexists "statistics", "status_counts"
+      status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
+      if status_counts.has_key? content[:status_code].to_i
+        status_counts[content[:status_code].to_i] += 1
+      else
+        status_counts[content[:status_code].to_i] = 1
+      end
+    else
+      status_counts = {content[:status_code].to_i => 1}
+    end
+    redis.hset "statistics", "status_counts", status_counts.to_json
+
+  end
+
   set :views, settings.root + '/../views'
 
   get '/' do
@@ -19,7 +72,6 @@ class Stats < Sinatra::Base
     haml :statistics
   end
 
-
   def self.start
     thread = Thread.new do
       Stats.run!
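
Note: set_statistics_in_redis keeps running averages without storing the individual samples, applying the update average = ((average * n) + sample) / (n + 1) on each crawled item. A quick check of that rule; the sample values are illustrative:

    # The running-average update used by set_statistics_in_redis,
    # checked against the plain mean. Sample values are illustrative.
    average = 0.0
    [0.2, 0.4, 0.9].each_with_index do |sample, n|
      average = ((average * n) + sample) / (n + 1)
    end
    puts average  # => ~0.5, the same as (0.2 + 0.4 + 0.9) / 3 up to float rounding

The average_length branch runs the same formula on .to_i values, so that average truncates to an integer at every update.
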
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -74,6 +74,25 @@ describe Cobweb do
     Cobweb.new.should be_an_instance_of Cobweb
   end
 
+  it "should setup with defaults" do
+    cobweb = Cobweb.new
+
+    options = cobweb.instance_eval("@options")
+    ap options
+
+    options[:follow_redirects].should == true
+    options[:redirect_limit].should == 10
+    options[:processing_queue].should == CobwebProcessJob
+    options[:crawl_finished_queue].should == CobwebFinishedJob
+    options[:quiet].should == true
+    options[:debug].should == false
+    options[:cache].should == 300
+    options[:timeout].should == 10
+    options[:redis_options].should == {}
+    options[:internal_urls].should == []
+
+  end
+
   describe "get" do
     it "should return a hash with default values" do
       @cobweb.get(@base_url).should be_an_instance_of Hash
@@ -141,7 +160,7 @@ describe Cobweb do
       #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
       #
       #content = @cobweb.get(@base_url)
-      #content.should be_an_instance_of
+      #content.should be_an_instance_of HashHelper
       #ap content
       #content[:url].should == "http://redirect-me.com/redirect.html"
       #content[:redirect_through].length.should == 2
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.24
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-03-
+date: 2012-03-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &
+  requirement: &70268501331520 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501331520
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &
+  requirement: &70268501331100 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501331100
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: nokogiri
+  requirement: &70268501330680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501330680
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: addressable
+  requirement: &70268501330240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501330240
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: rspec
+  requirement: &70268501329820 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501329820
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: awesome_print
+  requirement: &70268501329400 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501329400
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: sinatra
+  requirement: &70268501328980 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501328980
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: thin
+  requirement: &70268501328560 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501328560
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: haml
+  requirement: &70268501328140 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501328140
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: hashie
+  requirement: &70268501344080 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501344080
 description:
 email: stewart@rockwellcottage.com
 executables: []
@@ -134,14 +134,12 @@ files:
 - spec/samples/sample_html_links.html
 - spec/spec.opts
 - spec/spec_helper.rb
-- lib/cobweb/version.rb
 - lib/cobweb.rb
 - lib/cobweb_crawler.rb
 - lib/cobweb_finished_job.rb
 - lib/cobweb_process_job.rb
 - lib/content_link_parser.rb
 - lib/crawl_job.rb
-- lib/hash.rb
 - lib/namespaced_redis.rb
 - lib/redirect_error.rb
 - lib/robots.rb
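
Note: the &70268501331520-style tokens above are YAML anchors, and the matching *70268501331520 aliases re-use the anchored node; that is how RubyGems serializes each dependency's requirement into both the requirement and version_requirements fields. A small demonstration; the anchor name is shortened, and unsafe_load is used where newer Psych versions reject aliases in plain load:

    require 'yaml'

    # Minimal illustration of YAML anchors/aliases as used in the gemspec above.
    yaml = "requirement: &req\n  none: false\nversion_requirements: *req\n"
    doc = YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(yaml) : YAML.load(yaml)
    puts doc["requirement"].equal?(doc["version_requirements"])  # => true
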
data/lib/cobweb/version.rb
DELETED
@@ -1 +0,0 @@
-VERSION = "0.0.21"
data/lib/hash.rb
DELETED
@@ -1,22 +0,0 @@
-## add symbolize methods to hash
-class Hash
-  def symbolize_keys
-    keys.each do |key|
-      if key.instance_of? String
-        value = self[key]
-        self.delete(key)
-        self[key.to_sym] = value
-      end
-    end
-    self
-  end
-  def deep_symbolize_keys
-    symbolize_keys
-    keys.each do |key|
-      if self[key].instance_of? Hash
-        self[key].deep_symbolize_keys
-      end
-    end
-    self
-  end
-end
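
Note: the deleted lib/hash.rb monkey-patched Hash with symbolize_keys and deep_symbolize_keys, which the new crawl_job.rb still calls (content_request.deep_symbolize_keys), so an equivalent must now come from elsewhere; the hashie dependency listed in the metadata is one plausible source. With the deleted patch loaded, the behaviour was as follows; the sample data is illustrative:

    # Behaviour provided by the deleted lib/hash.rb shown above: string keys
    # become symbols, recursively. Cobweb relies on this for Resque payloads,
    # which arrive with string keys after the JSON round-trip.
    payload = {"url" => "http://test.com/", "redis_options" => {"host" => "localhost"}}
    payload.deep_symbolize_keys
    # => {:url => "http://test.com/", :redis_options => {:host => "localhost"}}
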