cobweb 1.0.19 → 1.0.20

checksums.yaml ADDED
@@ -0,0 +1,15 @@
+ ---
+ !binary "U0hBMQ==":
+   metadata.gz: !binary |-
+     MDE5MzU3NzI2MTRhYzM5NzIwMDZlMTJjMTg5NzNiMzAyMjFkMjcxOQ==
+   data.tar.gz: !binary |-
+     MWE3ZTAwYjExZjc4NzU2MDYzOTlhOTQwMTNlNTcyZjNmZTYwNmU3Zg==
+ SHA512:
+   metadata.gz: !binary |-
+     N2U2MDk1MmI3ZTU3OTFmZDI5YjY4YTEyNDNkMTE4MmJjOTFkNTZiYzNhY2Q4
+     Mjk0ZjM1YThhNzhkMGNjNjJiZTJkNjM1OWQ1MGMzZmVlMDI5MzUyOTU5YTRk
+     NzEzZjBiZjM2OTUxZTc2NzZjZDIyOWQ4ZmVlYzYyOGViMDIyYzY=
+   data.tar.gz: !binary |-
+     ZTJiYmRlNDY0M2FkNTdlN2I0ZjNiODYxOGQyN2MxZGZlMGViMWIxZDA4YmY1
+     ZDc2ZDU3NDc4ODg1YmExYjFmYjMyY2U0MDU4MGQ0OTJkZjRmNjAyYmQ3NWVl
+     ODY0ZTE5MGUzNzAzZWFlMzdmZmY1YzNhMmEzNWE1NzVkYzAwZDE=
data/README.textile CHANGED
@@ -1,5 +1,4 @@
- 
- h1. Cobweb v1.0.19
+ h1. Cobweb v1.0.20
  
  "@cobweb_gem":https://twitter.com/cobweb_gem
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -155,7 +154,7 @@ The :processing_queue option is used to specify the class that contains the resq
  
  h3. CobwebCrawler
  
- CobwebCrawler is the standalone crawling class. If you don't want to use redis and just want to crawl the site within your ruby process, you can use this class.
+ CobwebCrawler is the standalone crawling class. If you don't want to use resque or sidekiq and just want to crawl the site within your ruby process, you can use this class.
  
  bc. crawler = CobwebCrawler.new(:cache => 600)
  statistics = crawler.crawl("http://www.pepsico.com")
@@ -207,7 +206,7 @@ h2. License
  
  h3. The MIT License
  
- Copyright (c) 2010 6Central Limited
+ Copyright (c) 2013 Active Information Design
  
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
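
Note: the README section above documents CobwebCrawler's standalone mode. A minimal usage sketch, assuming the block form the README describes elsewhere (the URL and option values here are illustrative only):

    require 'cobweb'

    # Standalone crawl -- runs inside the current ruby process,
    # no resque or sidekiq workers required.
    crawler = CobwebCrawler.new(:cache => 600, :quiet => true)

    # crawl also accepts a block that is called for each retrieved page
    statistics = crawler.crawl("http://example.com") do |content, stats|
      puts "Crawled #{content[:url]}"
    end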
data/lib/cobweb.rb CHANGED
@@ -1,6 +1,5 @@
  require 'rubygems'
  require 'uri'
- require 'resque'
  require "addressable/uri"
  require 'digest/sha1'
  require 'base64'
@@ -178,6 +177,9 @@ class Cobweb
        raise ":username and :password are required if using basic authentication" unless @options[:username] && @options[:password]
        request.basic_auth @options[:username], @options[:password]
      end
+     if @options[:range]
+       request.set_range(@options[:range])
+     end
  
      response = @http.request request
  
@@ -451,6 +453,10 @@ class Cobweb
      pattern = pattern.gsub("*", ".*?")
      pattern
    end
+ 
+   def clear_cache
+ 
+   end
  
    private
    # checks if the mime_type is textual
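
Note: the new :range option above is handed straight to Net::HTTP's set_range, which sets the HTTP Range header so only part of a resource is fetched. The same call in isolation (URL and byte range are made up):

    require 'net/http'

    uri = URI("http://example.com/large-file.bin")
    Net::HTTP.start(uri.host, uri.port) do |http|
      request = Net::HTTP::Get.new(uri)
      request.set_range(0..1023)   # ask for the first 1024 bytes only
      response = http.request(request)
      puts response.code           # "206" if the server honours Range requests
    end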
data/lib/cobweb_crawl_helper.rb CHANGED
@@ -15,15 +15,18 @@ class CobwebCrawlHelper
      @stats = Stats.new(data)
    end
  
-   def destroy(options={})
- 
+   def destroy
+     options = @data
      options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)
-     options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
+     if RESQUE_INSTALLED
+       options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
+     end
  
      # set status as cancelled now so that we don't enqueue any further pages
      self.statistics.end_crawl(@data, true)
  
-     if options[:finished_resque_queue]
+ 
+     if options[:finished_resque_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED
  
        additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
        additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
@@ -38,15 +41,17 @@ class CobwebCrawlHelper
        sleep 1
        counter += 1
      end
-     position = Resque.size(options[:queue_name])
-     until position == 0
-       position-=BATCH_SIZE
-       position = 0 if position < 0
-       job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
-       job_items.each do |item|
-         if item["args"][0]["crawl_id"] == id
-           # remove this job from the queue
-           Resque.dequeue(CrawlJob, item["args"][0])
+     if options[:queue_system] == :resque && RESQUE_INSTALLED
+       position = Resque.size(options[:queue_name])
+       until position == 0
+         position-=BATCH_SIZE
+         position = 0 if position < 0
+         job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
+         job_items.each do |item|
+           if item["args"][0]["crawl_id"] == id
+             # remove this job from the queue
+             Resque.dequeue(CrawlJob, item["args"][0])
+           end
          end
        end
      end
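
Note: destroy now pages through the Resque queue in batches: Resque.peek reads jobs without popping them, so jobs belonging to this crawl can be picked out and removed with Resque.dequeue. The loop in isolation (the batch size and crawl id are placeholders, not cobweb's values):

    require 'resque'

    BATCH_SIZE = 200           # placeholder; cobweb defines its own constant
    crawl_id   = "crawl_0_id"  # placeholder

    position = Resque.size("cobweb_crawl_job")
    until position == 0
      position -= BATCH_SIZE
      position = 0 if position < 0
      # peek returns job hashes ({"class" => ..., "args" => [...]}) without dequeuing
      Resque.peek("cobweb_crawl_job", position, BATCH_SIZE).each do |item|
        Resque.dequeue(CrawlJob, item["args"][0]) if item["args"][0]["crawl_id"] == crawl_id
      end
    end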
data/lib/cobweb_crawler.rb CHANGED
@@ -122,7 +122,7 @@ class CobwebCrawler
  
      if @options[:store_inbound_links]
        document_links.each do |target_link|
-         target_uri = UriHelper.parse(target_link)
+         target_uri = UriHelper.parse(target_link).normalize
          @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
        end
      end
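
Note: normalizing the target URI before hashing means equivalent spellings of the same URL collapse into a single inbound-links set. A sketch with Addressable (assuming, as the code above suggests, that UriHelper wraps Addressable::URI; the example URLs are made up):

    require 'addressable/uri'
    require 'digest/md5'

    a = Addressable::URI.parse("HTTP://Example.com:80/a/../b").normalize
    b = Addressable::URI.parse("http://example.com/b")

    # both spellings now hash to the same redis key suffix
    Digest::MD5.hexdigest(a.to_s) == Digest::MD5.hexdigest(b.to_s)  # => true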
data/lib/cobweb_version.rb CHANGED
@@ -3,7 +3,7 @@ class CobwebVersion
  
    # Returns a string of the current version
    def self.version
-     "1.0.19"
+     "1.0.20"
    end
  
  end
data/lib/crawl.rb CHANGED
@@ -30,7 +30,6 @@ module CobwebModule
      already_crawled?(link) || already_queued?(link) || already_running?(link)
    end
  
- 
    # Returns true if the crawl count is within limits
    def within_crawl_limits?
      @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
@@ -62,15 +61,15 @@ module CobwebModule
  
      unless already_running? @options[:url]
        unless already_crawled? @options[:url]
-         @redis.sadd("currently_running", @options[:url])
+         update_queues
          if within_crawl_limits?
+           @redis.sadd("currently_running", @options[:url])
            @stats.update_status("Retrieving #{@options[:url]}...")
-           lock("update_queues") do
-             @content = Cobweb.new(@options).get(@options[:url], @options)
-             if @options[:url] == @redis.get("original_base_url")
-               @redis.set("crawled_base_url", @content[:base_url])
-             end
-             update_queues
+           @content = Cobweb.new(@options).get(@options[:url], @options)
+           update_counters
+ 
+           if @options[:url] == @redis.get("original_base_url")
+             @redis.set("crawled_base_url", @content[:base_url])
            end
  
            if content.permitted_type?
@@ -80,9 +79,15 @@ module CobwebModule
              return true
            end
          else
+           puts "======================================="
+           puts "OUTWITH CRAWL LIMITS"
+           puts "======================================="
            decrement_queue_counter
          end
        else
+         puts "======================================="
+         puts "ALREADY CRAWLED"
+         puts "======================================="
          decrement_queue_counter
        end
      else
@@ -108,26 +113,28 @@ module CobwebModule
      internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
  
      # reject the link if we've crawled it or queued it
-     internal_links.reject! { |link| @redis.sismember("crawled", link) }
-     internal_links.reject! { |link| @redis.sismember("queued", link) }
- 
-     internal_links.each do |link|
-       if within_queue_limits? && !already_queued?(link) && !already_crawled?(link)
-         if status != CobwebCrawlHelper::CANCELLED
-           yield link if block_given?
-           unless link.nil?
-             @redis.sadd "queued", link
-             increment_queue_counter
+ 
+     internal_links.reject! { |link| already_handled?(link)}
+ 
+     lock("internal-links") do
+       internal_links.each do |link|
+         if within_queue_limits? && !already_handled?(link)
+           if status != CobwebCrawlHelper::CANCELLED
+             yield link if block_given?
+             unless link.nil?
+               @redis.sadd "queued", link
+               increment_queue_counter
+             end
+           else
+             debug_puts "Cannot enqueue new content as crawl has been cancelled."
            end
-         else
-           debug_puts "Cannot enqueue new content as crawl has been cancelled."
          end
        end
      end
  
      if @options[:store_inbound_links]
        document_links.each do |link|
-         uri = URI.parse(link)
+         uri = URI.parse(link).normalize
          @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}", url)
        end
      end
@@ -140,25 +147,25 @@ module CobwebModule
    end
  
    def update_queues
-     #lock("update_queues") do
+     lock("update_queues") do
        #@redis.incr "inprogress"
        # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
        @redis.srem "queued", @options[:url]
        @redis.sadd "crawled", @options[:url]
-       if content.url != @options[:url]
-         @redis.srem "queued", content.url
-         @redis.sadd "crawled", content.url
-       end
+ 
      # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
-     if @options[:crawl_limit_by_page]
-       if content.mime_type.match("text/html")
-         increment_crawl_counter
-       end
-     else
+     end
+   end
+ 
+   def update_counters
+     if @options[:crawl_limit_by_page]
+       if content.mime_type.match("text/html")
          increment_crawl_counter
        end
-     decrement_queue_counter
-     #end
+     else
+       increment_crawl_counter
+     end
+     decrement_queue_counter
    end
  
    def to_be_processed?
@@ -166,7 +173,7 @@ module CobwebModule
    end
  
    def process(&block)
-     lock("process") do
+     lock("process-count") do
        if @options[:crawl_limit_by_page]
          if content.mime_type.match("text/html")
            increment_process_counter
@@ -175,10 +182,10 @@ module CobwebModule
          increment_process_counter
        end
        #@redis.sadd "queued", @options[:url]
- 
-       yield if block_given?
-       @redis.incr("crawl_job_enqueued_count")
      end
+ 
+     yield if block_given?
+     @redis.incr("crawl_job_enqueued_count")
    end
  
    def finished_processing
@@ -250,17 +257,17 @@ module CobwebModule
    end
  
    def lock(key, &block)
-     #debug_puts "REQUESTING LOCK [#{key}]"
+     debug_puts "REQUESTING LOCK [#{key}]"
      set_nx = @redis.setnx("#{key}_lock", "locked")
-     #debug_puts "LOCK:#{key}:#{set_nx}"
+     debug_puts "LOCK:#{key}:#{set_nx}"
      while !set_nx
-       #debug_puts "===== WAITING FOR LOCK [#{key}] ====="
+       debug_puts "===== WAITING FOR LOCK [#{key}] ====="
        sleep 0.01
        set_nx = @redis.setnx("#{key}_lock", "locked")
      end
  
-     #debug_puts "RECEIVED LOCK [#{key}]"
-     @redis.expire("#{key}_lock", 10)
+     debug_puts "RECEIVED LOCK [#{key}]"
+     @redis.expire("#{key}_lock", 30)
      begin
        result = yield
      ensure
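
Note: lock above is a simple Redis spin lock: SETNX wins the lock, EXPIRE (raised here from 10 to 30 seconds) bounds how long a crashed worker can hold it, and the ensure path releases it. The pattern in isolation (the helper name is mine, not cobweb's):

    require 'redis'

    def with_redis_lock(redis, key, ttl = 30)
      sleep 0.01 until redis.setnx("#{key}_lock", "locked")  # spin until SETNX succeeds
      redis.expire("#{key}_lock", ttl)                       # safety net if we die mid-block
      begin
        yield
      ensure
        redis.del("#{key}_lock")                             # always release the lock
      end
    end

    with_redis_lock(Redis.new, "update_queues") { puts "critical section" }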
data/lib/crawl_helper.rb CHANGED
@@ -150,7 +150,13 @@ class CrawlHelper
      elsif content_request[:use_encoding_safe_process_job]
        content_to_send[:body] = Base64.encode64(content[:body])
        content_to_send[:processing_queue] = content_request[:processing_queue]
-       Resque.enqueue(EncodingSafeProcessJob, content_to_send)
+       if content_request[:queue_system] == :resque
+         Resque.enqueue(EncodingSafeProcessJob, content_to_send)
+       elsif content_request[:queue_system] == :sidekiq
+         const_get(content_request[:processing_queue]).perform_async(content_to_send)
+       else
+         raise "Unknown queue system: #{content_request[:queue_system]}"
+       end
      else
        if content_request[:queue_system] == :resque
          Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
data/lib/crawl_worker.rb CHANGED
@@ -25,17 +25,15 @@ class CrawlWorker
  
    # if the crawled object is an object type we are interested
    if @crawl.content.permitted_type?
- 
-     @crawl.lock("queue_links") do
-       # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
-       @crawl.process_links do |link|
- 
+ 
+     # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
+     @crawl.process_links do |link|
+       @crawl.lock("queue_links") do
          if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
            # enqueue the links to sidekiq
            @crawl.debug_puts "QUEUED LINK: #{link}"
            enqueue_content(content_request, link)
          end
- 
        end
      end
  
@@ -64,7 +62,7 @@ class CrawlWorker
      end
    end
  
-   @crawl.lock("finished") do
+   #@crawl.lock("finished") do
      # let the crawl know we're finished with this object
      @crawl.finished_processing
  
@@ -74,7 +72,7 @@ class CrawlWorker
      @crawl.debug_puts "Calling crawl_job finished"
      finished(content_request)
    end
-   end
+   #end
  end
  def self.jobs
    Sidekiq.redis do |conn|
@@ -99,6 +97,9 @@ class CrawlWorker
    # Enqueues the content to the processing queue setup in options
    def send_to_processing_queue(content, content_request)
      content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
+     content_to_send.keys.each do |key|
+       content_to_send[key] = content_to_send[key].force_encoding('UTF-8') if content_to_send[key].kind_of?(String)
+     end
      if content_request[:direct_call_process_job]
        clazz = content_request[:processing_queue].constantize
        clazz.perform(content_to_send)
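
Note: the loop added to send_to_processing_queue forces every string value to UTF-8 before the payload is handed to the queue, avoiding encoding errors when Sidekiq serializes it to JSON. The same idea on a bare hash (sample data is made up):

    content_to_send = {
      :url         => "http://example.com/caf\xC3\xA9".force_encoding("ASCII-8BIT"),
      :status_code => 200
    }

    content_to_send.keys.each do |key|
      value = content_to_send[key]
      content_to_send[key] = value.force_encoding("UTF-8") if value.kind_of?(String)
    end

    content_to_send[:url].encoding  # => #<Encoding:UTF-8>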
data/lib/sidekiq/cobweb_helper.rb CHANGED
@@ -6,6 +6,13 @@ else
    SIDEKIQ_INSTALLED = false
    puts "sidekiq gem not installed, skipping crawl_worker specs"
  end
+ if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
+   RESQUE_INSTALLED = true
+   require 'resque'
+ else
+   RESQUE_INSTALLED = false
+   puts "resque gem not installed, skipping crawl_job specs"
+ end
  
  module Sidekiq
    module Worker
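
Note: Gem::Specification.find_all_by_name probes the installed gem set without raising, which is why it can feature-flag optional backends at load time. The probe generalized (the helper name is mine):

    def gem_installed?(name, requirement = ">= 0")
      Gem::Specification.find_all_by_name(name, requirement).any?
    end

    puts gem_installed?("redis")         # true if the redis gem is available
    puts gem_installed?("no-such-gem")   # false -- returns [], never raises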
data/lib/stats.rb CHANGED
@@ -1,17 +1,21 @@
  # Stats class is the main statisitics hub for monitoring crawls. Either can be viewed through the Sinatra interface, or returned from the CobwebCrawler.crawl method or block
  class Stats
    require 'json'
-
+
    attr_reader :redis
-
+
    # Sets up redis usage for statistics
    def initialize(options)
      options[:redis_options] = {} unless options.has_key? :redis_options
-     @full_redis = Redis.new(options[:redis_options])
+     if options[:redis]
+       @full_redis = options[:redis]
+     else
+       @full_redis = Redis.new(options[:redis_options])
+     end
      @lock = Mutex.new
      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => @full_redis)
    end
-
+
    # Sets up the crawl in statistics
    def start_crawl(options)
      unless @full_redis.sismember "cobweb_crawls", options[:crawl_id]
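
Note: accepting a pre-built connection through :redis means callers (and tests) can inject something like MockRedis instead of opening a real connection -- the spec suite already requires mock_redis. Hypothetical usage:

    require 'mock_redis'

    stats = Stats.new(:crawl_id => "crawl_0_id", :redis => MockRedis.new)
    stats.update_status("Testing...")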
@@ -23,7 +27,7 @@ class Stats
      @redis.hset "statistics", "crawl_started_at", DateTime.now
      @redis.hset "statistics", "current_status", CobwebCrawlHelper::STARTING
    end
-
+
    # Removes the crawl from the running crawls and updates status
    def end_crawl(options, cancelled=false)
      #@full_redis.srem "cobweb_crawls", options[:crawl_id]
@@ -35,21 +39,21 @@ class Stats
      @redis.hset "statistics", "crawl_finished_at", DateTime.now
      #@redis.del "crawl_details"
    end
-
+
    def get_crawled
      @redis.smembers "crawled"
    end
  
    def inbound_links_for(url)
-     uri = UriHelper.parse(url)
+     uri = UriHelper.parse(url).normalize
      @redis.smembers("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}")
    end
  
-   # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data. 
+   # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
    def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
      @lock.synchronize {
        @statistics = get_statistics
-
+
        if @statistics.has_key? :average_response_time
          @statistics[:average_response_time] = (((@redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
        else
@@ -64,7 +68,7 @@ class Stats
        end
        @statistics[:maximum_length] = content[:length].to_i if @redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > @statistics[:maximum_length].to_i
        @statistics[:minimum_length] = content[:length].to_i if @redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < @statistics[:minimum_length].to_i
-
+
        if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
          @statistics[:page_count] = @statistics[:page_count].to_i + 1
          @statistics[:page_size] = @statistics[:page_size].to_i + content[:length].to_i
@@ -74,14 +78,14 @@ class Stats
          @statistics[:asset_size] = @statistics[:asset_size].to_i + content[:length].to_i
          increment_time_stat("assets_count")
        end
-
+
        total_redirects = @statistics[:total_redirects].to_i
        @statistics[:total_redirects] = 0 if total_redirects.nil?
        @statistics[:total_redirects] = total_redirects += content[:redirect_through].count unless content[:redirect_through].nil?
  
        @statistics[:crawl_counter] = crawl_counter
        @statistics[:queue_counter] = queue_counter
-
+
        total_length = @statistics[:total_length].to_i
        @statistics[:total_length] = total_length + content[:length].to_i
  
@@ -117,7 +121,7 @@ class Stats
        elsif content[:mime_type].cobweb_starts_with? "video"
          increment_time_stat("mime_video_count")
        end
-
+
        status_counts = {}
        if @statistics.has_key? :status_counts
          status_counts = @statistics[:status_counts]
@@ -126,11 +130,11 @@ class Stats
            status_counts[status_code] += 1
          else
            status_counts[status_code] = 1
-         end 
+         end
        else
          status_counts = {status_code => 1}
        end
-
+
        # record statistics by status type
        if content[:status_code] >= 200 && content[:status_code] < 300
          increment_time_stat("status_200_count")
@@ -139,21 +143,21 @@ class Stats
        elsif content[:status_code] >= 500 && content[:status_code] < 600
          increment_time_stat("status|_500_count")
        end
-
+
        @statistics[:status_counts] = status_counts.to_json
-
+
        ## time based statistics
        increment_time_stat("minute_totals", "minute", 60)
-
+
        redis_command = "@redis.hmset 'statistics', #{@statistics.keys.map{|key| "'#{key}', '#{@statistics[key].to_s.gsub("'","''")}'"}.join(", ")}"
        instance_eval redis_command
      }
      @statistics
    end
-
+
    # Returns the statistics hash
    def get_statistics
-
+
      statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
      if statistics[:status_counts].nil?
        statistics[:status_counts]
@@ -167,23 +171,23 @@ class Stats
      end
      statistics
    end
-
+
    # Sets the current status of the crawl
    def update_status(status)
      @redis.hset("statistics", "current_status", status) unless get_status == CobwebCrawlHelper::CANCELLED
    end
-
+
    # Returns the current status of the crawl
    def get_status
      @redis.hget "statistics", "current_status"
    end
-
+
    # Sets totals for the end of the crawl (Not Used)
    def set_totals
      stats = get_statistics
      stats[:crawled] = @redis.smembers "crawled"
    end
-
+
    private
    # Records a time based statistic
    def record_time_stat(stat_name, value, type="minute", duration=60)
@@ -193,7 +197,7 @@ class Stats
      end
      stat_value = @redis.hget(stat_name, key).to_i
      stat_count = @redis.hget("#{stat_name}-count", key).to_i
-
+
      if minute_count.nil?
        @redis.hset stat_name, key, value
        @redis.hset "#{stat_name}-count", key, 1
@@ -202,7 +206,7 @@ class Stats
        @redis.hset "#{stat_name}-count", key, stat_count+1
      end
    end
-
+
    # Increments a time based statistic (eg pages per minute)
    def increment_time_stat(stat_name, type="minute", duration=60)
      key = DateTime.now.strftime("%Y-%m-%d %H:%M")
@@ -218,12 +222,9 @@ class Stats
      #clear up older data
      @redis.hgetall(stat_name).keys.each do |key|
        if DateTime.parse(key) < DateTime.now-(duration/1440.0)
-         puts "Deleting #{stat_name} - #{key}"
          @redis.hdel(stat_name, key)
        end
      end
    end
-
-   end
-
  
+ end
@@ -3,6 +3,8 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
  describe CobwebCrawlHelper do
    include HttpStubs
    before(:each) do
+     pending("not enabled for non resque installs") unless RESQUE_INSTALLED
+ 
      setup_stubs
    end
    # this spec tests the crawl object
@@ -42,14 +44,14 @@ describe CobwebCrawlHelper do
      end
    end
    after(:each) do
-     Resque.remove_queue("cobweb_crawl_job")
+     Resque.remove_queue("cobweb_crawl_job") if RESQUE_INSTALLED
    end
    it "should have a queue length of 210" do
      Resque.size("cobweb_crawl_job").should == 210
    end
    describe "after called" do
      before(:each) do
-       @crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id"})
+       @crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id", :queue_system => :resque})
        @crawl.destroy
      end
      it "should delete only the crawl specified" do
@@ -3,6 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
  describe CobwebCrawler do
  
    before(:each) do
+     pending("thin not installed") unless THIN_INSTALLED
  
      @base_url = "http://localhost:3532/"
  
@@ -55,12 +56,12 @@ describe CobwebCrawler do
    context "storing inbound links" do
  
      before(:each) do
+       pending("thin not installed") unless THIN_INSTALLED
        @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :store_inbound_links => true})
        @statistics = @crawler.crawl(@base_url)
      end
  
      it "should store inbound links" do
- 
        @statistics.inbound_links_for("http://localhost:3532/typography.html").should_not be_empty
        @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
      end
@@ -6,48 +6,51 @@ describe CrawlJob, :local_only => true, :disabled => true do
  
    before(:all) do
      #store all existing resque process ids so we don't kill them afterwards
+     if RESQUE_INSTALLED && THIN_INSTALLED
+ 
+       @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+       if Resque.workers.count > 0 && @existing_processes.empty?
+         raise "Ghost workers present in resque, please clear before running specs (Resque::Worker.all.first.prune_dead_workers)"
+       elsif Resque.workers.count == 0 && !@existing_processes.empty?
+         raise "Ghost worker processes present (#{@existing_processes.join(',')})"
+       elsif Resque.workers.count > 0 && !@existing_processes.empty?
+         raise "Resque workers present, please end other resque processes before running this spec"
+       end
  
-     @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
-     if Resque.workers.count > 0 && @existing_processes.empty?
-       raise "Ghost workers present in resque, please clear before running specs (Resque::Worker.all.first.prune_dead_workers)"
-     elsif Resque.workers.count == 0 && !@existing_processes.empty?
-       raise "Ghost worker processes present (#{@existing_processes.join(',')})"
-     elsif Resque.workers.count > 0 && !@existing_processes.empty?
-       raise "Resque workers present, please end other resque processes before running this spec"
-     end
- 
-     # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
-     `mkdir log` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../log'))
-     `mkdir tmp` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp'))
-     `mkdir tmp/pids` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp/pids'))
-     io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
- 
-     counter = 0
-     until counter > 10 || workers_processes_started?
-       print "\rStarting Resque Processes... #{10-counter} "
-       counter += 1
-       sleep 1
-     end
-     puts ""
+       # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
+       `mkdir log` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../log'))
+       `mkdir tmp` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp'))
+       `mkdir tmp/pids` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp/pids'))
+       io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
+ 
+       counter = 0
+       until counter > 10 || workers_processes_started?
+         print "\rStarting Resque Processes... #{10-counter} "
+         counter += 1
+         sleep 1
+       end
+       puts ""
  
  
-     counter = 0
-     until counter > 30 || workers_running?
-       print "\rWaiting for Resque Workers... #{30-counter} "
-       counter += 1
-       sleep 1
-     end
-     puts ""
+       counter = 0
+       until counter > 30 || workers_running?
+         print "\rWaiting for Resque Workers... #{30-counter} "
+         counter += 1
+         sleep 1
+       end
+       puts ""
  
-     if workers_running?
-       puts "Workers Running."
-     else
-       raise "Workers didn't appear, please check environment"
+       if workers_running?
+         puts "Workers Running."
+       else
+         raise "Workers didn't appear, please check environment"
+       end
      end
- 
    end
  
    before(:each) do
+     pending("Resque not installed") unless RESQUE_INSTALLED
+     pending("thin not installed") unless THIN_INSTALLED
      @base_url = "http://localhost:3532/"
      @base_page_count = 77
  
@@ -4,8 +4,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
  
  describe CrawlWorker, :local_only => true do
  
    before(:all) do
- 
-     if SIDEKIQ_INSTALLED
+     if SIDEKIQ_INSTALLED && THIN_INSTALLED
        #store all existing resque process ids so we don't kill them afterwards
        @existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")
  
@@ -22,6 +21,7 @@ describe CrawlWorker, :local_only => true do
  
    before(:each) do
      pending("Sidkiq not installed") unless SIDEKIQ_INSTALLED
+     pending("thin not installed") unless THIN_INSTALLED
      @base_url = "http://localhost:3532/"
      @base_page_count = 77
  
@@ -9,6 +9,7 @@ describe Robots do
  
    describe "default user-agent" do
      before(:each) do
+       pending("thin not installed") unless THIN_INSTALLED
        @options = {:url => "http://localhost:3532/"}
      end
  
data/spec/spec_helper.rb CHANGED
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
  require File.expand_path(File.dirname(__FILE__) + '/../spec/samples/sample_server')
  require File.expand_path(File.dirname(__FILE__) + '/../spec/http_stubs')
  require 'mock_redis'
- require 'thin' if ENV["TRAVIS_RUBY_VERSION"].nil?
+ 
  
  require 'coveralls'
  Coveralls.wear!
@@ -17,9 +17,14 @@ RSpec.configure do |config|
    if ENV["TRAVIS_RUBY_VERSION"] || ENV['CI']
      config.filter_run_excluding :local_only => true
    end
- 
-   Thread.new do
-     @thin ||= Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
+ 
+   THIN_INSTALLED = false
+   if Gem::Specification.find_all_by_name("thin", ">=1.0.0").count >= 1
+     require 'thin'
+     THIN_INSTALLED = true
+     Thread.new do
+       @thin ||= Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
+     end
    end
  
    # WAIT FOR START TO COMPLETE
metadata CHANGED
@@ -1,20 +1,18 @@
  --- !ruby/object:Gem::Specification
  name: cobweb
  version: !ruby/object:Gem::Version
-   version: 1.0.19
-   prerelease:
+   version: 1.0.20
  platform: ruby
  authors:
  - Stewart McKee
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-11-26 00:00:00.000000000 Z
+ date: 2014-08-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: redis
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: nokogiri
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: addressable
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -54,23 +48,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
-     requirements:
-     - - ! '>='
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: rspec
-   requirement: !ruby/object:Gem::Requirement
-     none: false
-     requirements:
-     - - ! '>='
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :runtime
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -78,7 +55,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: awesome_print
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -86,7 +62,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -94,23 +69,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: sinatra
    requirement: !ruby/object:Gem::Requirement
-     none: false
-     requirements:
-     - - ! '>='
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :runtime
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     none: false
-     requirements:
-     - - ! '>='
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: thin
-   requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -118,7 +76,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -126,7 +83,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: haml
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -134,15 +90,13 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
          version: '0'
  - !ruby/object:Gem::Dependency
-   name: namespaced_redis
+   name: redis-namespace
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -150,7 +104,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -158,7 +111,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: json
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -166,7 +118,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -174,7 +125,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: slop
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -182,7 +132,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -564,27 +513,26 @@ files:
  homepage: http://github.com/stewartmckee/cobweb
  licenses:
  - MIT
+ metadata: {}
  post_install_message:
  rdoc_options: []
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
-   none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
      version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
-   none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
      version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.25
+ rubygems_version: 2.1.11
  signing_key:
- specification_version: 3
+ specification_version: 4
  summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
  crawl extremely large sites faster than multi-threaded crawlers. It is also a standalone
  crawler that has a sophisticated statistics monitoring interface to monitor the