RubyGems - cobweb - Versions diffs - 0.0.68 → 0.0.70 - Mend

cobweb 0.0.68 → 0.0.70

Files changed (8) hide show

data/README.textile +5 -7
data/lib/{crawl.rb → cobweb_crawl_helper.rb} +1 -1
data/lib/cobweb_version.rb +1 -1
data/lib/crawl_job.rb +10 -7
data/lib/stats.rb +4 -4
data/spec/cobweb/cobweb_job_spec.rb +3 -3
data/spec/cobweb/crawl_spec.rb +5 -5
metadata +23 -23

data/README.textile CHANGED

@@ -1,5 +1,5 @@
-h1. Cobweb v0.0.68
+h1. Cobweb v0.0.70
 !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
@@ -138,15 +138,13 @@ You can also run within a block and get access to each page as it is being crawl
 bc. statistics = CobwebCrawler.new(:cache => 600).crawl("http://www.pepsico.com") do |page|
   puts "Just crawled #{page[:url]} and got a status of #{page[:status_code]}."
 end
-puts "Finished Crawl in "
+puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:asset_count]} assets."
+h3. CobwebCrawlHelper
+The CobwebCrawlHelper class is a helper class to assist in getting information about a crawl and to perform functions against the crawl.
-h3. Crawl
-The crawl class is a helper class to assist in getting information about a crawl and to perform functions against the crawl
-bc. crawl = Crawl.new(options)
+bc. crawl = CobwebCrawlHelper.new(options)
   * options - the hash of options passed into Cobweb.new (must include a :crawl_id)

data/lib/{crawl.rb → cobweb_crawl_helper.rb} RENAMED

@@ -1,5 +1,5 @@
 # The crawl class gives easy access to information about the crawl, and gives the ability to stop a crawl
-class Crawl
+class CobwebCrawlHelper
   attr_accessor :id

data/lib/cobweb_version.rb CHANGED

@@ -3,7 +3,7 @@ class CobwebVersion
   # Returns a string of the current version
   def self.version
-    "0.0.68"
+    "0.0.70"
   end
 end

data/lib/crawl_job.rb CHANGED

@@ -14,7 +14,7 @@ class CrawlJob
     # change all hash keys to symbols
     content_request = HashUtil.deep_symbolize_keys(content_request)
     @content_request = content_request
-    @crawl = Crawl.new(content_request)
+    @crawl = CobwebCrawlHelper.new(content_request)
     content_request[:redis_options] = {} unless content_request.has_key? :redis_options
     content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
@@ -28,13 +28,14 @@ class CrawlJob
     # check we haven't crawled this url before
     unless @redis.sismember "crawled", content_request[:url]
       # if there is no limit or we're still under it lets get the url
-      if within_crawl_limits?(content_request[:crawl_limit]) and @crawl.status != Crawl::CANCELLED
+      if within_crawl_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
         content = Cobweb.new(content_request).get(content_request[:url], content_request)
         if content_request[:url] == @redis.get("original_base_url")
            @redis.set("crawled_base_url", content[:base_url])
         end
         if is_permitted_type(content)
           begin
+            @redis.incr "inprogress"
             # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
             @redis.srem "queued", content_request[:url]
             @redis.sadd "crawled", content_request[:url]
@@ -70,8 +71,8 @@ class CrawlJob
               internal_links.each do |link|
                 puts link
-                puts "Not enqueuing due to cancelled crawl" if @crawl.status == Crawl::CANCELLED
-                if within_queue_limits?(content_request[:crawl_limit]) and @crawl.status != Crawl::CANCELLED
+                puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
+                if within_queue_limits?(content_request[:crawl_limit]) and @crawl.status != CobwebCrawlHelper::CANCELLED
                   enqueue_content(content_request, link)
                 end
               end
@@ -88,6 +89,7 @@ class CrawlJob
             end
           ensure
+            @redis.decr "inprogress"
             #update the queued and crawled lists if we are within the crawl limits.
             # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
@@ -126,10 +128,10 @@ class CrawlJob
   end
-  # Sets the crawl status to Crawl::FINISHED and enqueues the crawl finished job
+  # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
   def self.finished(content_request)
     # finished
-    if @crawl.status != Crawl::FINISHED and @crawl.status != Crawl::CANCELLED
+    if @crawl.status != CobwebCrawlHelper::FINISHED and @crawl.status != CobwebCrawlHelper::CANCELLED && @redis.get("inprogress").to_i==0
       ap "CRAWL FINISHED  #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
       @stats.end_crawl(content_request)
@@ -195,8 +197,9 @@ class CrawlJob
     new_request = content_request.clone
     new_request[:url] = link
     new_request[:parent] = content_request[:url]
-    Resque.enqueue(CrawlJob, new_request)
+    #to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
     @redis.sadd "queued", link
+    Resque.enqueue(CrawlJob, new_request)
     increment_queue_counter
   end

data/lib/stats.rb CHANGED

@@ -16,16 +16,16 @@ class Stats
         @redis.hset "crawl_details", key, options[key].to_s
       end
     end
-    @redis.hset "statistics", "current_status", Crawl::STARTING
+    @redis.hset "statistics", "current_status", CobwebCrawlHelper::STARTING
   end
   # Removes the crawl from the running crawls and updates status
   def end_crawl(options, cancelled=false)
     @full_redis.srem "cobweb_crawls", options[:crawl_id]
     if cancelled
-      @redis.hset "statistics", "current_status", Crawl::CANCELLED
+      @redis.hset "statistics", "current_status", CobwebCrawlHelper::CANCELLED
     else
-      @redis.hset "statistics", "current_status", Crawl::FINISHED
+      @redis.hset "statistics", "current_status", CobwebCrawlHelper::FINISHED
     end
     @redis.del "crawl_details"
   end
@@ -158,7 +158,7 @@ class Stats
   # Sets the current status of the crawl
   def update_status(status)
-    #@redis.hset("statistics", "current_status", status) unless status == Crawl::CANCELLED
+    #@redis.hset("statistics", "current_status", status) unless status == CobwebCrawlHelper::CANCELLED
   end
   # Returns the current status of the crawl

data/spec/cobweb/cobweb_job_spec.rb CHANGED

@@ -33,7 +33,7 @@ describe Cobweb, :local_only => true do
     end
     it "should not crawl anything if nothing has started" do
       crawl = @cobweb.start(@base_url)
-      crawl_obj = Crawl.new(crawl)
+      crawl_obj = CobwebCrawlHelper.new(crawl)
       crawl_obj.destroy
       @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
       wait_for_crawl_finished crawl[:crawl_id]
@@ -42,7 +42,7 @@ describe Cobweb, :local_only => true do
     it "should not complete the crawl when cancelled" do
       crawl = @cobweb.start(@base_url)
-      crawl_obj = Crawl.new(crawl)
+      crawl_obj = CobwebCrawlHelper.new(crawl)
       sleep 6
       crawl_obj.destroy
       @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -231,7 +231,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
   end
   def running?(crawl_id)
-    @stat.get_status != Crawl::FINISHED and @stat.get_status != Crawl::CANCELLED
+    @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
   end
   def clear_queues

data/spec/cobweb/crawl_spec.rb CHANGED

@@ -1,23 +1,23 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
-describe Crawl do
+describe CobwebCrawlHelper do
   # this spec tests the crawl object
   describe "initialize" do
     describe "without data" do
       it "should raise an exception" do
-        lambda {Crawl.new}.should raise_exception
+        lambda {CobwebCrawlHelper.new}.should raise_exception
       end
     end
     describe "with data" do
       before(:each) do
         data = {:crawl_id => "asdf"}
-        @crawl = Crawl.new(data)
+        @crawl = CobwebCrawlHelper.new(data)
       end
       it "should create a crawl object" do
-        @crawl.should be_an_instance_of Crawl
+        @crawl.should be_an_instance_of CobwebCrawlHelper
       end
       it "should return an id" do
         @crawl.should respond_to "id"
@@ -46,7 +46,7 @@ describe Crawl do
         end
         describe "after called" do
           before(:each) do
-            @crawl = Crawl.new({:crawl_id => "crawl_0_id"})
+            @crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id"})
             @crawl.destroy
           end
           it "should delete only the crawl specified" do

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.68
+  version: 0.0.70
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-09-10 00:00:00.000000000 Z
+date: 2012-09-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &70324863540700 !ruby/object:Gem::Requirement
+  requirement: &70248368307060 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70324863540700
+  version_requirements: *70248368307060
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &70324863539560 !ruby/object:Gem::Requirement
+  requirement: &70248368306020 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70324863539560
+  version_requirements: *70248368306020
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &70324863538960 !ruby/object:Gem::Requirement
+  requirement: &70248368305360 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70324863538960
+  version_requirements: *70248368305360
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &70324863537700 !ruby/object:Gem::Requirement
+  requirement: &70248368304140 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70324863537700
+  version_requirements: *70248368304140
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &70324863537120 !ruby/object:Gem::Requirement
+  requirement: &70248368303560 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70324863537120
+  version_requirements: *70248368303560
 - !ruby/object:Gem::Dependency
   name: awesome_print
-  requirement: &70324863536500 !ruby/object:Gem::Requirement
+  requirement: &70248368302820 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70324863536500
+  version_requirements: *70248368302820
 - !ruby/object:Gem::Dependency
   name: sinatra
-  requirement: &70324863535620 !ruby/object:Gem::Requirement
+  requirement: &70248368302080 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70324863535620
+  version_requirements: *70248368302080
 - !ruby/object:Gem::Dependency
   name: thin
-  requirement: &70324863534860 !ruby/object:Gem::Requirement
+  requirement: &70248368301260 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70324863534860
+  version_requirements: *70248368301260
 - !ruby/object:Gem::Dependency
   name: haml
-  requirement: &70324863534000 !ruby/object:Gem::Requirement
+  requirement: &70248368300400 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70324863534000
+  version_requirements: *70248368300400
 - !ruby/object:Gem::Dependency
   name: namespaced_redis
-  requirement: &70324863533220 !ruby/object:Gem::Requirement
+  requirement: &70248368299680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
         version: 1.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *70324863533220
+  version_requirements: *70248368299680
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
  crawl extremely large sites which is much more performant than multi-threaded crawlers.  It
   is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -310,13 +310,13 @@ files:
 - spec/spec.opts
 - spec/spec_helper.rb
 - lib/cobweb.rb
+- lib/cobweb_crawl_helper.rb
 - lib/cobweb_crawler.rb
 - lib/cobweb_finished_job.rb
 - lib/cobweb_links.rb
 - lib/cobweb_process_job.rb
 - lib/cobweb_version.rb
 - lib/content_link_parser.rb
-- lib/crawl.rb
 - lib/crawl_job.rb
 - lib/encoding_safe_process_job.rb
 - lib/hash_util.rb