cobweb 1.0.28 → 1.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,19 +52,15 @@ class CobwebCrawlHelper
  end
  if options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
 
- puts "deleteing from crawl_worker"
  queue = Sidekiq::Queue.new("crawl_worker")
  queue.each do |job|
- ap job.args # => [1, 2, 3]
  job.delete if job.args[0]["crawl_id"] == id
  end
 
 
  process_queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
- puts "deleting from #{process_queue_name}"
  queue = Sidekiq::Queue.new(process_queue_name)
  queue.each do |job|
- ap job.args # => [1, 2, 3]
  job.delete if job.args[0]["crawl_id"] == id
  end
  end
@@ -102,4 +98,4 @@ class CobwebCrawlHelper
  @data[:crawl_id]
  end
 
- end
+ end
@@ -3,7 +3,7 @@ class CobwebVersion
 
  # Returns a string of the current version
  def self.version
- "1.0.28"
+ "1.0.29"
  end
 
- end
+ end
@@ -14,31 +14,31 @@ require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
  class CrawlWorker
  include Sidekiq::Worker
  sidekiq_options :queue => "crawl_worker", :retry => false if SIDEKIQ_INSTALLED
-
+
  def perform(content_request)
  puts "Performing for #{content_request["url"]}"
  # setup the crawl class to manage the crawl of this object
  @crawl = CobwebModule::Crawl.new(content_request)
-
+
  # update the counters and then perform the get, returns false if we are outwith limits
  if @crawl.retrieve
-
+
  # if the crawled object is an object type we are interested
  if @crawl.content.permitted_type?
-
+
  # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
  @crawl.process_links do |link|
  @crawl.lock("queue_links") do
  if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
  # enqueue the links to sidekiq
- @crawl.debug_puts "QUEUED LINK: #{link}"
+ @crawl.debug_puts "QUEUED LINK: #{link}"
  enqueue_content(content_request, link)
  end
  end
  end
-
+
  if @crawl.to_be_processed?
-
+
  @crawl.process do
 
  # enqueue to processing queue
@@ -51,17 +51,17 @@ class CrawlWorker
  current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
  end
-
+
  end
  else
  @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
  @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
  @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
  end
-
+
  end
  end
-
+
  #@crawl.lock("finished") do
  # let the crawl know we're finished with this object
  @crawl.finished_processing
@@ -79,7 +79,7 @@ class CrawlWorker
  conn.smembers(get_sidekiq_options[:queue]).count
  end
  end
-
+
 
  # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
  def finished(content_request)
@@ -93,7 +93,7 @@ class CrawlWorker
  @crawl.redis.incr("crawl_finished_enqueued_count")
  content_request[:crawl_finished_queue].constantize.perform_async(@crawl.statistics.merge(additional_stats))
  end
-
+
  # Enqueues the content to the processing queue setup in options
  def send_to_processing_queue(content, content_request)
  content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
@@ -110,7 +110,7 @@ class CrawlWorker
  end
 
  private
-
+
  # Enqueues content to the crawl_job queue
  def enqueue_content(content_request, link)
  new_request = content_request.clone
@@ -119,4 +119,4 @@ class CrawlWorker
  CrawlWorker.perform_async(new_request)
  end
 
- end
+ end
@@ -35,7 +35,7 @@ class ExportCommand
  Dir.mkdir(options[:root_path]) unless File.exist?(options[:root_path])
 
  uri.path.split("/")[0..-2].each do |dir|
- path+="/" unless path.ends_with?("/")
+ path+="/" unless path.cobweb_ends_with?("/")
  path+=dir
  if File.exist?(options[:root_path] + path) && !File.directory?(options[:root_path] + path)
  FileUtils.mv(options[:root_path] + path, options[:root_path] + path + ".tmp")
@@ -45,7 +45,7 @@ class ExportCommand
  Dir.mkdir(options[:root_path] + path) unless Dir.exist?(options[:root_path] + path)
  end
  end
- path += "/" unless path.ends_with?("/")
+ path += "/" unless path.cobweb_ends_with?("/")
  filename = uri.path.split("/")[-1]
  if filename.nil? || filename.empty?
  filename = "index.html"
@@ -82,4 +82,4 @@ class ExportCommand
  end
 
  end
- end
+ end
@@ -22,7 +22,7 @@ class ReportCommand
  CSV.open(options[:output], "wb", :force_quotes => true) do |csv|
 
  statistics = @crawler.crawl(options[:url]) do |page|
- puts "Reporting on #{page[:url]}"
+ puts "Reporting on #{page[:url]} [#{page[:status_code]}]"
  @doc = page[:body]
  page["link_rel"] = scope.link_tag_with_rel("canonical")["href"]
  page["title"] = scope.head_tag.title_tag.contents
@@ -1,12 +1,7 @@
  class String
 
- # add ends_with? support if method is missing
- def method_missing(m, *args, &block)
- if m == :ends_with?
- suffix = args[0]
- suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
- else
- super
- end
+ def cobweb_ends_with?(val)
+ suffix = val
+ suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
  end
- end
+ end
@@ -4,9 +4,9 @@ describe CobwebCrawler do
 
  before(:each) do
  pending("thin not installed") unless THIN_INSTALLED
-
+
  @base_url = "http://localhost:3532/"
-
+
  @default_headers = {"Cache-Control" => "private, max-age=0",
  "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
  "Expires" => "-1",
@@ -16,13 +16,13 @@ describe CobwebCrawler do
  "Server" => "gws",
  "X-XSS-Protection" => "1; mode=block"}
 
- end
+ end
+
 
-
  it "should generate a cobweb_crawler object" do
  CobwebCrawler.new.should be_an_instance_of CobwebCrawler
  end
-
+
  describe "crawl" do
 
  it "should crawl a site" do
@@ -36,21 +36,21 @@ describe CobwebCrawler do
  @statistics.get_statistics[:mime_counts]["text/html"].should == 8
  @statistics.get_statistics[:mime_counts]["text/css"].should == 18
  @statistics.get_statistics[:mime_counts]["image/jpeg"].should == 25
-
+
  end
-
+
  it "should take a block" do
  crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :crawl_limit => 1})
  statistics = crawler.crawl(@base_url) do |content, statistics|
  content[:url].should_not be_nil
  statistics[:average_length].should_not be_nil
  end
-
+
  statistics.should_not be_nil
  statistics.get_statistics.should be_an_instance_of Hash
-
+
  statistics.get_statistics[:mime_counts]["text/html"].should == 1
-
+
  end
 
  context "internal_links" do
@@ -91,19 +91,19 @@ describe CobwebCrawler do
  before(:each) do
  pending("thin not installed") unless THIN_INSTALLED
  @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :store_inbound_links => true})
- @statistics = @crawler.crawl(@base_url)
+ @statistics = @crawler.crawl(@base_url)
  end
 
  it "should store inbound links" do
  @statistics.inbound_links_for("http://localhost:3532/typography.html").should_not be_empty
- @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+ @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgridwithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
  end
 
  it "should handle url encoding" do
- @statistics.inbound_links_for("http://localhost:3532/boxgrid%3Ewithsillyname.html").sort.should == ["http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/gallery.html", "http://localhost:3532/more.html", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+ @statistics.inbound_links_for("http://localhost:3532/boxgridwithsillyname.html").sort.should == ["http://localhost:3532/boxgridwithsillyname.html", "http://localhost:3532/gallery.html", "http://localhost:3532/more.html", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
  end
 
  end
- end
+ end
 
- end
+ end
@@ -6,7 +6,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
 
  before(:all) do
  #store all existing resque process ids so we don't kill them afterwards
- if RESQUE_INSTALLED && THIN_INSTALLED
+ if RESQUE_INSTALLED && THIN_INSTALLED
 
  @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
  if Resque.workers.count > 0 && @existing_processes.empty?
@@ -168,7 +168,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
  # wait_for_crawl_finished crawl[:crawl_id]
  # @redis.get("crawl_job_enqueued_count").to_i.should == 20
  # end
- #
+ #
  # end
  describe "limit to 1" do
  before(:each) do
@@ -271,11 +271,13 @@ describe CrawlJob, :local_only => true, :disabled => true do
 
  after(:all) do
 
- @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
- command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
- IO.popen(command)
+ if RESQUE_INSTALLED
+ @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+ command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
+ IO.popen(command)
 
- clear_queues
+ clear_queues
+ end
  end
 
  end
@@ -9,14 +9,15 @@ describe CrawlWorker, :local_only => true do
  @existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")
 
  raise "Sidekiq is already running, please stop before running specs." if @existing_processes.count > 0
-
+
  # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
  puts "Starting Workers... Please Wait..."
  `mkdir log`
  `rm -rf output.log`
- io = IO.popen("nohup sidekiq -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log &")
+ puts "calling: nohup sidekiq -v -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log 2>&1"
+ io = IO.popen("nohup sidekiq -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log 2>&1")
  puts "Workers Started."
- end
+ end
  end
 
  before(:each) do
@@ -24,7 +25,7 @@ describe CrawlWorker, :local_only => true do
  pending("thin not installed") unless THIN_INSTALLED
  @base_url = "http://localhost:3532/"
  @base_page_count = 77
-
+
  clear_sidekiq_queues
  end
 
@@ -40,7 +41,7 @@ describe CrawlWorker, :local_only => true do
  }
  @cobweb = Cobweb.new @request
  end
-
+
  it "should crawl entire site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -66,7 +67,7 @@ describe CrawlWorker, :local_only => true do
  }
  @cobweb = Cobweb.new @request
  end
-
+
  it "should only crawl html pages" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -78,10 +79,10 @@ describe CrawlWorker, :local_only => true do
  mime_types.count.should == 8
  mime_types.map{|m| m.should == "text/html"}
  mime_types.select{|m| m=="text/html"}.count.should == 8
-
-
+
+
  end
-
+
  end
  describe "with a crawl limit" do
  before(:each) do
@@ -93,34 +94,34 @@ describe CrawlWorker, :local_only => true do
  :cache => nil
  }
  end
-
+
  describe "of 1" do
  before(:each) do
  @request[:crawl_limit] = 1
  @cobweb = Cobweb.new @request
  end
-
+
  it "should not crawl the entire site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should_not == @base_page_count
- end
+ end
  it "should only crawl 1 page" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should == 1
- end
+ end
  it "should notify of crawl finished" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlFinishedWorker.queue_size.should == 1
- end
-
+ end
+
  end
-
+
  describe "of 5" do
  before(:each) do
  @request[:crawl_limit] = 5
@@ -131,36 +132,36 @@ describe CrawlWorker, :local_only => true do
  @request[:crawl_limit_by_page] = true
  @cobweb = Cobweb.new @request
  end
-
+
  it "should only use html pages towards the crawl limit" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
-
+
  mime_types = CrawlProcessWorker.queue_items(0, 200).map{|job| JSON.parse(job)["args"][0]["mime_type"]}
  mime_types.select{|m| m=="text/html"}.count.should == 5
  end
  end
  end
-
+
  describe "of 10" do
  before(:each) do
  @request[:crawl_limit] = 10
  @cobweb = Cobweb.new @request
  end
-
+
  it "should not crawl the entire site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should_not == @base_page_count
- end
+ end
  it "should notify of crawl finished" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlFinishedWorker.queue_size.should == 1
- end
+ end
  it "should only crawl 10 objects" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -168,38 +169,38 @@ describe CrawlWorker, :local_only => true do
  CrawlProcessWorker.queue_size.should == 10
  end
  end
-
+
  describe "of 100" do
  before(:each) do
  @request[:crawl_limit] = 100
  @cobweb = Cobweb.new @request
  end
-
+
  it "should crawl the entire sample site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should == @base_page_count
- end
+ end
  it "should notify of crawl finished" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlFinishedWorker.queue_size.should == 1
- end
+ end
  it "should not crawl more than 100 pages" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should_not > 100
- end
- end
+ end
+ end
  end
 
  after(:all) do
  @all_processes = `ps aux | grep sidekiq | grep -v grep | grep -v sidekiq-web | awk '{print $2}'`.split("\n")
  unless (@all_processes - @existing_processes).empty?
- command = "kill #{(@all_processes - @existing_processes).join(" ")}"
+ command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
  IO.popen(command)
  end
  clear_sidekiq_queues
@@ -211,6 +212,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
  @counter = 0
  start_time = Time.now
  while(running?(crawl_id) && Time.now < start_time + timeout) do
+ puts Sidekiq::Stats.new.queues
  sleep 1
  end
  if Time.now > start_time + timeout
@@ -246,9 +248,7 @@ def clear_sidekiq_queues
  end
  end
  sleep 5
-
+
  CrawlProcessWorker.queue_size.should == 0
  CrawlFinishedWorker.queue_size.should == 0
  end
-
-