cobweb 1.0.11 → 1.0.12

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
data/lib/crawl_job.rb CHANGED
@@ -84,6 +84,7 @@ class CrawlJob
   def self.send_to_processing_queue(content, content_request)
     content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
     if content_request[:direct_call_process_job]
+      #clazz = content_request[:processing_queue].to_s.constantize
       clazz = const_get(content_request[:processing_queue])
       clazz.perform(content_to_send)
     elsif content_request[:use_encoding_safe_process_job]
@@ -107,6 +108,5 @@ class CrawlJob
     #to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
     Resque.enqueue(CrawlJob, new_request)
   end
-
 
-end
+end
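
The commented-out line above documents the trade-off in resolving the processing class: ActiveSupport's `String#constantize` handles namespaced names such as `"Foo::Bar"`, while `const_get` only resolves constants visible from the receiver. A minimal sketch of the dispatch, using a hypothetical `ContentProcessJob` as a stand-in for whatever class `content_request[:processing_queue]` names:

```ruby
# Hypothetical stand-in for the configured processing queue class.
class ContentProcessJob
  def self.perform(content)
    puts "processing #{content[:url]}"
  end
end

clazz = Object.const_get("ContentProcessJob")  # constant lookup like crawl_job.rb's
clazz.perform(:url => "http://example.com/")   # direct, in-process call
```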
@@ -0,0 +1,31 @@
+
+require 'sidekiq'
+require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
+
+# If your client is single-threaded, we just need a single connection in our Redis connection pool
+#Sidekiq.configure_client do |config|
+#  config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
+#end
+
+# Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
+#Sidekiq.configure_server do |config|
+#  config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
+#end
+
+class CrawlProcessWorker
+
+  include Sidekiq::Worker
+
+  sidekiq_options queue: "crawl_process_worker"
+
+  def perform(content)
+    content = HashUtil.deep_symbolize_keys(content)
+    puts "Dummy Processing for #{content[:url]}"
+  end
+  def self.queue_size
+    Sidekiq.redis do |conn|
+      conn.llen("queue:#{get_sidekiq_options["queue"]}")
+    end
+  end
+
+end
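
As a usage sketch for the new worker above (the queue name comes from its `sidekiq_options`; the payload shape mirrors what `perform` expects), jobs would be enqueued through Sidekiq's standard client API:

```ruby
# Enqueue a processing job; Sidekiq serializes the argument to JSON,
# which is why perform re-symbolizes the keys on the way back out.
CrawlProcessWorker.perform_async("url" => "http://example.com/page.html")

CrawlProcessWorker.queue_size  # => pending jobs, via the llen call above
```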
@@ -0,0 +1,118 @@
+require 'sidekiq'
+require File.expand_path(File.dirname(__FILE__) + '/cobweb')
+require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
+
+# If your client is single-threaded, we just need a single connection in our Redis connection pool
+#Sidekiq.configure_client do |config|
+#  config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
+#end
+
+# Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
+#Sidekiq.configure_server do |config|
+#  config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
+#end
+
+class CrawlWorker
+  include Sidekiq::Worker
+  sidekiq_options queue: "crawl_worker"
+  sidekiq_options retry: false
+
+  def perform(content_request)
+    # setup the crawl class to manage the crawl of this object
+    @crawl = CobwebModule::Crawl.new(content_request)
+
+    # update the counters and then perform the get, returns false if we are outwith limits
+    if @crawl.retrieve
+
+      # if the crawled object is an object type we are interested
+      if @crawl.content.permitted_type?
+
+        # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
+        @crawl.process_links do |link|
+
+          @crawl.debug_puts "ENQUEUED LINK: #{link}"
+          enqueue_content(content_request, link)
+
+        end
+
+        if @crawl.to_be_processed?
+
+          @crawl.process do
+
+            # enqueue to processing queue
+            @crawl.debug_puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
+            send_to_processing_queue(@crawl.content.to_hash, content_request)
+
+            #if the enqueue counter has been requested update that
+            if content_request.has_key?(:enqueue_counter_key)
+              enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
+              current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+              enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
+            end
+
+          end
+        else
+          @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
+          @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
+          @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
+        end
+
+      end
+    end
+
+    @crawl.lock("finished") do
+      # let the crawl know we're finished with this object
+      @crawl.finished_processing
+
+      # test queue and crawl sizes to see if we have completed the crawl
+      @crawl.debug_puts "finished? #{@crawl.finished?}"
+      @crawl.debug_puts "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
+      if @crawl.finished? && @crawl.first_to_finish?
+        @crawl.debug_puts "Calling crawl_job finished"
+        finished(content_request)
+      end
+    end
+  end
+  def self.jobs
+    Sidekiq.redis do |conn|
+      conn.smembers(get_sidekiq_options[:queue]).count
+    end
+  end
+
+
+  # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
+  def finished(content_request)
+    additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @crawl.crawled_base_url}
+    additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
+    additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
+
+    @crawl.finished
+
+    @crawl.debug_puts "increment crawl_finished_enqueued_count"
+    @crawl.redis.incr("crawl_finished_enqueued_count")
+    content_request[:crawl_finished_queue].constantize.perform_async(@crawl.statistics.merge(additional_stats))
+  end
+
+  # Enqueues the content to the processing queue setup in options
+  def send_to_processing_queue(content, content_request)
+    content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
+    if content_request[:direct_call_process_job]
+      clazz = content_request[:processing_queue].constantize
+      clazz.perform(content_to_send)
+    else
+      content_request[:processing_queue].constantize.perform_async(content_to_send)
+    end
+    @crawl.debug_puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}"
+  end
+
+  private
+
+  # Enqueues content to the crawl_job queue
+  def enqueue_content(content_request, link)
+    new_request = content_request.clone
+    new_request[:url] = link
+    new_request[:parent] = content_request[:url]
+    CrawlWorker.perform_async(new_request)
+  end
+
+end
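
A sketch of how a crawl might be kicked off with this worker. The option keys are the ones the file itself reads (`:processing_queue`, `:crawl_finished_queue`, `:redis_options`), while `CrawlFinishedWorker` is a hypothetical name for the finished handler:

```ruby
CrawlWorker.perform_async({
  :url                  => "http://example.com/",
  :processing_queue     => "CrawlProcessWorker",  # resolved via constantize
  :crawl_finished_queue => "CrawlFinishedWorker", # hypothetical finished handler
  :redis_options        => {:host => "localhost", :port => 6379}
})
```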
@@ -0,0 +1,16 @@
+module Sidekiq
+  module Worker
+    module ClassMethods
+      def queue_size
+        Sidekiq.redis do |conn|
+          conn.llen("queue:#{get_sidekiq_options["queue"]}")
+        end
+      end
+      def queue_items(start=0, finish=-1)
+        Sidekiq.redis do |conn|
+          conn.lrange("queue:#{get_sidekiq_options["queue"]}", start, finish)
+        end
+      end
+    end
+  end
+end
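
Because this helper reopens `Sidekiq::Worker::ClassMethods`, every class that includes `Sidekiq::Worker` gains both methods, so either worker above can be inspected directly:

```ruby
CrawlWorker.queue_size         # => jobs waiting on "queue:crawl_worker"
CrawlWorker.queue_items(0, 4)  # => raw JSON payloads of the first five jobs
```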
data/lib/stats.rb CHANGED
@@ -38,13 +38,13 @@ class Stats
     @redis.smembers "crawled"
   end
 
-  def inbound_links_for(url, redis=@redis)
-    @redis.smembers("inbound_links_#{Digest::MD5.hexdigest(url)}")
+  def inbound_links_for(url)
+    uri = UriHelper.parse(url)
+    @redis.smembers("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}")
   end
 
   # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
   def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
-
     @lock.synchronize {
       @statistics = get_statistics
 
@@ -94,6 +94,7 @@ class Stats
     else
       mime_counts = {content[:mime_type] => 1}
     end
+
     @statistics[:mime_counts] = mime_counts.to_json
 
     # record mime categories stats
@@ -151,18 +152,18 @@ class Stats
   # Returns the statistics hash
   def get_statistics
 
-    @statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
-    if @statistics[:status_counts].nil?
-      @statistics[:status_counts]
+    statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
+    if statistics[:status_counts].nil?
+      statistics[:status_counts]
     else
-      @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
+      statistics[:status_counts] = JSON.parse(statistics[:status_counts])
    end
-    if @statistics[:mime_counts].nil?
-      @statistics[:mime_counts]
+    if statistics[:mime_counts].nil?
+      statistics[:mime_counts]
    else
-      @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
+      statistics[:mime_counts] = JSON.parse(statistics[:mime_counts])
    end
-    @statistics
+    statistics
   end
 
   # Sets the current status of the crawl
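
The practical effect of the `inbound_links_for` change is that the Redis key is derived from the parsed, normalized URI (via the new `UriHelper.parse` shown in the next file) rather than the raw string, so escaped and unescaped spellings of a URL resolve to the same set. A sketch of the key derivation, reusing a URL from the specs below:

```ruby
require 'digest'

raw     = "http://localhost:3532/boxgrid>withsillyname.html"   # not RFC-valid
escaped = "http://localhost:3532/boxgrid%3Ewithsillyname.html"

UriHelper.parse(raw).to_s == UriHelper.parse(escaped).to_s  # => true
key = "inbound_links_#{Digest::MD5.hexdigest(UriHelper.parse(raw).to_s)}"
# @redis.smembers(key) then returns every page linking to that URL
```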
data/lib/uri_helper.rb CHANGED
@@ -7,4 +7,12 @@ class UriHelper
     new_link
   end
 
+  def self.parse(url)
+    begin
+      URI.parse(url)
+    rescue URI::InvalidURIError
+      URI.parse(URI.escape(url))
+    end
+  end
+
 end
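
The new `UriHelper.parse` only falls back to escaping when the raw string fails to parse. A minimal illustration of the two paths (note that `URI.escape`, which the rescue branch relies on, was deprecated in Ruby 2.7 and removed in 3.0):

```ruby
require 'uri'

URI.parse("http://example.com/a b")  # raises URI::InvalidURIError
URI.parse(URI.escape("http://example.com/a b")).to_s
# => "http://example.com/a%20b", the rescue path inside UriHelper.parse
```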
@@ -1,7 +1,10 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 
 describe CobwebCrawlHelper do
-
+  include HttpStubs
+  before(:each) do
+    setup_stubs
+  end
   # this spec tests the crawl object
 
   describe "initialize" do
@@ -1,8 +1,11 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+require 'resolv'
 
 describe CobwebModule::Crawl, :local_only => true do
-
+  include HttpStubs
   before(:each) do
+    setup_stubs
+
     @local_redis = {:host => "localhost", :port => 6379}
     @remote_redis = {:host => "remote-redis", :port => 6379}
 
@@ -12,25 +15,38 @@ describe CobwebModule::Crawl, :local_only => true do
   describe "remote redis" do
     before(:each) do
       @local = CobwebModule::Crawl.new(:redis_options => @local_redis)
-      @remote = CobwebModule::Crawl.new(:redis_options => @remote_redis)
-
       @local.redis.del("test_redis")
-      @remote.redis.del("test_redis")
+
+      begin
+        Resolv.getaddress @remote_redis[:host]
+        @remote = CobwebModule::Crawl.new(:redis_options => @remote_redis)
+        @remote.redis.del("test_redis")
+      rescue
+        @remote = nil
+      end
 
     end
     it "should connect to the local redis" do
-      @local.redis.exists("test_redis").should be_false
-      @local.redis.set("test_redis", 1)
-      @local.redis.exists("test_redis").should be_true
+      if @remote
+        @local.redis.exists("test_redis").should be_false
+        @local.redis.set("test_redis", 1)
+        @local.redis.exists("test_redis").should be_true
 
-      @remote.redis.exists("test_redis").should be_false
+        @remote.redis.exists("test_redis").should be_false
+      else
+        puts "WARNING: can't connect to remote redis"
+      end
     end
     it "should connect to the remote redis" do
-      @remote.redis.exists("test_redis").should be_false
-      @remote.redis.set("test_redis", 1)
-      @remote.redis.exists("test_redis").should be_true
-
-      @local.redis.exists("test_redis").should be_false
+      if @remote
+        @remote.redis.exists("test_redis").should be_false
+        @remote.redis.set("test_redis", 1)
+        @remote.redis.exists("test_redis").should be_true
+
+        @local.redis.exists("test_redis").should be_false
+      else
+        puts "WARNING: can't connect to remote redis"
+      end
     end
   end
 end
@@ -23,25 +23,23 @@ describe CobwebCrawler do
   end
 
   describe "crawl" do
+
     it "should crawl a site" do
-
-      # temporary tests to run crawler - proper specs to follow.. honest
-
-      crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => true})
-
-      statistics = crawler.crawl(@base_url)
-
-      statistics.should_not be_nil
-      statistics.get_statistics.should be_an_instance_of Hash
+
+      @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false})
+      @statistics = @crawler.crawl(@base_url)
+
+      @statistics.should_not be_nil
+      @statistics.get_statistics.should be_an_instance_of Hash
+
+      @statistics.get_statistics[:mime_counts]["text/html"].should == 8
+      @statistics.get_statistics[:mime_counts]["text/css"].should == 18
+      @statistics.get_statistics[:mime_counts]["image/jpeg"].should == 25
 
     end
 
     it "should take a block" do
-
-      # temporary tests to run crawler - proper specs to follow.. honest
-
-      crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
-
+      crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :crawl_limit => 1})
      statistics = crawler.crawl(@base_url) do |content, statistics|
        content[:url].should_not be_nil
        statistics[:average_length].should_not be_nil
@@ -50,6 +48,27 @@ describe CobwebCrawler do
       statistics.should_not be_nil
       statistics.get_statistics.should be_an_instance_of Hash
 
+      statistics.get_statistics[:mime_counts]["text/html"].should == 1
+
+    end
+
+    context "storing inbound links" do
+
+      before(:each) do
+        @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :store_inbound_links => true})
+        @statistics = @crawler.crawl(@base_url)
+      end
+
+      it "should store inbound links" do
+
+        @statistics.inbound_links_for("http://localhost:3532/typography.html").should_not be_empty
+        @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+      end
+
+      it "should handle url encoding" do
+        @statistics.inbound_links_for("http://localhost:3532/boxgrid%3Ewithsillyname.html").sort.should == ["http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/gallery.html", "http://localhost:3532/more.html", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+      end
+
     end
   end
 
@@ -2,8 +2,9 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 require File.expand_path(File.dirname(__FILE__) + '/../../lib/cobweb_links')
 
 describe CobwebLinks do
-
+  include HttpStubs
   before(:each) do
+    setup_stubs
 
     @base_url = "http://www.baseurl.com/"
 
@@ -2,7 +2,10 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 
 describe Cobweb do
 
+  include HttpStubs
   before(:each) do
+    setup_stubs
+
     @base_url = "http://www.baseurl.com/"
     @cobweb = Cobweb.new :quiet => true, :cache => nil
 
@@ -2,6 +2,10 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 require File.expand_path(File.dirname(__FILE__) + '/../../lib/content_link_parser.rb')
 
 describe ContentLinkParser do
+  include HttpStubs
+  before(:each) do
+    setup_stubs
+  end
 
   describe "Sample Links Document" do
     before(:each) do
@@ -1,18 +1,51 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 
-describe Cobweb, :local_only => true, :disabled => true do
+RESQUE_WORKER_COUNT = 10
+
+describe CrawlJob, :local_only => true, :disabled => true do
 
   before(:all) do
     #store all existing resque process ids so we don't kill them afterwards
+
     @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+    if Resque.workers.count > 0 && @existing_processes.empty?
+      raise "Ghost workers present in resque, please clear before running specs"
+    elsif Resque.workers.count == 0 && !@existing_processes.empty?
+      raise "Ghost worker processes present (#{@existing_processes.join(',')})"
+    elsif Resque.workers.count > 0 && !@existing_processes.empty?
+      raise "Resque workers present, please end other resque processes before running this spec"
+    end
 
     # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
-    puts "Starting Workers... Please Wait..."
-    `mkdir log`
-    `mkdir tmp`
-    `mkdir tmp/pids`
-    io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=10 QUEUE=cobweb_crawl_job > log/output.log &")
-    puts "Workers Started."
+    `mkdir log` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../log'))
+    `mkdir tmp` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp'))
+    `mkdir tmp/pids` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp/pids'))
+    io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
+
+    counter = 0
+    print "Starting Resque Processes"
+    until counter > 10 || workers_processes_started?
+      print "."
+      counter += 1
+      sleep 0.5
+    end
+    puts ""
+
+
+    counter = 0
+    print "Waiting for Resque Workers"
+    until counter > 50 || workers_running?
+      print "."
+      counter += 1
+      sleep 0.5
+    end
+    puts ""
+
+    if Resque.workers.count == RESQUE_WORKER_COUNT
+      puts "Workers Running."
+    else
+      raise "Workers didn't appear, please check environment"
+    end
 
   end
 
@@ -237,7 +270,7 @@ describe Cobweb, :local_only => true, :disabled => true do
   after(:all) do
 
     @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
-    command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
+    command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
     IO.popen(command)
 
     clear_queues
@@ -249,13 +282,23 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
   @counter = 0
   start_time = Time.now
   while(running?(crawl_id) && Time.now < start_time + timeout) do
-    sleep 0.5
+    sleep 1
   end
   if Time.now > start_time + timeout
     raise "End of crawl not detected"
   end
 end
 
+def workers_processes_started?
+  @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+  @new_processes = @all_processes - @existing_processes
+  @new_processes.count == RESQUE_WORKER_COUNT
+end
+
+def workers_running?
+  Resque.workers.count > 0
+end
+
 def running?(crawl_id)
   status = @stat.get_status
   result = true