cobweb 1.0.11 → 1.0.12

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
data/lib/crawl_job.rb CHANGED
@@ -84,6 +84,7 @@ class CrawlJob
   def self.send_to_processing_queue(content, content_request)
     content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
     if content_request[:direct_call_process_job]
+      #clazz = content_request[:processing_queue].to_s.constantize
       clazz = const_get(content_request[:processing_queue])
       clazz.perform(content_to_send)
     elsif content_request[:use_encoding_safe_process_job]
@@ -107,6 +108,5 @@ class CrawlJob
     #to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
     Resque.enqueue(CrawlJob, new_request)
   end
-
 
-end
+end
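
The commented-out line above documents the trade-off in resolving the processing class: ActiveSupport's `String#constantize` handles namespaced names such as `"Foo::Bar"`, while `const_get` only resolves constants visible from the receiver. A minimal sketch of the dispatch, using a hypothetical `ContentProcessJob` as a stand-in for whatever class `content_request[:processing_queue]` names:

```ruby
# Hypothetical stand-in for the configured processing queue class.
class ContentProcessJob
  def self.perform(content)
    puts "processing #{content[:url]}"
  end
end

clazz = Object.const_get("ContentProcessJob")  # constant lookup like crawl_job.rb's
clazz.perform(:url => "http://example.com/")   # direct, in-process call
```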
@@ -0,0 +1,31 @@
+
+require 'sidekiq'
+require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
+
+# If your client is single-threaded, we just need a single connection in our Redis connection pool
+#Sidekiq.configure_client do |config|
+#  config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
+#end
+
+# Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
+#Sidekiq.configure_server do |config|
+#  config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
+#end
+
+class CrawlProcessWorker
+
+  include Sidekiq::Worker
+
+  sidekiq_options queue: "crawl_process_worker"
+
+  def perform(content)
+    content = HashUtil.deep_symbolize_keys(content)
+    puts "Dummy Processing for #{content[:url]}"
+  end
+  def self.queue_size
+    Sidekiq.redis do |conn|
+      conn.llen("queue:#{get_sidekiq_options["queue"]}")
+    end
+  end
+
+end
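
As a usage sketch for the new worker above (the queue name comes from its `sidekiq_options`; the payload shape mirrors what `perform` expects), jobs would be enqueued through Sidekiq's standard client API:

```ruby
# Enqueue a processing job; Sidekiq serializes the argument to JSON,
# which is why perform re-symbolizes the keys on the way back out.
CrawlProcessWorker.perform_async("url" => "http://example.com/page.html")

CrawlProcessWorker.queue_size  # => pending jobs, via the llen call above
```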
@@ -0,0 +1,118 @@
+require 'sidekiq'
+require File.expand_path(File.dirname(__FILE__) + '/cobweb')
+require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
+
+# If your client is single-threaded, we just need a single connection in our Redis connection pool
+#Sidekiq.configure_client do |config|
+#  config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
+#end
+
+# Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
+#Sidekiq.configure_server do |config|
+#  config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
+#end
+
+class CrawlWorker
+  include Sidekiq::Worker
+  sidekiq_options queue: "crawl_worker"
+  sidekiq_options retry: false
+
+  def perform(content_request)
+    # setup the crawl class to manage the crawl of this object
+    @crawl = CobwebModule::Crawl.new(content_request)
+
+    # update the counters and then perform the get, returns false if we are outwith limits
+    if @crawl.retrieve
+
+      # if the crawled object is an object type we are interested
+      if @crawl.content.permitted_type?
+
+        # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
+        @crawl.process_links do |link|
+
+          @crawl.debug_puts "ENQUEUED LINK: #{link}"
+          enqueue_content(content_request, link)
+
+        end
+
+        if @crawl.to_be_processed?
+
+          @crawl.process do
+
+            # enqueue to processing queue
+            @crawl.debug_puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
+            send_to_processing_queue(@crawl.content.to_hash, content_request)
+
+            #if the enqueue counter has been requested update that
+            if content_request.has_key?(:enqueue_counter_key)
+              enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
+              current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+              enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
+            end
+
+          end
+        else
+          @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
+          @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
+          @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
+        end
+
+      end
+    end
+
+    @crawl.lock("finished") do
+      # let the crawl know we're finished with this object
+      @crawl.finished_processing
+
+      # test queue and crawl sizes to see if we have completed the crawl
+      @crawl.debug_puts "finished? #{@crawl.finished?}"
+      @crawl.debug_puts "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
+      if @crawl.finished? && @crawl.first_to_finish?
+        @crawl.debug_puts "Calling crawl_job finished"
+        finished(content_request)
+      end
+    end
+  end
+  def self.jobs
+    Sidekiq.redis do |conn|
+      conn.smembers(get_sidekiq_options[:queue]).count
+    end
+  end
+
+
+  # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
+  def finished(content_request)
+    additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @crawl.crawled_base_url}
+    additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
+    additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
+
+    @crawl.finished
+
+    @crawl.debug_puts "increment crawl_finished_enqueued_count"
+    @crawl.redis.incr("crawl_finished_enqueued_count")
+    content_request[:crawl_finished_queue].constantize.perform_async(@crawl.statistics.merge(additional_stats))
+  end
+
+  # Enqueues the content to the processing queue setup in options
+  def send_to_processing_queue(content, content_request)
+    content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
+    if content_request[:direct_call_process_job]
+      clazz = content_request[:processing_queue].constantize
+      clazz.perform(content_to_send)
+    else
+      content_request[:processing_queue].constantize.perform_async(content_to_send)
+    end
+    @crawl.debug_puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}"
+  end
+
+  private
+
+  # Enqueues content to the crawl_job queue
+  def enqueue_content(content_request, link)
+    new_request = content_request.clone
+    new_request[:url] = link
+    new_request[:parent] = content_request[:url]
+    CrawlWorker.perform_async(new_request)
+  end
+
+end
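
A sketch of how a crawl might be kicked off with this worker. The option keys are the ones the file itself reads (`:processing_queue`, `:crawl_finished_queue`, `:redis_options`), while `CrawlFinishedWorker` is a hypothetical name for the finished handler:

```ruby
CrawlWorker.perform_async({
  :url                  => "http://example.com/",
  :processing_queue     => "CrawlProcessWorker",  # resolved via constantize
  :crawl_finished_queue => "CrawlFinishedWorker", # hypothetical finished handler
  :redis_options        => {:host => "localhost", :port => 6379}
})
```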
@@ -0,0 +1,16 @@
+module Sidekiq
+  module Worker
+    module ClassMethods
+      def queue_size
+        Sidekiq.redis do |conn|
+          conn.llen("queue:#{get_sidekiq_options["queue"]}")
+        end
+      end
+      def queue_items(start=0, finish=-1)
+        Sidekiq.redis do |conn|
+          conn.lrange("queue:#{get_sidekiq_options["queue"]}", start, finish)
+        end
+      end
+    end
+  end
+end
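
Because this helper reopens `Sidekiq::Worker::ClassMethods`, every class that includes `Sidekiq::Worker` gains both methods, so either worker above can be inspected directly:

```ruby
CrawlWorker.queue_size         # => jobs waiting on "queue:crawl_worker"
CrawlWorker.queue_items(0, 4)  # => raw JSON payloads of the first five jobs
```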
data/lib/stats.rb CHANGED
@@ -38,13 +38,13 @@ class Stats
     @redis.smembers "crawled"
   end
 
-  def inbound_links_for(url, redis=@redis)
-    @redis.smembers("inbound_links_#{Digest::MD5.hexdigest(url)}")
+  def inbound_links_for(url)
+    uri = UriHelper.parse(url)
+    @redis.smembers("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}")
   end
 
   # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
   def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
-
     @lock.synchronize {
       @statistics = get_statistics
 
@@ -94,6 +94,7 @@ class Stats
     else
       mime_counts = {content[:mime_type] => 1}
     end
+
     @statistics[:mime_counts] = mime_counts.to_json
 
     # record mime categories stats
@@ -151,18 +152,18 @@ class Stats
   # Returns the statistics hash
   def get_statistics
 
-    @statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
-    if @statistics[:status_counts].nil?
-      @statistics[:status_counts]
+    statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
+    if statistics[:status_counts].nil?
+      statistics[:status_counts]
     else
-      @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
+      statistics[:status_counts] = JSON.parse(statistics[:status_counts])
    end
-    if @statistics[:mime_counts].nil?
-      @statistics[:mime_counts]
+    if statistics[:mime_counts].nil?
+      statistics[:mime_counts]
    else
-      @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
+      statistics[:mime_counts] = JSON.parse(statistics[:mime_counts])
    end
-    @statistics
+    statistics
   end
 
   # Sets the current status of the crawl
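
The practical effect of the `inbound_links_for` change is that the Redis key is derived from the parsed, normalized URI (via the new `UriHelper.parse` shown in the next file) rather than the raw string, so escaped and unescaped spellings of a URL resolve to the same set. A sketch of the key derivation, reusing a URL from the specs below:

```ruby
require 'digest'

raw     = "http://localhost:3532/boxgrid>withsillyname.html"   # not RFC-valid
escaped = "http://localhost:3532/boxgrid%3Ewithsillyname.html"

UriHelper.parse(raw).to_s == UriHelper.parse(escaped).to_s  # => true
key = "inbound_links_#{Digest::MD5.hexdigest(UriHelper.parse(raw).to_s)}"
# @redis.smembers(key) then returns every page linking to that URL
```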
data/lib/uri_helper.rb CHANGED
@@ -7,4 +7,12 @@ class UriHelper
     new_link
   end
 
+  def self.parse(url)
+    begin
+      URI.parse(url)
+    rescue URI::InvalidURIError
+      URI.parse(URI.escape(url))
+    end
+  end
+
 end
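
The new `UriHelper.parse` only falls back to escaping when the raw string fails to parse. A minimal illustration of the two paths (note that `URI.escape`, which the rescue branch relies on, was deprecated in Ruby 2.7 and removed in 3.0):

```ruby
require 'uri'

URI.parse("http://example.com/a b")  # raises URI::InvalidURIError
URI.parse(URI.escape("http://example.com/a b")).to_s
# => "http://example.com/a%20b", the rescue path inside UriHelper.parse
```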
@@ -1,7 +1,10 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 
 describe CobwebCrawlHelper do
-
+  include HttpStubs
+  before(:each) do
+    setup_stubs
+  end
   # this spec tests the crawl object
 
   describe "initialize" do
@@ -1,8 +1,11 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+require 'resolv'
 
 describe CobwebModule::Crawl, :local_only => true do
-
+  include HttpStubs
   before(:each) do
+    setup_stubs
+
     @local_redis = {:host => "localhost", :port => 6379}
     @remote_redis = {:host => "remote-redis", :port => 6379}
 
@@ -12,25 +15,38 @@ describe CobwebModule::Crawl, :local_only => true do
   describe "remote redis" do
     before(:each) do
       @local = CobwebModule::Crawl.new(:redis_options => @local_redis)
-      @remote = CobwebModule::Crawl.new(:redis_options => @remote_redis)
-
       @local.redis.del("test_redis")
-      @remote.redis.del("test_redis")
+
+      begin
+        Resolv.getaddress @remote_redis[:host]
+        @remote = CobwebModule::Crawl.new(:redis_options => @remote_redis)
+        @remote.redis.del("test_redis")
+      rescue
+        @remote = nil
+      end
 
     end
     it "should connect to the local redis" do
-      @local.redis.exists("test_redis").should be_false
-      @local.redis.set("test_redis", 1)
-      @local.redis.exists("test_redis").should be_true
+      if @remote
+        @local.redis.exists("test_redis").should be_false
+        @local.redis.set("test_redis", 1)
+        @local.redis.exists("test_redis").should be_true
 
-      @remote.redis.exists("test_redis").should be_false
+        @remote.redis.exists("test_redis").should be_false
+      else
+        puts "WARNING: can't connect to remote redis"
+      end
     end
     it "should connect to the remote redis" do
-      @remote.redis.exists("test_redis").should be_false
-      @remote.redis.set("test_redis", 1)
-      @remote.redis.exists("test_redis").should be_true
-
-      @local.redis.exists("test_redis").should be_false
+      if @remote
+        @remote.redis.exists("test_redis").should be_false
+        @remote.redis.set("test_redis", 1)
+        @remote.redis.exists("test_redis").should be_true
+
+        @local.redis.exists("test_redis").should be_false
+      else
+        puts "WARNING: can't connect to remote redis"
+      end
     end
   end
 end
@@ -23,25 +23,23 @@ describe CobwebCrawler do
   end
 
   describe "crawl" do
+
     it "should crawl a site" do
-
-      # temporary tests to run crawler - proper specs to follow.. honest
-
-      crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => true})
-
-      statistics = crawler.crawl(@base_url)
-
-      statistics.should_not be_nil
-      statistics.get_statistics.should be_an_instance_of Hash
+
+      @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false})
+      @statistics = @crawler.crawl(@base_url)
+
+      @statistics.should_not be_nil
+      @statistics.get_statistics.should be_an_instance_of Hash
+
+      @statistics.get_statistics[:mime_counts]["text/html"].should == 8
+      @statistics.get_statistics[:mime_counts]["text/css"].should == 18
+      @statistics.get_statistics[:mime_counts]["image/jpeg"].should == 25
 
     end
 
     it "should take a block" do
-
-      # temporary tests to run crawler - proper specs to follow.. honest
-
-      crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
-
+      crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :crawl_limit => 1})
      statistics = crawler.crawl(@base_url) do |content, statistics|
        content[:url].should_not be_nil
        statistics[:average_length].should_not be_nil
@@ -50,6 +48,27 @@ describe CobwebCrawler do
       statistics.should_not be_nil
       statistics.get_statistics.should be_an_instance_of Hash
 
+      statistics.get_statistics[:mime_counts]["text/html"].should == 1
+
+    end
+
+    context "storing inbound links" do
+
+      before(:each) do
+        @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :store_inbound_links => true})
+        @statistics = @crawler.crawl(@base_url)
+      end
+
+      it "should store inbound links" do
+
+        @statistics.inbound_links_for("http://localhost:3532/typography.html").should_not be_empty
+        @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+      end
+
+      it "should handle url encoding" do
+        @statistics.inbound_links_for("http://localhost:3532/boxgrid%3Ewithsillyname.html").sort.should == ["http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/gallery.html", "http://localhost:3532/more.html", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+      end
+
     end
   end
 
@@ -2,8 +2,9 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 require File.expand_path(File.dirname(__FILE__) + '/../../lib/cobweb_links')
 
 describe CobwebLinks do
-
+  include HttpStubs
   before(:each) do
+    setup_stubs
 
     @base_url = "http://www.baseurl.com/"
 
@@ -2,7 +2,10 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 
 describe Cobweb do
 
+  include HttpStubs
   before(:each) do
+    setup_stubs
+
     @base_url = "http://www.baseurl.com/"
     @cobweb = Cobweb.new :quiet => true, :cache => nil
 
@@ -2,6 +2,10 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 require File.expand_path(File.dirname(__FILE__) + '/../../lib/content_link_parser.rb')
 
 describe ContentLinkParser do
+  include HttpStubs
+  before(:each) do
+    setup_stubs
+  end
 
   describe "Sample Links Document" do
     before(:each) do
@@ -1,18 +1,51 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 
-describe Cobweb, :local_only => true, :disabled => true do
+RESQUE_WORKER_COUNT = 10
+
+describe CrawlJob, :local_only => true, :disabled => true do
 
   before(:all) do
     #store all existing resque process ids so we don't kill them afterwards
+
     @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+    if Resque.workers.count > 0 && @existing_processes.empty?
+      raise "Ghost workers present in resque, please clear before running specs"
+    elsif Resque.workers.count == 0 && !@existing_processes.empty?
+      raise "Ghost worker processes present (#{@existing_processes.join(',')})"
+    elsif Resque.workers.count > 0 && !@existing_processes.empty?
+      raise "Resque workers present, please end other resque processes before running this spec"
+    end
 
     # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
-    puts "Starting Workers... Please Wait..."
-    `mkdir log`
-    `mkdir tmp`
-    `mkdir tmp/pids`
-    io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=10 QUEUE=cobweb_crawl_job > log/output.log &")
-    puts "Workers Started."
+    `mkdir log` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../log'))
+    `mkdir tmp` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp'))
+    `mkdir tmp/pids` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp/pids'))
+    io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
+
+    counter = 0
+    print "Starting Resque Processes"
+    until counter > 10 || workers_processes_started?
+      print "."
+      counter += 1
+      sleep 0.5
+    end
+    puts ""
+
+
+    counter = 0
+    print "Waiting for Resque Workers"
+    until counter > 50 || workers_running?
+      print "."
+      counter += 1
+      sleep 0.5
+    end
+    puts ""
+
+    if Resque.workers.count == RESQUE_WORKER_COUNT
+      puts "Workers Running."
+    else
+      raise "Workers didn't appear, please check environment"
+    end
 
   end
 
@@ -237,7 +270,7 @@ describe Cobweb, :local_only => true, :disabled => true do
   after(:all) do
 
     @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
-    command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
+    command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
     IO.popen(command)
 
     clear_queues
@@ -249,13 +282,23 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
   @counter = 0
   start_time = Time.now
   while(running?(crawl_id) && Time.now < start_time + timeout) do
-    sleep 0.5
+    sleep 1
   end
   if Time.now > start_time + timeout
     raise "End of crawl not detected"
   end
 end
 
+def workers_processes_started?
+  @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+  @new_processes = @all_processes - @existing_processes
+  @new_processes.count == RESQUE_WORKER_COUNT
+end
+
+def workers_running?
+  Resque.workers.count > 0
+end
+
 def running?(crawl_id)
   status = @stat.get_status
   result = true