cobweb 0.0.58 → 0.0.59

data/README.textile CHANGED
@@ -1,5 +1,5 @@
 
- h1. Cobweb v0.0.58
+ h1. Cobweb v0.0.59
 !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
 
 h2. Intro
@@ -38,11 +38,6 @@ h3. Data Returned
 ** :related - url's from link tags
 ** :scripts - url's from script tags
 ** :styles - url's from within link tags with rel of stylesheet and from url() directives with stylesheets
- * :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to seperate crawls
- * :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
- * :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
- * :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
- * :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
 
 The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as i have time!)
 
@@ -70,6 +65,13 @@ Creates a new crawler object based on a base_url
 ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
 ** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [], although your first url's scheme, host and domain are added)
 ** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
+ ** :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to seperate crawls
+ ** :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
+ ** :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
+ ** :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
+ ** :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
+ ** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
+ ** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to ['*/*']
 
  bq. crawler = CobWeb.new(:follow_redirects => false)
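For reference, a minimal sketch of how the options added in this release might be combined when creating a crawler, based on the option list above (the url, crawl id and limit values are placeholders, not taken from the gem):

    # Hypothetical set-up exercising the options documented in 0.0.59.
    crawler = Cobweb.new(
      :crawl_id            => "example-crawl",            # placeholder identifier
      :internal_urls       => ["http://example.com/*"],   # * wildcards mark pages as internal
      :obey_robots         => true,                       # honour robots.txt (default: false)
      :crawl_limit         => 100,
      :crawl_limit_by_page => true,                       # only html pages count towards the limit
      :valid_mime_types    => ["text/html", "text/*"]     # wildcard-capable mime type filter
    )
    crawler.start("http://example.com/")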
data/lib/cobweb.rb CHANGED
@@ -180,6 +180,7 @@ class Cobweb
 content[:character_set] = charset
 end
 content[:length] = response.content_length
+ content[:text_content] = text_content?(content[:mime_type])
 if text_content?(content[:mime_type])
 if response["Content-Encoding"]=="gzip"
 content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
@@ -389,21 +390,22 @@ class Cobweb
 
 end
 
+ # escapes characters with meaning in regular expressions and adds wildcard expression
+ def self.escape_pattern_for_regex(pattern)
+ pattern = pattern.gsub(".", "\\.")
+ pattern = pattern.gsub("?", "\\?")
+ pattern = pattern.gsub("+", "\\+")
+ pattern = pattern.gsub("*", ".*?")
+ pattern
+ end
+
 private
 # checks if the mime_type is textual
 def text_content?(content_type)
 @options[:text_mime_types].each do |mime_type|
- return true if content_type.match(escape_pattern_for_regex(mime_type))
+ return true if content_type.match(Cobweb.escape_pattern_for_regex(mime_type))
 end
 false
 end
-
- # escapes characters with meaning in regular expressions and adds wildcard expression
- def escape_pattern_for_regex(pattern)
- pattern = pattern.gsub(".", "\\.")
- pattern = pattern.gsub("?", "\\?")
- pattern = pattern.gsub("*", ".*?")
- pattern
- end
 
 end
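Cobweb.escape_pattern_for_regex is now a class method, so CobwebLinks, CrawlJob and Robots can share it instead of each carrying a private copy. A short sketch of what it produces, derived from the gsub calls above (the patterns and url below are illustrative):

    # "*" becomes a lazy ".*?", while ".", "?" and "+" are escaped.
    Cobweb.escape_pattern_for_regex("text/*")   # => "text/.*?"
    Cobweb.escape_pattern_for_regex("*/*")      # => ".*?/.*?"

    # Callers anchor and compile the result themselves, as CobwebLinks now does:
    pattern = Regexp.new("^#{Cobweb.escape_pattern_for_regex("http://example.com/*")}")
    pattern.match("http://example.com/page")    # truthy, so the link counts as internal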
@@ -85,8 +85,6 @@ class CobwebCrawler
 
 @stats.update_statistics(content, crawl_counter, queue_counter)
 @stats.update_status("Completed #{url}.")
- puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @debug
-
 yield content, @stats.get_statistics if block_given?
 
 rescue => e
data/lib/cobweb_links.rb CHANGED
@@ -12,8 +12,8 @@ class CobwebLinks
 @options[:external_urls] = [] unless @options.has_key? :external_urls
 @options[:debug] = false unless @options.has_key? :debug
 
- @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{escape_pattern_for_regex(pattern)}")}
- @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{escape_pattern_for_regex(pattern)}")}
+ @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
+ @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
 
 end
 
@@ -52,15 +52,6 @@ class CobwebLinks
 @internal_patterns.select{|pattern| link.match(pattern)}.empty? || !@external_patterns.select{|pattern| link.match(pattern)}.empty?
 end
 
- private
- # escapes characters with meaning in regular expressions and adds wildcard expression
- def escape_pattern_for_regex(pattern)
- pattern = pattern.gsub(".", "\\.")
- pattern = pattern.gsub("?", "\\?")
- pattern = pattern.gsub("*", ".*?")
- ap pattern if @options[:debug]
- pattern
- end
 end
 
 # Exception raised for :internal_urls missing from CobwebLinks
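As a usage sketch, the compiled patterns back CobwebLinks#internal? (which CrawlJob calls on every parsed link). The option values below are illustrative, and the expected results follow the README's description that :external_urls overrides :internal_urls rather than a verified run:

    links = CobwebLinks.new(
      :internal_urls => ["http://example.com/*"],
      :external_urls => ["http://example.com/private/*"]
    )
    links.internal?("http://example.com/about")   # expected: true  (matches an internal pattern only)
    links.internal?("http://elsewhere.com/")      # expected: false (matches no internal pattern)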
@@ -3,7 +3,7 @@ class CobwebVersion
 
 # Returns a string of the current version
 def self.version
- "0.0.58"
+ "0.0.59"
 end
 
 end
data/lib/crawl_job.rb CHANGED
@@ -11,83 +11,105 @@ class CrawlJob
 
 # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
 def self.perform(content_request)
-
 # change all hash keys to symbols
 content_request = HashUtil.deep_symbolize_keys(content_request)
+ @content_request = content_request
 
 content_request[:redis_options] = {} unless content_request.has_key? :redis_options
+ content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
+ content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
+
 @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
 @stats = Stats.new(content_request)
 
 @debug = content_request[:debug]
 
- refresh_counters
+ decrement_queue_counter
 
 # check we haven't crawled this url before
 unless @redis.sismember "crawled", content_request[:url]
- @redis.srem "queued", content_request[:url]
- decrement_queue_counter
- @redis.sadd "crawled", content_request[:url]
- increment_crawl_counter
 
- # if there is no limit or we're still under it lets get the url
- if within_crawl_limits?(content_request[:crawl_limit])
+ content = Cobweb.new(content_request).get(content_request[:url], content_request)
 
- content = Cobweb.new(content_request).get(content_request[:url], content_request)
-
- ## update statistics
- @stats.update_status("Crawling #{content_request[:url]}...")
- @stats.update_statistics(content)
-
- # set the base url if this is the first page
- set_base_url @redis, content, content_request
+ if is_permitted_type(content)
+ # if there is no limit or we're still under it lets get the url
+ if within_crawl_limits?(content_request[:crawl_limit])
+ #update the queued and crawled lists if we are within the crawl limits.
+ @redis.srem "queued", content_request[:url]
+ @redis.sadd "crawled", content_request[:url]
+ # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
+ if content_request[:crawl_limit_by_page]
+ if content[:mime_type].match("text/html")
+ increment_crawl_counter
+ increment_crawl_started_counter
+ end
+ else
+ increment_crawl_counter
+ increment_crawl_started_counter
+ end
 
- @cobweb_links = CobwebLinks.new(content_request)
- if within_queue_limits?(content_request[:crawl_limit])
- internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
-
- # select the link if its internal
- internal_links.select!{|link| @cobweb_links.internal?(link)}
+ ## update statistics
+ @stats.update_status("Crawling #{content_request[:url]}...")
+ @stats.update_statistics(content)
+
+ # set the base url if this is the first page
+ set_base_url @redis, content, content_request
+
+ @cobweb_links = CobwebLinks.new(content_request)
+ if within_queue_limits?(content_request[:crawl_limit])
+ internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
+
+ # select the link if its internal
+ internal_links.select!{|link| @cobweb_links.internal?(link)}
 
- # reject the link if we've crawled it or queued it
- internal_links.reject!{|link| @redis.sismember("crawled", link)}
- internal_links.reject!{|link| @redis.sismember("queued", link)}
+ # reject the link if we've crawled it or queued it
+ internal_links.reject!{|link| @redis.sismember("crawled", link)}
+ internal_links.reject!{|link| @redis.sismember("queued", link)}
 
- internal_links.each do |link|
- enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
+ internal_links.each do |link|
+ enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
+ end
 end
- end
 
- # enqueue to processing queue
- send_to_processing_queue(content, content_request)
+ # enqueue to processing queue
+ send_to_processing_queue(content, content_request)
 
- #if the enqueue counter has been requested update that
- if content_request.has_key? :enqueue_counter_key
- enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
- current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
- enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
- end
-
- # if there's nothing left queued or the crawled limit has been reached
- if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
- if @redis.scard("queued") == 0
- finished(content_request)
+ #if the enqueue counter has been requested update that
+ if content_request.has_key? :enqueue_counter_key
+ enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
+ current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+ enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
 end
- elsif @queue_counter == 0 || @crawl_counter > content_request[:crawl_limit].to_i
- finished(content_request)
+
+ # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
+ # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
+ #increment_crawl_counter
+ puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
 end
+ else
+ puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
 end
+
 else
 @redis.srem "queued", content_request[:url]
- decrement_queue_counter
 puts "Already crawled #{content_request[:url]}" if content_request[:debug]
 end
-
+
+ # if there's nothing left queued or the crawled limit has been reached
+ if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
+ if @queue_counter == 0
+ finished(content_request)
+ end
+ elsif @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
+ finished(content_request)
+ end
+
 end
 
 # Sets the crawl status to 'Crawl Stopped' and enqueues the crawl finished job
 def self.finished(content_request)
 # finished
+ ap "FINISHED"
 @stats.end_crawl(content_request)
 Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
 end
@@ -95,7 +117,10 @@ class CrawlJob
 # Enqueues the content to the processing queue setup in options
 def self.send_to_processing_queue(content, content_request)
 content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
- if content_request[:use_encoding_safe_process_job]
+ if content_request[:direct_call_process_job]
+ clazz = const_get(content_request[:processing_queue])
+ clazz.perform(content_to_send)
+ elsif content_request[:use_encoding_safe_process_job]
 content_to_send[:body] = Base64.encode64(content[:body])
 content_to_send[:processing_queue] = content_request[:processing_queue]
 Resque.enqueue(EncodingSafeProcessJob, content_to_send)
@@ -103,19 +128,28 @@ class CrawlJob
 Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
 end
 puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}" if content_request[:debug]
- puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
 end
 
 private
 
+ # Helper method to determine if this content is to be processed or not
+ def self.is_permitted_type(content)
+ @content_request[:valid_mime_types].each do |mime_type|
+ return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
+ end
+ false
+ end
+
 # Returns true if the crawl count is within limits
 def self.within_crawl_limits?(crawl_limit)
+ refresh_counters
 crawl_limit.nil? or @crawl_counter <= crawl_limit.to_i
+ crawl_limit.nil? or @crawl_started_counter < crawl_limit.to_i
 end
 
 # Returns true if the queue count is calculated to be still within limits when complete
 def self.within_queue_limits?(crawl_limit)
- within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_counter) < crawl_limit.to_i)
+ @content_request[:crawl_limit_by_page] || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_started_counter) < crawl_limit.to_i)
 end
 
 # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
@@ -149,6 +183,10 @@ class CrawlJob
 @redis.incr "crawl-counter"
 refresh_counters
 end
+ def self.increment_crawl_started_counter
+ @redis.incr "crawl-started-counter"
+ refresh_counters
+ end
 # Decrements the queue counter and refreshes crawl counters
 def self.decrement_queue_counter
 @redis.decr "queue-counter"
@@ -157,12 +195,20 @@ class CrawlJob
 # Refreshes the crawl counters
 def self.refresh_counters
 @crawl_counter = @redis.get("crawl-counter").to_i
+ @crawl_started_counter = @redis.get("crawl-started-counter").to_i
 @queue_counter = @redis.get("queue-counter").to_i
 end
+
+ def self.print_counters
+ puts "@crawl_counter: #{@crawl_counter} @crawl_started_counter: #{@crawl_started_counter} @queue_counter: #{@queue_counter}"
+ end
+
 # Sets the crawl counters based on the crawled and queued queues
 def self.reset_counters
+ @redis.set("crawl-started-counter", @redis.smembers("crawled").count)
 @redis.set("crawl-counter", @redis.smembers("crawled").count)
 @redis.set("queue-counter", @redis.smembers("queued").count)
+ @crawl_started_counter = @redis.get("crawl-started-counter").to_i
 @crawl_counter = @redis.get("crawl-counter").to_i
 @queue_counter = @redis.get("queue-counter").to_i
 end
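A compact sketch of the new mime-type gate applied before any counting or enqueuing happens. The standalone permitted_type? helper below is illustrative; in the gem this logic lives in CrawlJob.is_permitted_type and reads :valid_mime_types from the content request, and with :crawl_limit_by_page set only text/html responses move the crawl counter:

    # Returns true when the content's mime type matches any allowed (wildcard-capable) type.
    def permitted_type?(content, valid_mime_types)
      valid_mime_types.any? do |mime_type|
        content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
      end
    end

    permitted_type?({:mime_type => "text/html"}, ["text/*"])   # => true
    permitted_type?({:mime_type => "image/png"}, ["text/*"])   # => false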
data/lib/robots.rb CHANGED
@@ -28,10 +28,10 @@ class Robots
 def allowed?(url)
 uri = URI.parse(url)
 @params[:allow].each do |pattern|
- return true if uri.path.match(escape_pattern_for_regex(pattern))
+ return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
 end
 @params[:disallow].each do |pattern|
- return false if uri.path.match(escape_pattern_for_regex(pattern))
+ return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
 end
 true
 end
@@ -45,13 +45,6 @@ class Robots
 end
 
 private
- # escapes characters with meaning in regular expressions and adds wildcard expression
- def escape_pattern_for_regex(pattern)
- pattern = pattern.gsub(".", "\\.")
- pattern = pattern.gsub("?", "\\?")
- pattern = pattern.gsub("*", ".*?")
- pattern
- end
 
 def parse_data(data)
 user_agents = {}
@@ -46,7 +46,32 @@ describe Cobweb, :local_only => true do
 Resque.size("cobweb_finished_job").should == 1
 end
 end
-
+ describe "with limited mime_types" do
+ before(:each) do
+ @request = {
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
+ :quiet => true,
+ :cache => nil,
+ :valid_mime_types => ["text/html"]
+ }
+ @cobweb = Cobweb.new @request
+ end
+
+ it "should only crawl html pages" do
+ crawl = @cobweb.start(@base_url)
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id]
+ Resque.size("cobweb_process_job").should == 8
+
+ mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
+ mime_types.count.should == 8
+ mime_types.map{|m| m.should == "text/html"}
+ mime_types.select{|m| m=="text/html"}.count.should == 8
+
+ end
+
+ end
 describe "with a crawl limit" do
 before(:each) do
 @request = {
@@ -54,12 +79,12 @@ describe Cobweb, :local_only => true do
 :quiet => true,
 :cache => nil
 }
- @cobweb = Cobweb.new @request
 end
 
 describe "limit to 1" do
 before(:each) do
 @request[:crawl_limit] = 1
+ @cobweb = Cobweb.new @request
 end
 
 it "should not crawl the entire site" do
@@ -82,11 +107,30 @@ describe Cobweb, :local_only => true do
 end
 
 end
+
+ describe "for pages only" do
+ before(:each) do
+ @request[:crawl_limit_by_page] = true
+ @request[:crawl_limit] = 5
+ @cobweb = Cobweb.new @request
+ end
+
+ it "should only use html pages towards the crawl limit" do
+ crawl = @cobweb.start(@base_url)
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id]
+ mime_types = Resque.peek("cobweb_process_job", 0, 200).map{|job| job["args"][0]["mime_type"]}
+ mime_types.count.should == 70
+ mime_types.select{|m| m=="text/html"}.count.should == 5
+ end
+ end
 
- describe "limit to 3" do
+ describe "limit to 10" do
 before(:each) do
- @request[:crawl_limit] = 3
+ @request[:crawl_limit] = 10
+ @cobweb = Cobweb.new @request
 end
+
 it "should not crawl the entire site" do
 crawl = @cobweb.start(@base_url)
 @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -99,21 +143,21 @@ describe Cobweb, :local_only => true do
 wait_for_crawl_finished crawl[:crawl_id]
 Resque.size("cobweb_finished_job").should == 1
 end
- it "should only crawl 3 pages" do
+ it "should only crawl 10 objects" do
 crawl = @cobweb.start(@base_url)
 @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
 wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should == 3
- end
-
+ Resque.size("cobweb_process_job").should == 10
+ end
 end
 
 describe "limit to 100" do
 before(:each) do
 @request[:crawl_limit] = 100
+ @cobweb = Cobweb.new @request
 end
-
- it "should crawl the entire site" do
+
+ it "should crawl the entire sample site" do
 crawl = @cobweb.start(@base_url)
 @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
 wait_for_crawl_finished crawl[:crawl_id]
@@ -138,19 +182,21 @@ describe Cobweb, :local_only => true do
 @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
 command = "kill #{(@all_processes - @existing_processes).join(" ")}"
 IO.popen(command)
+
+ clear_queues
 end
 
 end
 
 def wait_for_crawl_finished(crawl_id, timeout=20)
- counter = 0
- while(running?(crawl_id) && counter < timeout) do
- sleep 1
- counter+=1
+ counter = 0
+ start_time = Time.now
+ while(running?(crawl_id) && Time.now < start_time + timeout) do
+ sleep 0.5
 end
- if counter > timeout
+ if Time.now > start_time + timeout
 raise "End of crawl not detected"
- end
+ end
 end
 
 def running?(crawl_id)
@@ -161,9 +207,10 @@ def clear_queues
 Resque.queues.each do |queue|
 Resque.remove_queue(queue)
 end
+ puts "Cleared"
 
 Resque.size("cobweb_process_job").should == 0
- Resque.size("cobweb_finished_job").should == 0
+ Resque.size("cobweb_finished_job").should == 0
 end
 
 
@@ -71,11 +71,6 @@
 </ul>
 </li>
 <li><a href="typography.html">Typography</a></li>
- <li><a href="boxgrid.html">Boxes Grid</a></li>
- <li><a href="forms.html">Forms</a></li>
- <li><a href="gallery.html">Gallery</a></li>
- <li><a href="tables.html">Tables</a></li>
- <li><a href="more.html">More</a></li>
 </ul>
 <div class="search">
 <form action="" method="post">
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
- version: 0.0.58
+ version: 0.0.59
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
- date: 2012-06-30 00:00:00.000000000 Z
+ date: 2012-07-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: resque
- requirement: &70328776801460 !ruby/object:Gem::Requirement
+ requirement: &70204470213880 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776801460
+ version_requirements: *70204470213880
 - !ruby/object:Gem::Dependency
 name: redis
- requirement: &70328776799760 !ruby/object:Gem::Requirement
+ requirement: &70204470212220 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776799760
+ version_requirements: *70204470212220
 - !ruby/object:Gem::Dependency
 name: nokogiri
- requirement: &70328776798960 !ruby/object:Gem::Requirement
+ requirement: &70204470211500 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776798960
+ version_requirements: *70204470211500
 - !ruby/object:Gem::Dependency
 name: addressable
- requirement: &70328776797840 !ruby/object:Gem::Requirement
+ requirement: &70204470210300 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776797840
+ version_requirements: *70204470210300
 - !ruby/object:Gem::Dependency
 name: rspec
- requirement: &70328776796300 !ruby/object:Gem::Requirement
+ requirement: &70204470208860 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776796300
+ version_requirements: *70204470208860
 - !ruby/object:Gem::Dependency
 name: awesome_print
- requirement: &70328776811560 !ruby/object:Gem::Requirement
+ requirement: &70204470223880 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776811560
+ version_requirements: *70204470223880
 - !ruby/object:Gem::Dependency
 name: sinatra
- requirement: &70328776810940 !ruby/object:Gem::Requirement
+ requirement: &70204470223280 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776810940
+ version_requirements: *70204470223280
 - !ruby/object:Gem::Dependency
 name: thin
- requirement: &70328776810380 !ruby/object:Gem::Requirement
+ requirement: &70204470222720 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776810380
+ version_requirements: *70204470222720
 - !ruby/object:Gem::Dependency
 name: haml
- requirement: &70328776809840 !ruby/object:Gem::Requirement
+ requirement: &70204470222160 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776809840
+ version_requirements: *70204470222160
 - !ruby/object:Gem::Dependency
 name: namespaced_redis
- requirement: &70328776809160 !ruby/object:Gem::Requirement
+ requirement: &70204470221480 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
 version: 1.0.2
 type: :runtime
 prerelease: false
- version_requirements: *70328776809160
+ version_requirements: *70204470221480
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
 crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
 is also a standalone crawler that has a sophisticated statistics monitoring interface