cobweb 0.0.58 → 0.0.59

data/README.textile CHANGED
@@ -1,5 +1,5 @@
 
- h1. Cobweb v0.0.58
+ h1. Cobweb v0.0.59
 
 !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
 
 h2. Intro
@@ -38,11 +38,6 @@ h3. Data Returned
 ** :related - url's from link tags
 ** :scripts - url's from script tags
 ** :styles - url's from within link tags with rel of stylesheet and from url() directives with stylesheets
- * :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to seperate crawls
- * :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
- * :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
- * :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
- * :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
 
 The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as i have time!)
 
@@ -70,6 +65,13 @@ Creates a new crawler object based on a base_url
 ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
 ** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [], although your first url's scheme, host and domain are added)
 ** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
+ ** :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to seperate crawls
+ ** :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
+ ** :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
+ ** :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
+ ** :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
+ ** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
+ ** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to ['*/*']
 
 bq. crawler = CobWeb.new(:follow_redirects => false)
 
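Taken together, the options added to the README above give two new ways to scope a crawl in 0.0.59: :crawl_limit_by_page restricts the crawl counter to HTML pages, and :valid_mime_types filters what gets fetched for processing at all. A minimal sketch of how they might be combined (the URL, crawl id and the 10-object limit are illustrative, not taken from this diff):

    require 'cobweb'

    # Hypothetical crawl: only text/html objects count towards :crawl_limit,
    # and only html/css responses are processed at all.
    crawler = Cobweb.new(
      :crawl_id            => "example-crawl",
      :internal_urls       => ["http://test.com/*"],
      :obey_robots         => true,
      :user_agent          => "cobweb",
      :crawl_limit         => 10,
      :crawl_limit_by_page => true,
      :valid_mime_types    => ["text/html", "text/css"]
    )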
data/lib/cobweb.rb CHANGED
@@ -180,6 +180,7 @@ class Cobweb
 content[:character_set] = charset
 end
 content[:length] = response.content_length
+ content[:text_content] = text_content?(content[:mime_type])
 if text_content?(content[:mime_type])
 if response["Content-Encoding"]=="gzip"
 content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
@@ -389,21 +390,22 @@ class Cobweb
 
 end
 
+ # escapes characters with meaning in regular expressions and adds wildcard expression
+ def self.escape_pattern_for_regex(pattern)
+ pattern = pattern.gsub(".", "\\.")
+ pattern = pattern.gsub("?", "\\?")
+ pattern = pattern.gsub("+", "\\+")
+ pattern = pattern.gsub("*", ".*?")
+ pattern
+ end
+
 private
 # checks if the mime_type is textual
 def text_content?(content_type)
 @options[:text_mime_types].each do |mime_type|
- return true if content_type.match(escape_pattern_for_regex(mime_type))
+ return true if content_type.match(Cobweb.escape_pattern_for_regex(mime_type))
 end
 false
 end
-
- # escapes characters with meaning in regular expressions and adds wildcard expression
- def escape_pattern_for_regex(pattern)
- pattern = pattern.gsub(".", "\\.")
- pattern = pattern.gsub("?", "\\?")
- pattern = pattern.gsub("*", ".*?")
- pattern
- end
 
 end
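The escaping logic that was previously duplicated as a private instance method here, in CobwebLinks and in Robots is promoted to a single public class method, which now also escapes +. A quick sketch of what the gsub chain produces (return values derived by hand from the code above, not taken from the gem's test suite):

    Cobweb.escape_pattern_for_regex("text/*")            # => "text/.*?"
    Cobweb.escape_pattern_for_regex("http://test.com/*") # => "http://test\\.com/.*?"

The gsub order matters: literal . and ? are escaped before * is expanded to .*?, so the dot and question mark introduced by the wildcard expansion survive as regex metacharacters.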
@@ -85,8 +85,6 @@ class CobwebCrawler
 
 @stats.update_statistics(content, crawl_counter, queue_counter)
 @stats.update_status("Completed #{url}.")
- puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @debug
-
 yield content, @stats.get_statistics if block_given?
 
 rescue => e
data/lib/cobweb_links.rb CHANGED
@@ -12,8 +12,8 @@ class CobwebLinks
 @options[:external_urls] = [] unless @options.has_key? :external_urls
 @options[:debug] = false unless @options.has_key? :debug
 
- @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{escape_pattern_for_regex(pattern)}")}
- @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{escape_pattern_for_regex(pattern)}")}
+ @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
+ @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
 
 end
 
@@ -52,15 +52,6 @@ class CobwebLinks
 @internal_patterns.select{|pattern| link.match(pattern)}.empty? || !@external_patterns.select{|pattern| link.match(pattern)}.empty?
 end
 
- private
- # escapes characters with meaning in regular expressions and adds wildcard expression
- def escape_pattern_for_regex(pattern)
- pattern = pattern.gsub(".", "\\.")
- pattern = pattern.gsub("?", "\\?")
- pattern = pattern.gsub("*", ".*?")
- ap pattern if @options[:debug]
- pattern
- end
 end
 
 # Exception raised for :internal_urls missing from CobwebLinks
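With its duplicate private escaper removed, CobwebLinks now relies on the shared Cobweb.escape_pattern_for_regex when compiling @internal_patterns and @external_patterns. A hedged usage sketch of the internal? predicate that crawl_job.rb calls when filtering parsed links (the URLs are placeholders):

    links = CobwebLinks.new(
      :internal_urls => ["http://test.com/*", "http://blog.test.com/*"],
      :external_urls => ["http://test.com/external/*"]
    )
    links.internal?("http://test.com/pages/about.html") # => true
    links.internal?("http://elsewhere.com/index.html")  # => false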
@@ -3,7 +3,7 @@ class CobwebVersion
 
 # Returns a string of the current version
 def self.version
- "0.0.58"
+ "0.0.59"
 end
 
 end
data/lib/crawl_job.rb CHANGED
@@ -11,83 +11,105 @@ class CrawlJob
 # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
 def self.perform(content_request)
-
 # change all hash keys to symbols
 content_request = HashUtil.deep_symbolize_keys(content_request)
+ @content_request = content_request
 
 content_request[:redis_options] = {} unless content_request.has_key? :redis_options
+ content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
+ content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
+
 @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
 @stats = Stats.new(content_request)
 
 @debug = content_request[:debug]
 
- refresh_counters
+ decrement_queue_counter
 
 # check we haven't crawled this url before
 unless @redis.sismember "crawled", content_request[:url]
- @redis.srem "queued", content_request[:url]
- decrement_queue_counter
- @redis.sadd "crawled", content_request[:url]
- increment_crawl_counter
 
- # if there is no limit or we're still under it lets get the url
- if within_crawl_limits?(content_request[:crawl_limit])
+ content = Cobweb.new(content_request).get(content_request[:url], content_request)
 
- content = Cobweb.new(content_request).get(content_request[:url], content_request)
-
- ## update statistics
- @stats.update_status("Crawling #{content_request[:url]}...")
- @stats.update_statistics(content)
-
- # set the base url if this is the first page
- set_base_url @redis, content, content_request
+ if is_permitted_type(content)
+ # if there is no limit or we're still under it lets get the url
+ if within_crawl_limits?(content_request[:crawl_limit])
+ #update the queued and crawled lists if we are within the crawl limits.
+ @redis.srem "queued", content_request[:url]
+ @redis.sadd "crawled", content_request[:url]
+ # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
+ if content_request[:crawl_limit_by_page]
+ if content[:mime_type].match("text/html")
+ increment_crawl_counter
+ increment_crawl_started_counter
+ end
+ else
+ increment_crawl_counter
+ increment_crawl_started_counter
+ end
 
- @cobweb_links = CobwebLinks.new(content_request)
- if within_queue_limits?(content_request[:crawl_limit])
- internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
-
- # select the link if its internal
- internal_links.select!{|link| @cobweb_links.internal?(link)}
+ ## update statistics
+ @stats.update_status("Crawling #{content_request[:url]}...")
+ @stats.update_statistics(content)
+
+ # set the base url if this is the first page
+ set_base_url @redis, content, content_request
+
+ @cobweb_links = CobwebLinks.new(content_request)
+ if within_queue_limits?(content_request[:crawl_limit])
+ internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
+
+ # select the link if its internal
+ internal_links.select!{|link| @cobweb_links.internal?(link)}
 
- # reject the link if we've crawled it or queued it
- internal_links.reject!{|link| @redis.sismember("crawled", link)}
- internal_links.reject!{|link| @redis.sismember("queued", link)}
+ # reject the link if we've crawled it or queued it
+ internal_links.reject!{|link| @redis.sismember("crawled", link)}
+ internal_links.reject!{|link| @redis.sismember("queued", link)}
 
- internal_links.each do |link|
- enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
+ internal_links.each do |link|
+ enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
+ end
 end
- end
 
- # enqueue to processing queue
- send_to_processing_queue(content, content_request)
+ # enqueue to processing queue
+ send_to_processing_queue(content, content_request)
 
- #if the enqueue counter has been requested update that
- if content_request.has_key? :enqueue_counter_key
- enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
- current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
- enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
- end
-
- # if there's nothing left queued or the crawled limit has been reached
- if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
- if @redis.scard("queued") == 0
- finished(content_request)
+ #if the enqueue counter has been requested update that
+ if content_request.has_key? :enqueue_counter_key
+ enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
+ current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+ enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
 end
- elsif @queue_counter == 0 || @crawl_counter > content_request[:crawl_limit].to_i
- finished(content_request)
+
+ # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
+ # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
+ #increment_crawl_counter
+ puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
 end
+ else
+ puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
 end
+
 else
 @redis.srem "queued", content_request[:url]
- decrement_queue_counter
 puts "Already crawled #{content_request[:url]}" if content_request[:debug]
 end
-
+
+ # if there's nothing left queued or the crawled limit has been reached
+ if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
+ if @queue_counter == 0
+ finished(content_request)
+ end
+ elsif @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
+ finished(content_request)
+ end
+
 end
 
 # Sets the crawl status to 'Crawl Stopped' and enqueues the crawl finished job
 def self.finished(content_request)
 # finished
+ ap "FINISHED"
 @stats.end_crawl(content_request)
 Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
 end
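The heart of the reworked perform flow is the counting branch: fetch first, check the mime type, then (still inside the limit check) decide whether the object moves the counters. A condensed restatement of just that decision, with a hypothetical helper name (count_crawled_object is not in the gem):

    # When :crawl_limit_by_page is set, only text/html responses move the
    # crawl counters; other objects are still fetched and processed, they
    # just never count towards :crawl_limit.
    def self.count_crawled_object(content)
      if !@content_request[:crawl_limit_by_page] || content[:mime_type].match("text/html")
        increment_crawl_counter
        increment_crawl_started_counter
      end
    end

Note also that the finished check now runs on every job, outside the "have we crawled this url" guard, so a crawl can still complete when the last dequeued job hits an already-crawled url.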
@@ -95,7 +117,10 @@ class CrawlJob
 # Enqueues the content to the processing queue setup in options
 def self.send_to_processing_queue(content, content_request)
 content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
- if content_request[:use_encoding_safe_process_job]
+ if content_request[:direct_call_process_job]
+ clazz = const_get(content_request[:processing_queue])
+ clazz.perform(content_to_send)
+ elsif content_request[:use_encoding_safe_process_job]
 content_to_send[:body] = Base64.encode64(content[:body])
 content_to_send[:processing_queue] = content_request[:processing_queue]
 Resque.enqueue(EncodingSafeProcessJob, content_to_send)
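The new :direct_call_process_job branch bypasses Resque entirely and invokes the processing class synchronously, which is useful when the caller wants results in-process rather than on a worker. A hedged sketch (MyProcessJob is a placeholder; :processing_queue is the existing option that const_get resolves above):

    class MyProcessJob
      # same interface a Resque worker would call
      def self.perform(content)
        puts "#{content[:url]} => #{content[:mime_type]}"
      end
    end

    crawler = Cobweb.new(
      :processing_queue        => "MyProcessJob",
      :direct_call_process_job => true
    )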
@@ -103,19 +128,28 @@ class CrawlJob
 Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
 end
 puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}" if content_request[:debug]
- puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
 end
 
 private
 
+ # Helper method to determine if this content is to be processed or not
+ def self.is_permitted_type(content)
+ @content_request[:valid_mime_types].each do |mime_type|
+ return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
+ end
+ false
+ end
+
 # Returns true if the crawl count is within limits
 def self.within_crawl_limits?(crawl_limit)
+ refresh_counters
 crawl_limit.nil? or @crawl_counter <= crawl_limit.to_i
+ crawl_limit.nil? or @crawl_started_counter < crawl_limit.to_i
 end
 
 # Returns true if the queue count is calculated to be still within limits when complete
 def self.within_queue_limits?(crawl_limit)
- within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_counter) < crawl_limit.to_i)
+ @content_request[:crawl_limit_by_page] || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_started_counter) < crawl_limit.to_i)
 end
 
 # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
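One thing worth flagging in within_crawl_limits? above: a Ruby method returns only its last expression, so the first crawl_limit.nil? or @crawl_counter <= crawl_limit.to_i line is evaluated and then discarded. The effective behaviour of the method as released is just:

    # Effective body of within_crawl_limits?: the dead first comparison
    # aside, the check compares the started counter against the limit.
    def self.within_crawl_limits?(crawl_limit)
      refresh_counters
      crawl_limit.nil? || @crawl_started_counter < crawl_limit.to_i
    end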
@@ -149,6 +183,10 @@ class CrawlJob
 @redis.incr "crawl-counter"
 refresh_counters
 end
+ def self.increment_crawl_started_counter
+ @redis.incr "crawl-started-counter"
+ refresh_counters
+ end
 # Decrements the queue counter and refreshes crawl counters
 def self.decrement_queue_counter
 @redis.decr "queue-counter"
@@ -157,12 +195,20 @@
 # Refreshes the crawl counters
 def self.refresh_counters
 @crawl_counter = @redis.get("crawl-counter").to_i
+ @crawl_started_counter = @redis.get("crawl-started-counter").to_i
 @queue_counter = @redis.get("queue-counter").to_i
 end
+
+ def self.print_counters
+ puts "@crawl_counter: #{@crawl_counter} @crawl_started_counter: #{@crawl_started_counter} @queue_counter: #{@queue_counter}"
+ end
+
 # Sets the crawl counters based on the crawled and queued queues
 def self.reset_counters
+ @redis.set("crawl-started-counter", @redis.smembers("crawled").count)
 @redis.set("crawl-counter", @redis.smembers("crawled").count)
 @redis.set("queue-counter", @redis.smembers("queued").count)
+ @crawl_started_counter = @redis.get("crawl-started-counter").to_i
 @crawl_counter = @redis.get("crawl-counter").to_i
 @queue_counter = @redis.get("queue-counter").to_i
 end
data/lib/robots.rb CHANGED
@@ -28,10 +28,10 @@ class Robots
 def allowed?(url)
 uri = URI.parse(url)
 @params[:allow].each do |pattern|
- return true if uri.path.match(escape_pattern_for_regex(pattern))
+ return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
 end
 @params[:disallow].each do |pattern|
- return false if uri.path.match(escape_pattern_for_regex(pattern))
+ return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
 end
 true
 end
@@ -45,13 +45,6 @@ class Robots
 end
 
 private
- # escapes characters with meaning in regular expressions and adds wildcard expression
- def escape_pattern_for_regex(pattern)
- pattern = pattern.gsub(".", "\\.")
- pattern = pattern.gsub("?", "\\?")
- pattern = pattern.gsub("*", ".*?")
- pattern
- end
 
 def parse_data(data)
 user_agents = {}
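Robots#allowed? also switches to the shared escaper. As the method above shows, :allow patterns are checked before :disallow patterns, so an allow rule wins over a broader disallow. A sketch of that precedence, assuming robots is an instance built from the robots.txt shown in the comment (the constructor itself is not part of this diff):

    # User-agent: cobweb
    # Allow: /public/*
    # Disallow: /
    robots.allowed?("http://test.com/public/index.html")  # => true
    robots.allowed?("http://test.com/private/index.html") # => false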
@@ -46,7 +46,32 @@ describe Cobweb, :local_only => true do
 Resque.size("cobweb_finished_job").should == 1
 end
 end
-
+ describe "with limited mime_types" do
+ before(:each) do
+ @request = {
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
+ :quiet => true,
+ :cache => nil,
+ :valid_mime_types => ["text/html"]
+ }
+ @cobweb = Cobweb.new @request
+ end
+
+ it "should only crawl html pages" do
+ crawl = @cobweb.start(@base_url)
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id]
+ Resque.size("cobweb_process_job").should == 8
+
+ mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
+ mime_types.count.should == 8
+ mime_types.map{|m| m.should == "text/html"}
+ mime_types.select{|m| m=="text/html"}.count.should == 8
+
+
+ end
+
+ end
 describe "with a crawl limit" do
 before(:each) do
 @request = {
@@ -54,12 +79,12 @@ describe Cobweb, :local_only => true do
 :quiet => true,
 :cache => nil
 }
- @cobweb = Cobweb.new @request
 end
 
 describe "limit to 1" do
 before(:each) do
 @request[:crawl_limit] = 1
+ @cobweb = Cobweb.new @request
 end
 
 it "should not crawl the entire site" do
@@ -82,11 +107,30 @@ describe Cobweb, :local_only => true do
 end
 
 end
+
+ describe "for pages only" do
+ before(:each) do
+ @request[:crawl_limit_by_page] = true
+ @request[:crawl_limit] = 5
+ @cobweb = Cobweb.new @request
+ end
+
+ it "should only use html pages towards the crawl limit" do
+ crawl = @cobweb.start(@base_url)
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id]
+ mime_types = Resque.peek("cobweb_process_job", 0, 200).map{|job| job["args"][0]["mime_type"]}
+ mime_types.count.should == 70
+ mime_types.select{|m| m=="text/html"}.count.should == 5
+ end
+ end
 
- describe "limit to 3" do
+ describe "limit to 10" do
 before(:each) do
- @request[:crawl_limit] = 3
+ @request[:crawl_limit] = 10
+ @cobweb = Cobweb.new @request
 end
+
 it "should not crawl the entire site" do
 crawl = @cobweb.start(@base_url)
 @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -99,21 +143,21 @@ describe Cobweb, :local_only => true do
 wait_for_crawl_finished crawl[:crawl_id]
 Resque.size("cobweb_finished_job").should == 1
 end
- it "should only crawl 3 pages" do
+ it "should only crawl 10 objects" do
 crawl = @cobweb.start(@base_url)
 @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
 wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should == 3
- end
-
+ Resque.size("cobweb_process_job").should == 10
+ end
 end
 
 describe "limit to 100" do
 before(:each) do
 @request[:crawl_limit] = 100
+ @cobweb = Cobweb.new @request
 end
-
- it "should crawl the entire site" do
+
+ it "should crawl the entire sample site" do
 crawl = @cobweb.start(@base_url)
 @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
 wait_for_crawl_finished crawl[:crawl_id]
@@ -138,19 +182,21 @@ describe Cobweb, :local_only => true do
 @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
 command = "kill #{(@all_processes - @existing_processes).join(" ")}"
 IO.popen(command)
+
+ clear_queues
 end
 
 end
 
 def wait_for_crawl_finished(crawl_id, timeout=20)
- counter = 0
- while(running?(crawl_id) && counter < timeout) do
- sleep 1
- counter+=1
+ counter = 0
+ start_time = Time.now
+ while(running?(crawl_id) && Time.now < start_time + timeout) do
+ sleep 0.5
 end
- if counter > timeout
+ if Time.now > start_time + timeout
 raise "End of crawl not detected"
- end
+ end
 end
 
 def running?(crawl_id)
@@ -161,9 +207,10 @@ def clear_queues
 Resque.queues.each do |queue|
 Resque.remove_queue(queue)
 end
+ puts "Cleared"
 
 Resque.size("cobweb_process_job").should == 0
- Resque.size("cobweb_finished_job").should == 0
+ Resque.size("cobweb_finished_job").should == 0
 end
 
 
@@ -71,11 +71,6 @@
 </ul>
 </li>
 <li><a href="typography.html">Typography</a></li>
- <li><a href="boxgrid.html">Boxes Grid</a></li>
- <li><a href="forms.html">Forms</a></li>
- <li><a href="gallery.html">Gallery</a></li>
- <li><a href="tables.html">Tables</a></li>
- <li><a href="more.html">More</a></li>
 </ul>
 <div class="search">
 <form action="" method="post">
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
- version: 0.0.58
+ version: 0.0.59
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
- date: 2012-06-30 00:00:00.000000000 Z
+ date: 2012-07-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: resque
- requirement: &70328776801460 !ruby/object:Gem::Requirement
+ requirement: &70204470213880 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776801460
+ version_requirements: *70204470213880
 - !ruby/object:Gem::Dependency
 name: redis
- requirement: &70328776799760 !ruby/object:Gem::Requirement
+ requirement: &70204470212220 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776799760
+ version_requirements: *70204470212220
 - !ruby/object:Gem::Dependency
 name: nokogiri
- requirement: &70328776798960 !ruby/object:Gem::Requirement
+ requirement: &70204470211500 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776798960
+ version_requirements: *70204470211500
 - !ruby/object:Gem::Dependency
 name: addressable
- requirement: &70328776797840 !ruby/object:Gem::Requirement
+ requirement: &70204470210300 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776797840
+ version_requirements: *70204470210300
 - !ruby/object:Gem::Dependency
 name: rspec
- requirement: &70328776796300 !ruby/object:Gem::Requirement
+ requirement: &70204470208860 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776796300
+ version_requirements: *70204470208860
 - !ruby/object:Gem::Dependency
 name: awesome_print
- requirement: &70328776811560 !ruby/object:Gem::Requirement
+ requirement: &70204470223880 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776811560
+ version_requirements: *70204470223880
 - !ruby/object:Gem::Dependency
 name: sinatra
- requirement: &70328776810940 !ruby/object:Gem::Requirement
+ requirement: &70204470223280 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776810940
+ version_requirements: *70204470223280
 - !ruby/object:Gem::Dependency
 name: thin
- requirement: &70328776810380 !ruby/object:Gem::Requirement
+ requirement: &70204470222720 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776810380
+ version_requirements: *70204470222720
 - !ruby/object:Gem::Dependency
 name: haml
- requirement: &70328776809840 !ruby/object:Gem::Requirement
+ requirement: &70204470222160 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
 version: '0'
 type: :runtime
 prerelease: false
- version_requirements: *70328776809840
+ version_requirements: *70204470222160
 - !ruby/object:Gem::Dependency
 name: namespaced_redis
- requirement: &70328776809160 !ruby/object:Gem::Requirement
+ requirement: &70204470221480 !ruby/object:Gem::Requirement
 none: false
 requirements:
 - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
 version: 1.0.2
 type: :runtime
 prerelease: false
- version_requirements: *70328776809160
+ version_requirements: *70204470221480
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
 crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
 is also a standalone crawler that has a sophisticated statistics monitoring interface