cobweb 0.0.73 → 0.0.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
 
- h1. Cobweb v0.0.73
+ h1. Cobweb v0.0.74
 
  "@cobweb_gem":https://twitter.com/cobweb_gem
 
data/lib/cobweb.rb CHANGED
@@ -4,7 +4,6 @@ require 'resque'
  require "addressable/uri"
  require 'digest/sha1'
  require 'base64'
- require 'namespaced_redis'
 
  Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
    require file
@@ -46,6 +45,7 @@ class Cobweb
    default_text_mime_types_to ["text/*", "application/xhtml+xml"]
    default_obey_robots_to false
    default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
+   default_valid_mime_types_to ["*/*"]
 
  end
 
@@ -65,7 +65,7 @@ class Cobweb
    end
 
    request.merge!(@options)
-   @redis = NamespacedRedis.new(request[:redis_options], "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
+   @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => Redis.new(request[:redis_options]))
    @redis.set("original_base_url", base_url)
    @redis.hset "statistics", "queued_at", DateTime.now
    @redis.set("crawl-counter", 0)
@@ -110,9 +110,9 @@ class Cobweb
 
    # connect to redis
    if options.has_key? :crawl_id
-     redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
+     redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
    else
-     redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
+     redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
    end
 
    content = {:base_url => url}
@@ -269,9 +269,9 @@ class Cobweb
 
    # connect to redis
    if options.has_key? :crawl_id
-     redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
+     redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
    else
-     redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
+     redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
    end
 
    content = {:base_url => url}
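The change that repeats throughout this release, here and again in cobweb_crawler.rb, crawl_job.rb, server.rb and stats.rb below, is the move from the namespaced_redis gem to redis-namespace: the namespace string now comes first and the connection is supplied through the :redis option. (The gemspec in the metadata section still lists namespaced_redis as a runtime dependency even though nothing requires it any more.) A minimal sketch of the new call pattern, using placeholder redis options and crawl id rather than values taken from the gem:

    require 'redis'
    require 'redis-namespace'

    redis_options = { :host => "localhost", :port => 6379 }   # illustrative defaults
    # 0.0.73: NamespacedRedis.new(redis_options, "cobweb-0.0.73-<crawl_id>")
    # 0.0.74: namespace first, connection passed via :redis
    namespaced = Redis::Namespace.new("cobweb-0.0.74-example-crawl-id",
                                      :redis => Redis.new(redis_options))
    namespaced.set("original_base_url", "http://example.com/")   # stored under the "cobweb-..." prefix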
@@ -15,7 +15,7 @@ class CobwebCrawlHelper
    @stats = Stats.new(data)
  end
 
- def destroy(options)
+ def destroy(options={})
 
    options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)
    options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
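With the default argument on destroy, the cancellation specs further down can call it without arguments. For example (the construction hash here is illustrative, not taken from the gem):

    helper = CobwebCrawlHelper.new(:crawl_id => "example-crawl-id")
    helper.destroy                                      # 0.0.74: falls back to {} and fills in :queue_name etc.
    helper.destroy(:queue_name => "cobweb_crawl_job")   # still accepted, as before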
@@ -1,7 +1,7 @@
  require 'digest/md5'
  require 'date'
  require 'ap'
- #require 'namespaced_redis'
+ require 'redis-namespace'
 
  # CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
  class CobwebCrawler
@@ -20,7 +20,7 @@ class CobwebCrawler
      @options[:crawl_id] = @crawl_id
    end
 
-   @redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{@crawl_id}")
+   @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
    @options[:internal_urls] = [] if @options[:internal_urls].nil?
    @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
    @debug = @options[:debug]
@@ -3,7 +3,7 @@ class CobwebVersion
 
   # Returns a string of the current version
   def self.version
-     "0.0.73"
+     "0.0.74"
   end
 
 end
@@ -6,7 +6,7 @@ class ContentLinkParser
 
   # Parses the content and absolutizes the urls based on url. Options can be setup to determine the links that are extracted.
   def initialize(url, content, options = {})
-    @options = options
+    @options = {}.merge(options)
     @url = url
     @doc = Nokogiri::HTML(content)
 
@@ -61,8 +61,7 @@ class ContentLinkParser
       end
       links.uniq
     else
-      puts "Warning: There was no configuration on how to find #{m} links"
-      []
+      super
     end
   end
 
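Two behavioural changes in ContentLinkParser: the options hash passed in is now copied ({}.merge(options)) rather than held by reference, and asking for a link type that has no configuration now falls through to super inside method_missing and raises NoMethodError instead of printing a warning and returning an empty array (the spec changes near the end of this diff cover the new behaviour). Roughly, and purely for illustration:

    parser = ContentLinkParser.new("http://example.com/", "<html><a href='/page'>page</a></html>")
    parser.unknown_link_type   # 0.0.73: warned and returned []; 0.0.74: raises NoMethodError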
data/lib/crawl.rb ADDED
@@ -0,0 +1,263 @@
1
+ module CobwebModule
2
+ class Crawl
3
+
4
+ def initialize(options={})
5
+ @options = HashUtil.deep_symbolize_keys(options)
6
+
7
+ setup_defaults
8
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", Redis.new(@options[:redis_options]))
9
+ @stats = Stats.new(@options)
10
+ @debug = @options[:debug]
11
+ @first_to_finish = false
12
+
13
+ end
14
+
15
+ # Returns true if the url requested is already in the crawled queue
16
+ def already_crawled?(link=@options[:url])
17
+ @redis.sismember "crawled", link
18
+ end
19
+
20
+ def already_queued?(link)
21
+ @redis.sismember "queued", link
22
+ end
23
+
24
+ # Returns true if the crawl count is within limits
25
+ def within_crawl_limits?
26
+ @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
27
+ end
28
+
29
+ # Returns true if the processed count is within limits
30
+ def within_process_limits?
31
+ @options[:crawl_limit].nil? || process_counter < @options[:crawl_limit].to_i
32
+ end
33
+
34
+ # Returns true if the queue count is calculated to be still within limits when complete
35
+ def within_queue_limits?
36
+
37
+ # if we are limiting by page we can't limit the queue size as we don't know the mime type until retrieved
38
+ if @options[:crawl_limit_by_page]
39
+ return true
40
+
41
+ # if a crawl limit is set, limit queue size to crawled + queue
42
+ elsif @options[:crawl_limit].to_i > 0
43
+ (queue_counter + crawl_counter) < @options[:crawl_limit].to_i
44
+
45
+ # no crawl limit set so always within queue limit
46
+ else
47
+ true
48
+ end
49
+ end
50
+
51
+ def retrieve
52
+ unless already_crawled?
53
+ if within_crawl_limits?
54
+ @stats.update_status("Retrieving #{@options[:url]}...")
55
+ @content = Cobweb.new(@options).get(@options[:url], @options)
56
+ if @options[:url] == @redis.get("original_base_url")
57
+ @redis.set("crawled_base_url", @content[:base_url])
58
+ end
59
+ update_queues
60
+
61
+ if content.permitted_type?
62
+ ## update statistics
63
+
64
+ @stats.update_statistics(@content)
65
+ return true
66
+ end
67
+ else
68
+ decrement_queue_counter
69
+ end
70
+ else
71
+ decrement_queue_counter
72
+ end
73
+ false
74
+ end
75
+
76
+ def process_links &block
77
+
78
+ # set the base url if this is the first page
79
+ set_base_url @redis
80
+
81
+ @cobweb_links = CobwebLinks.new(@options)
82
+ if within_queue_limits?
83
+ internal_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
84
+ #get rid of duplicate links in the same page.
85
+ internal_links.uniq!
86
+ # select the link if its internal
87
+ internal_links.select! { |link| @cobweb_links.internal?(link) }
88
+
89
+ # reject the link if we've crawled it or queued it
90
+ internal_links.reject! { |link| @redis.sismember("crawled", link) }
91
+ internal_links.reject! { |link| @redis.sismember("queued", link) }
92
+
93
+ internal_links.each do |link|
94
+ if within_queue_limits? && !already_queued?(link) && !already_crawled?(link)
95
+ if status != CobwebCrawlHelper::CANCELLED
96
+ yield link if block_given?
97
+ unless link.nil?
98
+ @redis.sadd "queued", link
99
+ increment_queue_counter
100
+ end
101
+ else
102
+ puts "Cannot enqueue new content as crawl has been cancelled." if @options[:debug]
103
+ end
104
+ end
105
+ end
106
+ end
107
+ end
108
+
109
+ def content
110
+ raise "Content is not available" if @content.nil?
111
+ CobwebModule::CrawlObject.new(@content, @options)
112
+ end
113
+
114
+ def update_queues
115
+ @redis.multi do
116
+ #@redis.incr "inprogress"
117
+ # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
118
+ @redis.srem "queued", @options[:url]
119
+ @redis.sadd "crawled", @options[:url]
120
+ if content.url != @options[:url]
121
+ @redis.srem "queued", content.url
122
+ @redis.sadd "crawled", content.url
123
+ end
124
+ # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
125
+ if @options[:crawl_limit_by_page]
126
+ ap "#{content.mime_type} - #{content.url}"
127
+ if content.mime_type.match("text/html")
128
+ increment_crawl_counter
129
+ end
130
+ else
131
+ increment_crawl_counter
132
+ end
133
+ decrement_queue_counter
134
+ end
135
+ end
136
+
137
+ def to_be_processed?
138
+ !finished? || first_to_finish? || within_process_limits?
139
+ end
140
+
141
+ def process
142
+ if @options[:crawl_limit_by_page]
143
+ if content.mime_type.match("text/html")
144
+ increment_process_counter
145
+ end
146
+ else
147
+ increment_process_counter
148
+ end
149
+ end
150
+
151
+ def finished?
152
+ print_counters
153
+ # if there's nothing left queued or the crawled limit has been reached
154
+ if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
155
+ if queue_counter.to_i == 0
156
+ finished
157
+ return true
158
+ end
159
+ elsif (queue_counter.to_i) == 0 || crawl_counter.to_i >= @options[:crawl_limit].to_i
160
+ finished
161
+ return true
162
+ end
163
+ false
164
+ end
165
+
166
+ def finished
167
+ set_first_to_finish if !@redis.exists("first_to_finish")
168
+ ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if @options[:debug]
169
+ @stats.end_crawl(@options)
170
+ end
171
+
172
+ def set_first_to_finish
173
+ @redis.watch("first_to_finish") do
174
+ if !@redis.exists("first_to_finish")
175
+ @redis.multi do
176
+ puts "set first to finish"
177
+ @first_to_finish = true
178
+ @redis.set("first_to_finish", 1)
179
+ end
180
+ else
181
+ @redis.unwatch
182
+ end
183
+ end
184
+ end
185
+
186
+
187
+ def first_to_finish?
188
+ @first_to_finish
189
+ end
190
+
191
+ def crawled_base_url
192
+ @redis.get("crawled_base_url")
193
+ end
194
+
195
+ def statistics
196
+ @stats.get_statistics
197
+ end
198
+
199
+ def redis
200
+ @redis
201
+ end
202
+
203
+ private
204
+ def setup_defaults
205
+ @options[:redis_options] = {} unless @options.has_key? :redis_options
206
+ @options[:crawl_limit_by_page] = false unless @options.has_key? :crawl_limit_by_page
207
+ @options[:valid_mime_types] = ["*/*"] unless @options.has_key? :valid_mime_types
208
+ end
209
+
210
+ # Increments the queue counter and refreshes crawl counters
211
+ def increment_queue_counter
212
+ @redis.incr "queue-counter"
213
+ end
214
+ # Increments the crawl counter and refreshes crawl counters
215
+ def increment_crawl_counter
216
+ @redis.incr "crawl-counter"
217
+ end
218
+ # Increments the process counter
219
+ def increment_process_counter
220
+ @redis.incr "process-counter"
221
+ end
222
+ # Decrements the queue counter and refreshes crawl counters
223
+ def decrement_queue_counter
224
+ @redis.decr "queue-counter"
225
+ end
226
+
227
+ def crawl_counter
228
+ @redis.get("crawl-counter").to_i
229
+ end
230
+ def queue_counter
231
+ @redis.get("queue-counter").to_i
232
+ end
233
+ def process_counter
234
+ @redis.get("process-counter").to_i
235
+ end
236
+
237
+ def status
238
+ @stats.get_status
239
+ end
240
+
241
+ def print_counters
242
+ puts counters
243
+ end
244
+
245
+ def counters
246
+ "crawl_counter: #{crawl_counter} queue_counter: #{queue_counter} process_counter: #{process_counter} crawl_limit: #{@options[:crawl_limit]}"
247
+ end
248
+
249
+ # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
250
+ def set_base_url(redis)
251
+ if redis.get("base_url").nil?
252
+ unless !defined?(content.redirect_through) || content.redirect_through.empty? || !@options[:first_page_redirect_internal]
253
+ uri = Addressable::URI.parse(content.redirect_through.last)
254
+ redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
255
+ end
256
+ redis.set("base_url", content.url)
257
+ end
258
+ end
259
+
260
+
261
+
262
+ end
263
+ end
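The new CobwebModule::Crawl class above gathers the crawl bookkeeping that previously lived inside CrawlJob (next file): the queue-counter, crawl-counter and process-counter keys in Redis and the limit predicates built on them. In particular, when a :crawl_limit is set and the crawl is not limited by page, a link is only queued while the crawled and queued totals together stay under the limit. A rough illustration of that arithmetic, with invented numbers:

    crawl_limit   = 10
    crawl_counter = 7    # pages already crawled
    queue_counter = 2    # links still waiting in the queue

    # within_queue_limits? reduces to:
    (queue_counter + crawl_counter) < crawl_limit   # => 9 < 10, so one more link may be queued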
data/lib/crawl_job.rb CHANGED
@@ -5,132 +5,58 @@ class CrawlJob
5
5
  require "net/https"
6
6
  require "uri"
7
7
  require "redis"
8
- require 'namespaced_redis'
9
-
8
+
10
9
  @queue = :cobweb_crawl_job
11
-
10
+
12
11
  # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
13
12
  def self.perform(content_request)
14
- # change all hash keys to symbols
15
- content_request = HashUtil.deep_symbolize_keys(content_request)
16
- @content_request = content_request
17
- @crawl = CobwebCrawlHelper.new(content_request)
18
-
19
- content_request[:redis_options] = {} unless content_request.has_key? :redis_options
20
- content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
21
- content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
22
13
 
23
- @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
24
- @stats = Stats.new(content_request)
14
+ # setup the crawl class to manage the crawl of this object
15
+ @crawl = CobwebModule::Crawl.new(content_request)
25
16
 
26
- @debug = content_request[:debug]
17
+ # update the counters and then perform the get, returns false if we are outwith limits
18
+ if @crawl.retrieve
27
19
 
28
- # check we haven't crawled this url before
29
- unless @redis.sismember "crawled", content_request[:url]
30
- # if there is no limit or we're still under it lets get the url
31
- if within_crawl_limits?(content_request[:crawl_limit])
32
- if @crawl.status != CobwebCrawlHelper::CANCELLED
33
- content = Cobweb.new(content_request).get(content_request[:url], content_request)
34
- if content_request[:url] == @redis.get("original_base_url")
35
- @redis.set("crawled_base_url", content[:base_url])
36
- end
37
- if is_permitted_type(content)
38
- begin
39
- @redis.incr "inprogress"
40
- # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
41
- @redis.srem "queued", content_request[:url]
42
- @redis.sadd "crawled", content_request[:url]
43
- @redis.srem "queued", content[:url]
44
- @redis.sadd "crawled", content[:url]
45
- # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
46
- if content_request[:crawl_limit_by_page]
47
- if content[:mime_type].match("text/html")
48
- increment_crawl_started_counter
49
- end
50
- else
51
- increment_crawl_started_counter
52
- end
53
-
54
- ## update statistics
55
- @stats.update_status("Crawling #{content_request[:url]}...")
56
- @stats.update_statistics(content)
57
-
58
- # set the base url if this is the first page
59
- set_base_url @redis, content, content_request
60
-
61
- @cobweb_links = CobwebLinks.new(content_request)
62
- if within_queue_limits?(content_request[:crawl_limit])
63
- internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
64
- #get rid of duplicate links in the same page.
65
- internal_links.uniq!
66
- # select the link if its internal
67
- internal_links.select! { |link| @cobweb_links.internal?(link) }
68
-
69
- # reject the link if we've crawled it or queued it
70
- internal_links.reject! { |link| @redis.sismember("crawled", link) }
71
- internal_links.reject! { |link| @redis.sismember("queued", link) }
72
-
73
- internal_links.each do |link|
74
- puts link
75
- puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
76
- if within_queue_limits?(content_request[:crawl_limit])
77
- if @crawl.status != CobwebCrawlHelper::CANCELLED
78
- enqueue_content(content_request, link)
79
- else
80
- puts "Cannot enqueue new content as crawl has been cancelled." if content_request[:debug]
81
- end
82
- end
83
- end
84
- end
85
-
86
- # enqueue to processing queue
87
- send_to_processing_queue(content, content_request)
20
+ # if the crawled object is an object type we are interested
21
+ if @crawl.content.permitted_type?
22
+
23
+ # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
24
+ @crawl.process_links do |link|
88
25
 
89
- #if the enqueue counter has been requested update that
90
- if content_request.has_key?(:enqueue_counter_key)
91
- enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
92
- current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
93
- enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
94
- end
26
+ # enqueue the links to resque
27
+ puts "ENQUEUED LINK: #{link}"
28
+ enqueue_content(content_request, link)
95
29
 
96
- ensure
97
- @redis.decr "inprogress"
98
- #update the queued and crawled lists if we are within the crawl limits.
99
-
100
- # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
101
- # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
102
- # stewart: i'm looking at the layout of this, think that there is scope for cleaning up the perform method to be more DRY.
103
- if content_request[:crawl_limit_by_page]
104
- if content[:mime_type].match("text/html")
105
- increment_crawl_counter
106
- end
107
- else
108
- increment_crawl_counter
109
- end
110
- puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter} In Progress: #{crawl_started_counter-crawl_counter}" if @debug
111
- end
112
- else
113
- puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
30
+ end
31
+
32
+
33
+ if @crawl.to_be_processed?
34
+ @crawl.process
35
+
36
+ # enqueue to processing queue
37
+ @crawl.redis.incr("crawl_job_enqueued_count")
38
+ puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
39
+ send_to_processing_queue(@crawl.content.to_hash, content_request)
40
+
41
+
42
+ #if the enqueue counter has been requested update that
43
+ if content_request.has_key?(:enqueue_counter_key)
44
+ enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
45
+ current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
46
+ enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
114
47
  end
115
48
  else
116
- puts "ignoring #{content_request[:url]} as crawl has been cancelled." if content_request[:debug]
49
+ ap "@crawl.finished? #{@crawl.finished?}"
50
+ ap "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
51
+ ap "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
117
52
  end
118
- else
119
- puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
120
53
  end
121
-
122
- else
123
- @redis.srem "queued", content_request[:url]
124
- puts "Already crawled #{content_request[:url]}" if content_request[:debug]
125
54
  end
126
-
127
- decrement_queue_counter
128
- # if there's nothing left queued or the crawled limit has been reached
129
- if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
130
- if queue_counter + crawl_started_counter - crawl_counter == 0
131
- finished(content_request)
132
- end
133
- elsif (queue_counter+crawl_started_counter-crawl_counter)== 0 || crawl_counter >= content_request[:crawl_limit].to_i
55
+
56
+ # test queue and crawl sizes to see if we have completed the crawl
57
+ ap "finished? #{@crawl.finished?}"
58
+ ap "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
59
+ if @crawl.finished? && @crawl.first_to_finish?
134
60
  finished(content_request)
135
61
  end
136
62
 
@@ -138,19 +64,12 @@ class CrawlJob
138
64
 
139
65
  # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
140
66
  def self.finished(content_request)
141
- # finished
142
- if @crawl.status != CobwebCrawlHelper::FINISHED and @crawl.status != CobwebCrawlHelper::CANCELLED && @redis.get("inprogress").to_i==0
143
- ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
144
- @stats.end_crawl(content_request)
145
-
146
- additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @redis.get("crawled_base_url")}
147
- additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
148
- additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
149
-
150
- Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge(additional_stats))
151
- else
152
- # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
153
- end
67
+ additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @crawl.crawled_base_url}
68
+ additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
69
+ additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
70
+
71
+ @crawl.redis.incr("crawl_finished_enqueued_count")
72
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @crawl.statistics.merge(additional_stats))
154
73
  end
155
74
 
156
75
  # Enqueues the content to the processing queue setup in options
@@ -171,34 +90,6 @@ class CrawlJob
171
90
 
172
91
  private
173
92
 
174
- # Helper method to determine if this content is to be processed or not
175
- def self.is_permitted_type(content)
176
- @content_request[:valid_mime_types].each do |mime_type|
177
- return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
178
- end
179
- false
180
- end
181
-
182
- # Returns true if the crawl count is within limits
183
- def self.within_crawl_limits?(crawl_limit)
184
- crawl_limit.nil? or crawl_counter < crawl_limit.to_i
185
- end
186
-
187
- # Returns true if the queue count is calculated to be still within limits when complete
188
- def self.within_queue_limits?(crawl_limit)
189
- (@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (queue_counter + crawl_counter) < crawl_limit.to_i)
190
- end
191
-
192
- # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
193
- def self.set_base_url(redis, content, content_request)
194
- if redis.get("base_url").nil?
195
- unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
196
- uri = Addressable::URI.parse(content[:redirect_through].last)
197
- redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
198
- end
199
- redis.set("base_url", content[:url])
200
- end
201
- end
202
93
 
203
94
  # Enqueues content to the crawl_job queue
204
95
  def self.enqueue_content(content_request, link)
@@ -206,43 +97,8 @@ class CrawlJob
206
97
  new_request[:url] = link
207
98
  new_request[:parent] = content_request[:url]
208
99
  #to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
209
- @redis.sadd "queued", link
210
100
  Resque.enqueue(CrawlJob, new_request)
211
- increment_queue_counter
212
101
  end
213
102
 
214
- # Increments the queue counter and refreshes crawl counters
215
- def self.increment_queue_counter
216
- @redis.incr "queue-counter"
217
- end
218
- # Increments the crawl counter and refreshes crawl counters
219
- def self.increment_crawl_counter
220
- @redis.incr "crawl-counter"
221
- end
222
- def self.increment_crawl_started_counter
223
- @redis.incr "crawl-started-counter"
224
- end
225
- # Decrements the queue counter and refreshes crawl counters
226
- def self.decrement_queue_counter
227
- @redis.decr "queue-counter"
228
- end
229
-
230
- def self.crawl_counter
231
- @redis.get("crawl-counter").to_i
232
- end
233
- def self.crawl_started_counter
234
- @redis.get("crawl-started-counter").to_i
235
- end
236
- def self.queue_counter
237
- @redis.get("queue-counter").to_i
238
- end
239
-
240
- def self.print_counters
241
- puts counters
242
- end
243
-
244
- def self.counters
245
- "crawl_counter: #{crawl_counter} crawl_started_counter: #{crawl_started_counter} queue_counter: #{queue_counter}"
246
- end
247
103
 
248
104
  end
data/lib/crawl_object.rb ADDED
@@ -0,0 +1,30 @@
+ module CobwebModule
+   class CrawlObject
+ 
+     def initialize(content_hash, options={})
+       @content = HashUtil.deep_symbolize_keys(content_hash)
+       @options = options
+     end
+ 
+ 
+     # Helper method to determine if this content is to be processed or not
+     def permitted_type?
+       @options[:valid_mime_types].each do |valid_mime_type|
+         return true if @content[:mime_type].match(Cobweb.escape_pattern_for_regex(valid_mime_type))
+       end
+       false
+     end
+ 
+     def method_missing(m)
+       if @content.keys.include? m.to_sym
+         @content[m.to_sym]
+       else
+         super
+       end
+     end
+ 
+     def to_hash
+       @content
+     end
+   end
+ end
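CrawlObject wraps the content hash returned from a fetch, symbolizes its keys, and exposes them as reader methods through method_missing; unknown names fall through to the normal NoMethodError. A small sketch with made-up content:

    content = { "mime_type" => "text/html", "url" => "http://example.com/" }
    page = CobwebModule::CrawlObject.new(content, :valid_mime_types => ["text/*"])

    page.mime_type         # => "text/html" (read from the symbolized hash)
    page.permitted_type?   # => true, assuming 'text/*' is treated as a glob-style pattern
    page.body              # raises NoMethodError, the key is not in the hash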
data/lib/hash_util.rb CHANGED
@@ -3,6 +3,7 @@ class HashUtil
 
   # Returns a hash with the keys converted to symbols
   def self.deep_symbolize_keys(hash)
+    raise "Cannot symbolize keys for a nil object" if hash.nil?
     hash.keys.each do |key|
       value = hash[key]
       hash.delete(key)
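The only change to HashUtil is a guard that fails fast on nil input, which previously surfaced as a bare NoMethodError from hash.keys. For example:

    HashUtil.deep_symbolize_keys({ "a" => 1 })   # => { :a => 1 }
    HashUtil.deep_symbolize_keys(nil)            # now raises "Cannot symbolize keys for a nil object"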
data/lib/server.rb CHANGED
@@ -16,7 +16,7 @@ class Server < Sinatra::Base
     @crawls = []
     @full_redis.smembers("cobweb_crawls").each do |crawl_id|
       version = cobweb_version(crawl_id)
-      redis = NamespacedRedis.new(redis_options, "cobweb-#{version}-#{crawl_id}")
+      redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => Redis.new(redis_options))
       stats = HashUtil.deep_symbolize_keys({
         :cobweb_version => version,
         :crawl_details => redis.hgetall("crawl_details"),
@@ -33,7 +33,7 @@ class Server < Sinatra::Base
   get '/statistics/:crawl_id' do
 
     version = cobweb_version(params[:crawl_id])
-    redis = NamespacedRedis.new(redis_options, "cobweb-#{version}-#{params[:crawl_id]}")
+    redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis => Redis.new(redis_options))
 
     @statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
     if @statistics[:status_counts].nil?
data/lib/stats.rb CHANGED
@@ -8,7 +8,7 @@ class Stats
   def initialize(options)
     options[:redis_options] = {} unless options.has_key? :redis_options
     @full_redis = Redis.new(options[:redis_options])
-    @redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => @full_redis)
   end
 
   # Sets up the crawl in statistics
@@ -9,7 +9,7 @@ describe Cobweb, :local_only => true do
9
9
  # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
10
10
  puts "Starting Workers... Please Wait..."
11
11
  `mkdir log`
12
- io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=1 QUEUE=cobweb_crawl_job > log/output.log &")
12
+ io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
13
13
  puts "Workers Started."
14
14
 
15
15
  end
@@ -17,6 +17,7 @@ describe Cobweb, :local_only => true do
17
17
  before(:each) do
18
18
  @base_url = "http://localhost:3532/"
19
19
  @base_page_count = 77
20
+
20
21
  clear_queues
21
22
  end
22
23
 
@@ -29,6 +30,7 @@ describe Cobweb, :local_only => true do
29
30
  :debug => false,
30
31
  :cache => nil
31
32
  }
33
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
32
34
  @cobweb = Cobweb.new @request
33
35
  end
34
36
  it "should not crawl anything if nothing has started" do
@@ -37,7 +39,7 @@ describe Cobweb, :local_only => true do
37
39
  crawl_obj.destroy
38
40
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
39
41
  wait_for_crawl_finished crawl[:crawl_id]
40
- Resque.size("cobweb_process_job").should == 0
42
+ @redis.get("crawl_job_enqueued_count").to_i.should == 0
41
43
  end
42
44
 
43
45
  it "should not complete the crawl when cancelled" do
@@ -47,8 +49,8 @@ describe Cobweb, :local_only => true do
47
49
  crawl_obj.destroy
48
50
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
49
51
  wait_for_crawl_finished crawl[:crawl_id]
50
- Resque.size("cobweb_process_job").should > 0
51
- Resque.size("cobweb_process_job").should_not == @base_page_count
52
+ @redis.get("crawl_job_enqueued_count").to_i.should > 0
53
+ @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
52
54
  end
53
55
 
54
56
  end
@@ -61,22 +63,24 @@ describe Cobweb, :local_only => true do
61
63
  :debug => false,
62
64
  :cache => nil
63
65
  }
66
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
67
+
64
68
  @cobweb = Cobweb.new @request
65
69
  end
66
70
 
67
71
  it "should crawl entire site" do
68
- ap Resque.size("cobweb_process_job")
69
72
  crawl = @cobweb.start(@base_url)
70
73
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
71
74
  wait_for_crawl_finished crawl[:crawl_id]
72
- ap @stat.get_statistics
73
- Resque.size("cobweb_process_job").should == @base_page_count
75
+ @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
76
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 1
74
77
  end
75
78
  it "detect crawl finished once" do
76
79
  crawl = @cobweb.start(@base_url)
77
80
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
78
81
  wait_for_crawl_finished crawl[:crawl_id]
79
- Resque.size("cobweb_finished_job").should == 1
82
+ @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
83
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 1
80
84
  end
81
85
  end
82
86
  describe "with limited mime_types" do
@@ -87,6 +91,7 @@ describe Cobweb, :local_only => true do
87
91
  :cache => nil,
88
92
  :valid_mime_types => ["text/html"]
89
93
  }
94
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
90
95
  @cobweb = Cobweb.new @request
91
96
  end
92
97
 
@@ -94,7 +99,7 @@ describe Cobweb, :local_only => true do
94
99
  crawl = @cobweb.start(@base_url)
95
100
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
96
101
  wait_for_crawl_finished crawl[:crawl_id]
97
- Resque.size("cobweb_process_job").should == 8
102
+ @redis.get("crawl_job_enqueued_count").to_i.should == 8
98
103
 
99
104
  mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
100
105
  mime_types.count.should == 8
@@ -110,6 +115,7 @@ describe Cobweb, :local_only => true do
110
115
  :quiet => true,
111
116
  :cache => nil
112
117
  }
118
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
113
119
  end
114
120
 
115
121
  describe "limit to 1" do
@@ -122,19 +128,19 @@ describe Cobweb, :local_only => true do
122
128
  crawl = @cobweb.start(@base_url)
123
129
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
124
130
  wait_for_crawl_finished crawl[:crawl_id]
125
- Resque.size("cobweb_process_job").should_not == @base_page_count
131
+ @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
126
132
  end
127
133
  it "should only crawl 1 page" do
128
134
  crawl = @cobweb.start(@base_url)
129
135
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
130
136
  wait_for_crawl_finished crawl[:crawl_id]
131
- Resque.size("cobweb_process_job").should == 1
137
+ @redis.get("crawl_job_enqueued_count").to_i.should == 1
132
138
  end
133
139
  it "should notify of crawl finished once" do
134
140
  crawl = @cobweb.start(@base_url)
135
141
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
136
142
  wait_for_crawl_finished crawl[:crawl_id]
137
- Resque.size("cobweb_finished_job").should == 1
143
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 1
138
144
  end
139
145
  end
140
146
 
@@ -145,6 +151,7 @@ describe Cobweb, :local_only => true do
145
151
  @cobweb = Cobweb.new @request
146
152
  end
147
153
 
154
+ # the following describes when we want all the assets of a page, and the page itself, but we only want 5 pages
148
155
  it "should only use html pages towards the crawl limit" do
149
156
  crawl = @cobweb.start(@base_url)
150
157
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -165,19 +172,19 @@ describe Cobweb, :local_only => true do
165
172
  crawl = @cobweb.start(@base_url)
166
173
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
167
174
  wait_for_crawl_finished crawl[:crawl_id]
168
- Resque.size("cobweb_process_job").should_not == @base_page_count
175
+ @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
169
176
  end
170
177
  it "should notify of crawl finished once" do
171
178
  crawl = @cobweb.start(@base_url)
172
179
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
173
180
  wait_for_crawl_finished crawl[:crawl_id]
174
- Resque.size("cobweb_finished_job").should == 1
181
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 1
175
182
  end
176
183
  it "should only crawl 10 objects" do
177
184
  crawl = @cobweb.start(@base_url)
178
185
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
179
186
  wait_for_crawl_finished crawl[:crawl_id]
180
- Resque.size("cobweb_process_job").should == 10
187
+ @redis.get("crawl_job_enqueued_count").to_i.should == 10
181
188
  end
182
189
  end
183
190
 
@@ -191,23 +198,24 @@ describe Cobweb, :local_only => true do
191
198
  crawl = @cobweb.start(@base_url)
192
199
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
193
200
  wait_for_crawl_finished crawl[:crawl_id]
194
- Resque.size("cobweb_process_job").should == @base_page_count
201
+ @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
195
202
  end
196
203
  it "should notify of crawl finished once" do
197
204
  crawl = @cobweb.start(@base_url)
198
205
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
199
206
  wait_for_crawl_finished crawl[:crawl_id]
200
- Resque.size("cobweb_finished_job").should == 1
207
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 1
201
208
  end
202
209
  it "should not crawl 100 pages" do
203
210
  crawl = @cobweb.start(@base_url)
204
211
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
205
212
  wait_for_crawl_finished crawl[:crawl_id]
206
- Resque.size("cobweb_process_job").should_not == 100
213
+ @redis.get("crawl_job_enqueued_count").to_i.should_not == 100
207
214
  end
208
215
  end
209
216
  end
210
217
 
218
+
211
219
  after(:all) do
212
220
 
213
221
  @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
@@ -220,26 +228,43 @@ describe Cobweb, :local_only => true do
220
228
  end
221
229
 
222
230
  def wait_for_crawl_finished(crawl_id, timeout=20)
223
- counter = 0
231
+ @counter = 0
224
232
  start_time = Time.now
225
233
  while(running?(crawl_id) && Time.now < start_time + timeout) do
226
- sleep 0.5
227
- end
228
- if Time.now > start_time + timeout
229
- raise "End of crawl not detected"
230
- end
234
+ sleep 0.5
231
235
  end
232
-
233
- def running?(crawl_id)
234
- @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
236
+ if Time.now > start_time + timeout
237
+ raise "End of crawl not detected"
235
238
  end
239
+ end
236
240
 
237
- def clear_queues
238
- Resque.queues.each do |queue|
239
- Resque.remove_queue(queue)
241
+ def running?(crawl_id)
242
+ status = @stat.get_status
243
+ result = true
244
+ if status == CobwebCrawlHelper::STARTING
245
+ result = true
246
+ else
247
+ if status == @last_stat
248
+ if @counter > 5
249
+ raise "Static status: #{status}"
250
+ else
251
+ @counter += 1
252
+ end
253
+ puts "Static Status.. #{6-@counter}"
254
+ else
255
+ result = status != CobwebCrawlHelper::FINISHED && status != CobwebCrawlHelper::CANCELLED
240
256
  end
257
+ end
258
+ @last_stat = @stat.get_status
259
+ result
260
+ end
241
261
 
242
- Resque.size("cobweb_process_job").should == 0
243
- Resque.size("cobweb_finished_job").should == 0
244
- Resque.peek("cobweb_process_job", 0, 200).should be_empty
262
+ def clear_queues
263
+ Resque.queues.each do |queue|
264
+ Resque.remove_queue(queue)
245
265
  end
266
+
267
+ Resque.size("cobweb_process_job").should == 0
268
+ Resque.size("cobweb_finished_job").should == 0
269
+ Resque.peek("cobweb_process_job", 0, 200).should be_empty
270
+ end
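Presumably because the suite now starts three workers on cobweb_crawl_job (COUNT=3 above), queue sizes are drained while the tests run, so these specs assert on the counters the new crawl code keeps in Redis rather than on Resque.size. The pattern used throughout the updated specs (crawl id illustrative):

    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{crawl[:crawl_id]}", Redis.new)
    @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
    @redis.get("crawl_finished_enqueued_count").to_i.should == 1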
@@ -76,11 +76,9 @@ describe ContentLinkParser do
76
76
  links.length.should == 3
77
77
  end
78
78
  end
79
- describe "returning unknown link type" do
79
+ describe "returning unknown link type should raise an error" do
80
80
  it "should return an empty array" do
81
- links = @content_parser.asdfasdfsadf
82
- links.should_not be_nil
83
- links.should be_an_instance_of Array
81
+ lambda {@content_parser.asdfasdfsadf}.should raise_error
84
82
  end
85
83
  end
86
84
  end
@@ -122,7 +120,7 @@ describe ContentLinkParser do
122
120
  describe "ignoring default tags" do
123
121
  it "should not return any links" do
124
122
  parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
125
- parser.links.should be_empty
123
+ lambda{parser.links}.should raise_error(NoMethodError)
126
124
  end
127
125
  end
128
126
  end
@@ -0,0 +1,101 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+ describe Cobweb, :local_only => true do
4
+
5
+ before(:all) do
6
+ #store all existing resque process ids so we don't kill them afterwards
7
+ @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
8
+
9
+ # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
10
+ puts "Starting Workers... Please Wait..."
11
+ `mkdir log`
12
+ io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
13
+ puts "Workers Started."
14
+
15
+ end
16
+
17
+ before(:each) do
18
+ @base_url = "http://localhost:3532/"
19
+ @base_page_count = 77
20
+ clear_queues
21
+ end
22
+
23
+ describe "with a crawl limit" do
24
+ before(:each) do
25
+ @request = {
26
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
27
+ :quiet => true,
28
+ :cache => nil,
29
+ :use_encoding_safe_process_job => true,
30
+ :crawl_limit_by_page => true
31
+ }
32
+ end
33
+
34
+ describe "on ancestry.com.au" do
35
+ describe "limited to 100" do
36
+ before(:each) do
37
+ @request[:crawl_limit] = 100
38
+ @request[:valid_mime_types] = ["text/html"]
39
+ @cobweb = Cobweb.new @request
40
+ end
41
+
42
+ it "should crawl 100 pages" do
43
+ crawl = @cobweb.start("http://www.ancestry.com.au/")
44
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
45
+ wait_for_crawl_finished crawl[:crawl_id], 180
46
+ puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
47
+ end
48
+ end
49
+
50
+ describe "limited to 999" do
51
+ before(:each) do
52
+ @request[:crawl_limit] = 999
53
+ @cobweb = Cobweb.new @request
54
+ end
55
+
56
+ it "should crawl 999 pages" do
57
+ crawl = @cobweb.start("http://www.ancestry.com.au/")
58
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
59
+ wait_for_crawl_finished crawl[:crawl_id], 720
60
+ puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
61
+ end
62
+ end
63
+ __END__
64
+
65
+ end
66
+
67
+ after(:all) do
68
+
69
+ @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
70
+ command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
71
+ IO.popen(command)
72
+
73
+ clear_queues
74
+ end
75
+
76
+ end
77
+
78
+ def wait_for_crawl_finished(crawl_id, timeout=20)
79
+ counter = 0
80
+ start_time = Time.now
81
+ while(running?(crawl_id) && Time.now < start_time + timeout) do
82
+ sleep 0.5
83
+ end
84
+ if Time.now > start_time + timeout
85
+ raise "End of crawl not detected"
86
+ end
87
+ end
88
+
89
+ def running?(crawl_id)
90
+ @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
91
+ end
92
+
93
+ def clear_queues
94
+ Resque.queues.each do |queue|
95
+ Resque.remove_queue(queue)
96
+ end
97
+
98
+ Resque.size("cobweb_process_job").should == 0
99
+ Resque.size("cobweb_finished_job").should == 0
100
+ Resque.peek("cobweb_process_job", 0, 200).should be_empty
101
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.73
+  version: 0.0.74
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
- date: 2012-09-20 00:00:00.000000000 Z
+ date: 2012-10-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &70226914018080 !ruby/object:Gem::Requirement
+  requirement: &70347429190520 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70226914018080
24
+ version_requirements: *70347429190520
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70226914017080 !ruby/object:Gem::Requirement
27
+ requirement: &70347429190020 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70226914017080
35
+ version_requirements: *70347429190020
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70226914016400 !ruby/object:Gem::Requirement
38
+ requirement: &70347429189540 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70226914016400
46
+ version_requirements: *70347429189540
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70226914015220 !ruby/object:Gem::Requirement
49
+ requirement: &70347429188880 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70226914015220
57
+ version_requirements: *70347429188880
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70226914014640 !ruby/object:Gem::Requirement
60
+ requirement: &70347429187340 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70226914014640
68
+ version_requirements: *70347429187340
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70226914013860 !ruby/object:Gem::Requirement
71
+ requirement: &70347429185820 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70226914013860
79
+ version_requirements: *70347429185820
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70226914013140 !ruby/object:Gem::Requirement
82
+ requirement: &70347429185040 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70226914013140
90
+ version_requirements: *70347429185040
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70226914012280 !ruby/object:Gem::Requirement
93
+ requirement: &70347429184340 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70226914012280
101
+ version_requirements: *70347429184340
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70226914011460 !ruby/object:Gem::Requirement
104
+ requirement: &70347429183120 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70226914011460
112
+ version_requirements: *70347429183120
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70226914010720 !ruby/object:Gem::Requirement
115
+ requirement: &70347429181840 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70226914010720
123
+ version_requirements: *70347429181840
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70226914010260 !ruby/object:Gem::Requirement
126
+ requirement: &70347429180860 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70226914010260
134
+ version_requirements: *70347429180860
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
136
  crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -142,13 +142,14 @@ extensions: []
 extra_rdoc_files:
 - README.textile
 files:
+ - spec/cobweb/cobweb_crawl_helper_spec.rb
 - spec/cobweb/cobweb_crawler_spec.rb
 - spec/cobweb/cobweb_job_spec.rb
 - spec/cobweb/cobweb_links_spec.rb
 - spec/cobweb/cobweb_spec.rb
 - spec/cobweb/content_link_parser_spec.rb
- - spec/cobweb/crawl_spec.rb
 - spec/cobweb/robots_spec.rb
+ - spec/cobweb/site_test_spec.rb.tmp
 - spec/samples/robots.txt
 - spec/samples/sample_html_links.html
 - spec/samples/sample_server.rb
@@ -328,7 +329,9 @@ files:
 - lib/cobweb_process_job.rb
 - lib/cobweb_version.rb
 - lib/content_link_parser.rb
+ - lib/crawl.rb
 - lib/crawl_job.rb
+ - lib/crawl_object.rb
 - lib/encoding_safe_process_job.rb
 - lib/hash_util.rb
 - lib/redirect_error.rb