cobweb 0.0.73 → 0.0.74

data/README.textile CHANGED
@@ -1,5 +1,5 @@
 
- h1. Cobweb v0.0.73
+ h1. Cobweb v0.0.74
 
  "@cobweb_gem":https://twitter.com/cobweb_gem
 
data/lib/cobweb.rb CHANGED
@@ -4,7 +4,6 @@ require 'resque'
  require "addressable/uri"
  require 'digest/sha1'
  require 'base64'
- require 'namespaced_redis'
 
  Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
    require file
@@ -46,6 +45,7 @@ class Cobweb
  default_text_mime_types_to ["text/*", "application/xhtml+xml"]
  default_obey_robots_to false
  default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
+ default_valid_mime_types_to ["*/*"]
 
  end
 
@@ -65,7 +65,7 @@ class Cobweb
  end
 
  request.merge!(@options)
- @redis = NamespacedRedis.new(request[:redis_options], "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => Redis.new(request[:redis_options]))
  @redis.set("original_base_url", base_url)
  @redis.hset "statistics", "queued_at", DateTime.now
  @redis.set("crawl-counter", 0)
@@ -110,9 +110,9 @@ class Cobweb
 
  # connect to redis
  if options.has_key? :crawl_id
- redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
  else
- redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
  end
 
  content = {:base_url => url}
@@ -269,9 +269,9 @@ class Cobweb
 
  # connect to redis
  if options.has_key? :crawl_id
- redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
  else
- redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
  end
 
  content = {:base_url => url}
@@ -15,7 +15,7 @@ class CobwebCrawlHelper
  @stats = Stats.new(data)
  end
 
- def destroy(options)
+ def destroy(options={})
 
  options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)
  options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
@@ -1,7 +1,7 @@
  require 'digest/md5'
  require 'date'
  require 'ap'
- #require 'namespaced_redis'
+ require 'redis-namespace'
 
  # CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
  class CobwebCrawler
@@ -20,7 +20,7 @@ class CobwebCrawler
  @options[:crawl_id] = @crawl_id
  end
 
- @redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}-#{@crawl_id}")
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
  @options[:internal_urls] = [] if @options[:internal_urls].nil?
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
  @debug = @options[:debug]
@@ -3,7 +3,7 @@ class CobwebVersion
 
  # Returns a string of the current version
  def self.version
- "0.0.73"
+ "0.0.74"
  end
 
  end
@@ -6,7 +6,7 @@ class ContentLinkParser
 
  # Parses the content and absolutizes the urls based on url. Options can be setup to determine the links that are extracted.
  def initialize(url, content, options = {})
- @options = options
+ @options = {}.merge(options)
  @url = url
  @doc = Nokogiri::HTML(content)
 
@@ -61,8 +61,7 @@ class ContentLinkParser
  end
  links.uniq
  else
- puts "Warning: There was no configuration on how to find #{m} links"
- []
+ super
  end
  end
 
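The change that recurs across the files above is the swap from the namespaced_redis wrapper to the redis-namespace gem. A rough standalone sketch of the difference in the calls (the host, port, and crawl id below are placeholders, not values taken from the gem):

    require 'redis'
    require 'redis-namespace'

    redis_options = { :host => "localhost", :port => 6379 }  # placeholder connection options

    # before (namespaced_redis):
    #   redis = NamespacedRedis.new(redis_options, "cobweb-0.0.73-abc123")
    # after (redis-namespace), as used throughout this release:
    redis = Redis::Namespace.new("cobweb-0.0.74-abc123", :redis => Redis.new(redis_options))

    # keys are transparently prefixed with the namespace
    redis.set("crawl-counter", 0)   # stored as "cobweb-0.0.74-abc123:crawl-counter"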
data/lib/crawl.rb ADDED
@@ -0,0 +1,263 @@
+ module CobwebModule
+ class Crawl
+
+ def initialize(options={})
+ @options = HashUtil.deep_symbolize_keys(options)
+
+ setup_defaults
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", Redis.new(@options[:redis_options]))
+ @stats = Stats.new(@options)
+ @debug = @options[:debug]
+ @first_to_finish = false
+
+ end
+
+ # Returns true if the url requested is already in the crawled queue
+ def already_crawled?(link=@options[:url])
+ @redis.sismember "crawled", link
+ end
+
+ def already_queued?(link)
+ @redis.sismember "queued", link
+ end
+
+ # Returns true if the crawl count is within limits
+ def within_crawl_limits?
+ @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
+ end
+
+ # Returns true if the processed count is within limits
+ def within_process_limits?
+ @options[:crawl_limit].nil? || process_counter < @options[:crawl_limit].to_i
+ end
+
+ # Returns true if the queue count is calculated to be still within limits when complete
+ def within_queue_limits?
+
+ # if we are limiting by page we can't limit the queue size as we don't know the mime type until retrieved
+ if @options[:crawl_limit_by_page]
+ return true
+
+ # if a crawl limit is set, limit queue size to crawled + queue
+ elsif @options[:crawl_limit].to_i > 0
+ (queue_counter + crawl_counter) < @options[:crawl_limit].to_i
+
+ # no crawl limit set so always within queue limit
+ else
+ true
+ end
+ end
+
+ def retrieve
+ unless already_crawled?
+ if within_crawl_limits?
+ @stats.update_status("Retrieving #{@options[:url]}...")
+ @content = Cobweb.new(@options).get(@options[:url], @options)
+ if @options[:url] == @redis.get("original_base_url")
+ @redis.set("crawled_base_url", @content[:base_url])
+ end
+ update_queues
+
+ if content.permitted_type?
+ ## update statistics
+
+ @stats.update_statistics(@content)
+ return true
+ end
+ else
+ decrement_queue_counter
+ end
+ else
+ decrement_queue_counter
+ end
+ false
+ end
+
+ def process_links &block
+
+ # set the base url if this is the first page
+ set_base_url @redis
+
+ @cobweb_links = CobwebLinks.new(@options)
+ if within_queue_limits?
+ internal_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
+ #get rid of duplicate links in the same page.
+ internal_links.uniq!
+ # select the link if its internal
+ internal_links.select! { |link| @cobweb_links.internal?(link) }
+
+ # reject the link if we've crawled it or queued it
+ internal_links.reject! { |link| @redis.sismember("crawled", link) }
+ internal_links.reject! { |link| @redis.sismember("queued", link) }
+
+ internal_links.each do |link|
+ if within_queue_limits? && !already_queued?(link) && !already_crawled?(link)
+ if status != CobwebCrawlHelper::CANCELLED
+ yield link if block_given?
+ unless link.nil?
+ @redis.sadd "queued", link
+ increment_queue_counter
+ end
+ else
+ puts "Cannot enqueue new content as crawl has been cancelled." if @options[:debug]
+ end
+ end
+ end
+ end
+ end
+
+ def content
+ raise "Content is not available" if @content.nil?
+ CobwebModule::CrawlObject.new(@content, @options)
+ end
+
+ def update_queues
+ @redis.multi do
+ #@redis.incr "inprogress"
+ # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
+ @redis.srem "queued", @options[:url]
+ @redis.sadd "crawled", @options[:url]
+ if content.url != @options[:url]
+ @redis.srem "queued", content.url
+ @redis.sadd "crawled", content.url
+ end
+ # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
+ if @options[:crawl_limit_by_page]
+ ap "#{content.mime_type} - #{content.url}"
+ if content.mime_type.match("text/html")
+ increment_crawl_counter
+ end
+ else
+ increment_crawl_counter
+ end
+ decrement_queue_counter
+ end
+ end
+
+ def to_be_processed?
+ !finished? || first_to_finish? || within_process_limits?
+ end
+
+ def process
+ if @options[:crawl_limit_by_page]
+ if content.mime_type.match("text/html")
+ increment_process_counter
+ end
+ else
+ increment_process_counter
+ end
+ end
+
+ def finished?
+ print_counters
+ # if there's nothing left queued or the crawled limit has been reached
+ if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
+ if queue_counter.to_i == 0
+ finished
+ return true
+ end
+ elsif (queue_counter.to_i) == 0 || crawl_counter.to_i >= @options[:crawl_limit].to_i
+ finished
+ return true
+ end
+ false
+ end
+
+ def finished
+ set_first_to_finish if !@redis.exists("first_to_finish")
+ ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if @options[:debug]
+ @stats.end_crawl(@options)
+ end
+
+ def set_first_to_finish
+ @redis.watch("first_to_finish") do
+ if !@redis.exists("first_to_finish")
+ @redis.multi do
+ puts "set first to finish"
+ @first_to_finish = true
+ @redis.set("first_to_finish", 1)
+ end
+ else
+ @redis.unwatch
+ end
+ end
+ end
+
+
+ def first_to_finish?
+ @first_to_finish
+ end
+
+ def crawled_base_url
+ @redis.get("crawled_base_url")
+ end
+
+ def statistics
+ @stats.get_statistics
+ end
+
+ def redis
+ @redis
+ end
+
+ private
+ def setup_defaults
+ @options[:redis_options] = {} unless @options.has_key? :redis_options
+ @options[:crawl_limit_by_page] = false unless @options.has_key? :crawl_limit_by_page
+ @options[:valid_mime_types] = ["*/*"] unless @options.has_key? :valid_mime_types
+ end
+
+ # Increments the queue counter and refreshes crawl counters
+ def increment_queue_counter
+ @redis.incr "queue-counter"
+ end
+ # Increments the crawl counter and refreshes crawl counters
+ def increment_crawl_counter
+ @redis.incr "crawl-counter"
+ end
+ # Increments the process counter
+ def increment_process_counter
+ @redis.incr "process-counter"
+ end
+ # Decrements the queue counter and refreshes crawl counters
+ def decrement_queue_counter
+ @redis.decr "queue-counter"
+ end
+
+ def crawl_counter
+ @redis.get("crawl-counter").to_i
+ end
+ def queue_counter
+ @redis.get("queue-counter").to_i
+ end
+ def process_counter
+ @redis.get("process-counter").to_i
+ end
+
+ def status
+ @stats.get_status
+ end
+
+ def print_counters
+ puts counters
+ end
+
+ def counters
+ "crawl_counter: #{crawl_counter} queue_counter: #{queue_counter} process_counter: #{process_counter} crawl_limit: #{@options[:crawl_limit]}"
+ end
+
+ # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
+ def set_base_url(redis)
+ if redis.get("base_url").nil?
+ unless !defined?(content.redirect_through) || content.redirect_through.empty? || !@options[:first_page_redirect_internal]
+ uri = Addressable::URI.parse(content.redirect_through.last)
+ redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
+ end
+ redis.set("base_url", content.url)
+ end
+ end
+
+
+
+ end
+ end
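The set_first_to_finish method added above relies on Redis WATCH/MULTI so that only one worker claims the "first_to_finish" flag when several workers detect the end of the crawl at roughly the same time. A minimal standalone sketch of that check-and-set pattern, assuming a local Redis and the same redis-rb behaviour the gem targets (the key name matches the code above; everything else is illustrative):

    require 'redis'

    redis = Redis.new  # assumes Redis on localhost:6379

    redis.watch("first_to_finish") do
      if !redis.exists("first_to_finish")
        # If another client touches the watched key before EXEC, the
        # transaction is discarded and multi returns nil, so at most one
        # caller ends up setting the flag.
        redis.multi do
          redis.set("first_to_finish", 1)
        end
      else
        redis.unwatch
      end
    end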
data/lib/crawl_job.rb CHANGED
@@ -5,132 +5,58 @@ class CrawlJob
  require "net/https"
  require "uri"
  require "redis"
- require 'namespaced_redis'
-
+
  @queue = :cobweb_crawl_job
-
+
  # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
  def self.perform(content_request)
- # change all hash keys to symbols
- content_request = HashUtil.deep_symbolize_keys(content_request)
- @content_request = content_request
- @crawl = CobwebCrawlHelper.new(content_request)
-
- content_request[:redis_options] = {} unless content_request.has_key? :redis_options
- content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
- content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
 
- @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
- @stats = Stats.new(content_request)
+ # setup the crawl class to manage the crawl of this object
+ @crawl = CobwebModule::Crawl.new(content_request)
 
- @debug = content_request[:debug]
+ # update the counters and then perform the get, returns false if we are outwith limits
+ if @crawl.retrieve
 
- # check we haven't crawled this url before
- unless @redis.sismember "crawled", content_request[:url]
- # if there is no limit or we're still under it lets get the url
- if within_crawl_limits?(content_request[:crawl_limit])
- if @crawl.status != CobwebCrawlHelper::CANCELLED
- content = Cobweb.new(content_request).get(content_request[:url], content_request)
- if content_request[:url] == @redis.get("original_base_url")
- @redis.set("crawled_base_url", content[:base_url])
- end
- if is_permitted_type(content)
- begin
- @redis.incr "inprogress"
- # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
- @redis.srem "queued", content_request[:url]
- @redis.sadd "crawled", content_request[:url]
- @redis.srem "queued", content[:url]
- @redis.sadd "crawled", content[:url]
- # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
- if content_request[:crawl_limit_by_page]
- if content[:mime_type].match("text/html")
- increment_crawl_started_counter
- end
- else
- increment_crawl_started_counter
- end
-
- ## update statistics
- @stats.update_status("Crawling #{content_request[:url]}...")
- @stats.update_statistics(content)
-
- # set the base url if this is the first page
- set_base_url @redis, content, content_request
-
- @cobweb_links = CobwebLinks.new(content_request)
- if within_queue_limits?(content_request[:crawl_limit])
- internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
- #get rid of duplicate links in the same page.
- internal_links.uniq!
- # select the link if its internal
- internal_links.select! { |link| @cobweb_links.internal?(link) }
-
- # reject the link if we've crawled it or queued it
- internal_links.reject! { |link| @redis.sismember("crawled", link) }
- internal_links.reject! { |link| @redis.sismember("queued", link) }
-
- internal_links.each do |link|
- puts link
- puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
- if within_queue_limits?(content_request[:crawl_limit])
- if @crawl.status != CobwebCrawlHelper::CANCELLED
- enqueue_content(content_request, link)
- else
- puts "Cannot enqueue new content as crawl has been cancelled." if content_request[:debug]
- end
- end
- end
- end
-
- # enqueue to processing queue
- send_to_processing_queue(content, content_request)
+ # if the crawled object is an object type we are interested
+ if @crawl.content.permitted_type?
+
+ # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
+ @crawl.process_links do |link|
 
- #if the enqueue counter has been requested update that
- if content_request.has_key?(:enqueue_counter_key)
- enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
- current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
- enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
- end
+ # enqueue the links to resque
+ puts "ENQUEUED LINK: #{link}"
+ enqueue_content(content_request, link)
 
- ensure
- @redis.decr "inprogress"
- #update the queued and crawled lists if we are within the crawl limits.
-
- # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
- # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
- # stewart: i'm looking at the layout of this, think that there is scope for cleaning up the perform method to be more DRY.
- if content_request[:crawl_limit_by_page]
- if content[:mime_type].match("text/html")
- increment_crawl_counter
- end
- else
- increment_crawl_counter
- end
- puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter} In Progress: #{crawl_started_counter-crawl_counter}" if @debug
- end
- else
- puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
+ end
+
+
+ if @crawl.to_be_processed?
+ @crawl.process
+
+ # enqueue to processing queue
+ @crawl.redis.incr("crawl_job_enqueued_count")
+ puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
+ send_to_processing_queue(@crawl.content.to_hash, content_request)
+
+
+ #if the enqueue counter has been requested update that
+ if content_request.has_key?(:enqueue_counter_key)
+ enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
+ current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+ enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
  end
  else
- puts "ignoring #{content_request[:url]} as crawl has been cancelled." if content_request[:debug]
+ ap "@crawl.finished? #{@crawl.finished?}"
+ ap "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
+ ap "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
  end
- else
- puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
  end
-
- else
- @redis.srem "queued", content_request[:url]
- puts "Already crawled #{content_request[:url]}" if content_request[:debug]
  end
-
- decrement_queue_counter
- # if there's nothing left queued or the crawled limit has been reached
- if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
- if queue_counter + crawl_started_counter - crawl_counter == 0
- finished(content_request)
- end
- elsif (queue_counter+crawl_started_counter-crawl_counter)== 0 || crawl_counter >= content_request[:crawl_limit].to_i
+
+ # test queue and crawl sizes to see if we have completed the crawl
+ ap "finished? #{@crawl.finished?}"
+ ap "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
+ if @crawl.finished? && @crawl.first_to_finish?
  finished(content_request)
  end
 
@@ -138,19 +64,12 @@ class CrawlJob
 
  # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
  def self.finished(content_request)
- # finished
- if @crawl.status != CobwebCrawlHelper::FINISHED and @crawl.status != CobwebCrawlHelper::CANCELLED && @redis.get("inprogress").to_i==0
- ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
- @stats.end_crawl(content_request)
-
- additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @redis.get("crawled_base_url")}
- additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
- additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
-
- Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge(additional_stats))
- else
- # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
- end
+ additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @crawl.crawled_base_url}
+ additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
+ additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
+
+ @crawl.redis.incr("crawl_finished_enqueued_count")
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @crawl.statistics.merge(additional_stats))
  end
 
  # Enqueues the content to the processing queue setup in options
@@ -171,34 +90,6 @@ class CrawlJob
 
  private
 
- # Helper method to determine if this content is to be processed or not
- def self.is_permitted_type(content)
- @content_request[:valid_mime_types].each do |mime_type|
- return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
- end
- false
- end
-
- # Returns true if the crawl count is within limits
- def self.within_crawl_limits?(crawl_limit)
- crawl_limit.nil? or crawl_counter < crawl_limit.to_i
- end
-
- # Returns true if the queue count is calculated to be still within limits when complete
- def self.within_queue_limits?(crawl_limit)
- (@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (queue_counter + crawl_counter) < crawl_limit.to_i)
- end
-
- # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
- def self.set_base_url(redis, content, content_request)
- if redis.get("base_url").nil?
- unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
- uri = Addressable::URI.parse(content[:redirect_through].last)
- redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
- end
- redis.set("base_url", content[:url])
- end
- end
 
  # Enqueues content to the crawl_job queue
  def self.enqueue_content(content_request, link)
@@ -206,43 +97,8 @@ class CrawlJob
  new_request[:url] = link
  new_request[:parent] = content_request[:url]
  #to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
- @redis.sadd "queued", link
  Resque.enqueue(CrawlJob, new_request)
- increment_queue_counter
  end
 
- # Increments the queue counter and refreshes crawl counters
- def self.increment_queue_counter
- @redis.incr "queue-counter"
- end
- # Increments the crawl counter and refreshes crawl counters
- def self.increment_crawl_counter
- @redis.incr "crawl-counter"
- end
- def self.increment_crawl_started_counter
- @redis.incr "crawl-started-counter"
- end
- # Decrements the queue counter and refreshes crawl counters
- def self.decrement_queue_counter
- @redis.decr "queue-counter"
- end
-
- def self.crawl_counter
- @redis.get("crawl-counter").to_i
- end
- def self.crawl_started_counter
- @redis.get("crawl-started-counter").to_i
- end
- def self.queue_counter
- @redis.get("queue-counter").to_i
- end
-
- def self.print_counters
- puts counters
- end
-
- def self.counters
- "crawl_counter: #{crawl_counter} crawl_started_counter: #{crawl_started_counter} queue_counter: #{queue_counter}"
- end
 
  end
@@ -0,0 +1,30 @@
+ module CobwebModule
+ class CrawlObject
+
+ def initialize(content_hash, options={})
+ @content = HashUtil.deep_symbolize_keys(content_hash)
+ @options = options
+ end
+
+
+ # Helper method to determine if this content is to be processed or not
+ def permitted_type?
+ @options[:valid_mime_types].each do |valid_mime_type|
+ return true if @content[:mime_type].match(Cobweb.escape_pattern_for_regex(valid_mime_type))
+ end
+ false
+ end
+
+ def method_missing(m)
+ if @content.keys.include? m.to_sym
+ @content[m.to_sym]
+ else
+ super
+ end
+ end
+
+ def to_hash
+ @content
+ end
+ end
+ end
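The new CrawlObject above wraps the crawled content hash and exposes its fields as reader methods via method_missing. A rough usage sketch, assuming the cobweb gem is loaded; the hash values are made-up examples, not output from the gem:

    # hypothetical content hash, shaped like the one CrawlJob passes around
    content = {
      "url"       => "http://example.com/",
      "mime_type" => "text/html",
      "body"      => "<html>...</html>"
    }

    crawl_object = CobwebModule::CrawlObject.new(content, :valid_mime_types => ["text/*"])

    crawl_object.url              # => "http://example.com/"  (served via method_missing)
    crawl_object.mime_type        # => "text/html"
    crawl_object.permitted_type?  # => true, "text/html" matches "text/*"
    crawl_object.to_hash          # => the content hash with symbolized keys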
data/lib/hash_util.rb CHANGED
@@ -3,6 +3,7 @@ class HashUtil
 
  # Returns a hash with the keys converted to symbols
  def self.deep_symbolize_keys(hash)
+ raise "Cannot symbolize keys for a nil object" if hash.nil?
  hash.keys.each do |key|
  value = hash[key]
  hash.delete(key)
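As a quick illustration of what deep_symbolize_keys is used for throughout the gem (a sketch, assuming the cobweb gem is loaded; the sample hash is made up):

    request = { "crawl_id" => "abc123", "debug" => false }

    HashUtil.deep_symbolize_keys(request)
    # => roughly { :crawl_id => "abc123", :debug => false }

    HashUtil.deep_symbolize_keys(nil)
    # => raises "Cannot symbolize keys for a nil object" as of this release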
data/lib/server.rb CHANGED
@@ -16,7 +16,7 @@ class Server < Sinatra::Base
  @crawls = []
  @full_redis.smembers("cobweb_crawls").each do |crawl_id|
  version = cobweb_version(crawl_id)
- redis = NamespacedRedis.new(redis_options, "cobweb-#{version}-#{crawl_id}")
+ redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => Redis.new(redis_options))
  stats = HashUtil.deep_symbolize_keys({
  :cobweb_version => version,
  :crawl_details => redis.hgetall("crawl_details"),
@@ -33,7 +33,7 @@ class Server < Sinatra::Base
  get '/statistics/:crawl_id' do
 
  version = cobweb_version(params[:crawl_id])
- redis = NamespacedRedis.new(redis_options, "cobweb-#{version}-#{params[:crawl_id]}")
+ redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis => Redis.new(redis_options))
 
  @statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
  if @statistics[:status_counts].nil?
data/lib/stats.rb CHANGED
@@ -8,7 +8,7 @@ class Stats
  def initialize(options)
  options[:redis_options] = {} unless options.has_key? :redis_options
  @full_redis = Redis.new(options[:redis_options])
- @redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => @full_redis)
  end
 
  # Sets up the crawl in statistics
@@ -9,7 +9,7 @@ describe Cobweb, :local_only => true do
  # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
  puts "Starting Workers... Please Wait..."
  `mkdir log`
- io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=1 QUEUE=cobweb_crawl_job > log/output.log &")
+ io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
  puts "Workers Started."
 
  end
@@ -17,6 +17,7 @@ describe Cobweb, :local_only => true do
  before(:each) do
  @base_url = "http://localhost:3532/"
  @base_page_count = 77
+
  clear_queues
  end
 
@@ -29,6 +30,7 @@ describe Cobweb, :local_only => true do
  :debug => false,
  :cache => nil
  }
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
  @cobweb = Cobweb.new @request
  end
  it "should not crawl anything if nothing has started" do
@@ -37,7 +39,7 @@ describe Cobweb, :local_only => true do
  crawl_obj.destroy
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should == 0
+ @redis.get("crawl_job_enqueued_count").to_i.should == 0
  end
 
  it "should not complete the crawl when cancelled" do
@@ -47,8 +49,8 @@ describe Cobweb, :local_only => true do
  crawl_obj.destroy
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should > 0
- Resque.size("cobweb_process_job").should_not == @base_page_count
+ @redis.get("crawl_job_enqueued_count").to_i.should > 0
+ @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
  end
 
  end
@@ -61,22 +63,24 @@ describe Cobweb, :local_only => true do
  :debug => false,
  :cache => nil
  }
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
+
  @cobweb = Cobweb.new @request
  end
 
  it "should crawl entire site" do
- ap Resque.size("cobweb_process_job")
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- ap @stat.get_statistics
- Resque.size("cobweb_process_job").should == @base_page_count
+ @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 1
  end
  it "detect crawl finished once" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_finished_job").should == 1
+ @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 1
  end
  end
  describe "with limited mime_types" do
@@ -87,6 +91,7 @@ describe Cobweb, :local_only => true do
  :cache => nil,
  :valid_mime_types => ["text/html"]
  }
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
  @cobweb = Cobweb.new @request
  end
 
@@ -94,7 +99,7 @@ describe Cobweb, :local_only => true do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should == 8
+ @redis.get("crawl_job_enqueued_count").to_i.should == 8
 
  mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
  mime_types.count.should == 8
@@ -110,6 +115,7 @@ describe Cobweb, :local_only => true do
  :quiet => true,
  :cache => nil
  }
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
  end
 
  describe "limit to 1" do
@@ -122,19 +128,19 @@ describe Cobweb, :local_only => true do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should_not == @base_page_count
+ @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
  end
  it "should only crawl 1 page" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should == 1
+ @redis.get("crawl_job_enqueued_count").to_i.should == 1
  end
  it "should notify of crawl finished once" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_finished_job").should == 1
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 1
  end
  end
 
@@ -145,6 +151,7 @@ describe Cobweb, :local_only => true do
  @cobweb = Cobweb.new @request
  end
 
+ # the following describes when we want all the assets of a page, and the page itself, but we only want 5 pages
  it "should only use html pages towards the crawl limit" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -165,19 +172,19 @@ describe Cobweb, :local_only => true do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should_not == @base_page_count
+ @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
  end
  it "should notify of crawl finished once" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_finished_job").should == 1
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 1
  end
  it "should only crawl 10 objects" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should == 10
+ @redis.get("crawl_job_enqueued_count").to_i.should == 10
  end
  end
 
@@ -191,23 +198,24 @@ describe Cobweb, :local_only => true do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should == @base_page_count
+ @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
  end
  it "should notify of crawl finished once" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_finished_job").should == 1
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 1
  end
  it "should not crawl 100 pages" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
- Resque.size("cobweb_process_job").should_not == 100
+ @redis.get("crawl_job_enqueued_count").to_i.should_not == 100
  end
  end
  end
 
+
  after(:all) do
 
  @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
@@ -220,26 +228,43 @@ describe Cobweb, :local_only => true do
  end
 
  def wait_for_crawl_finished(crawl_id, timeout=20)
- counter = 0
+ @counter = 0
  start_time = Time.now
  while(running?(crawl_id) && Time.now < start_time + timeout) do
- sleep 0.5
- end
- if Time.now > start_time + timeout
- raise "End of crawl not detected"
- end
+ sleep 0.5
  end
-
- def running?(crawl_id)
- @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
+ if Time.now > start_time + timeout
+ raise "End of crawl not detected"
  end
+ end
 
- def clear_queues
- Resque.queues.each do |queue|
- Resque.remove_queue(queue)
+ def running?(crawl_id)
+ status = @stat.get_status
+ result = true
+ if status == CobwebCrawlHelper::STARTING
+ result = true
+ else
+ if status == @last_stat
+ if @counter > 5
+ raise "Static status: #{status}"
+ else
+ @counter += 1
+ end
+ puts "Static Status.. #{6-@counter}"
+ else
+ result = status != CobwebCrawlHelper::FINISHED && status != CobwebCrawlHelper::CANCELLED
  end
+ end
+ @last_stat = @stat.get_status
+ result
+ end
 
- Resque.size("cobweb_process_job").should == 0
- Resque.size("cobweb_finished_job").should == 0
- Resque.peek("cobweb_process_job", 0, 200).should be_empty
+ def clear_queues
+ Resque.queues.each do |queue|
+ Resque.remove_queue(queue)
  end
+
+ Resque.size("cobweb_process_job").should == 0
+ Resque.size("cobweb_finished_job").should == 0
+ Resque.peek("cobweb_process_job", 0, 200).should be_empty
+ end
@@ -76,11 +76,9 @@ describe ContentLinkParser do
  links.length.should == 3
  end
  end
- describe "returning unknown link type" do
+ describe "returning unknown link type should raise an error" do
  it "should return an empty array" do
- links = @content_parser.asdfasdfsadf
- links.should_not be_nil
- links.should be_an_instance_of Array
+ lambda {@content_parser.asdfasdfsadf}.should raise_error
  end
  end
  end
@@ -122,7 +120,7 @@ describe ContentLinkParser do
  describe "ignoring default tags" do
  it "should not return any links" do
  parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
- parser.links.should be_empty
+ lambda{parser.links}.should raise_error(NoMethodError)
  end
  end
  end
@@ -0,0 +1,101 @@
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+ describe Cobweb, :local_only => true do
+
+ before(:all) do
+ #store all existing resque process ids so we don't kill them afterwards
+ @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+
+ # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
+ puts "Starting Workers... Please Wait..."
+ `mkdir log`
+ io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
+ puts "Workers Started."
+
+ end
+
+ before(:each) do
+ @base_url = "http://localhost:3532/"
+ @base_page_count = 77
+ clear_queues
+ end
+
+ describe "with a crawl limit" do
+ before(:each) do
+ @request = {
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
+ :quiet => true,
+ :cache => nil,
+ :use_encoding_safe_process_job => true,
+ :crawl_limit_by_page => true
+ }
+ end
+
+ describe "on ancestry.com.au" do
+ describe "limited to 100" do
+ before(:each) do
+ @request[:crawl_limit] = 100
+ @request[:valid_mime_types] = ["text/html"]
+ @cobweb = Cobweb.new @request
+ end
+
+ it "should crawl 100 pages" do
+ crawl = @cobweb.start("http://www.ancestry.com.au/")
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id], 180
+ puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
+ end
+ end
+
+ describe "limited to 999" do
+ before(:each) do
+ @request[:crawl_limit] = 999
+ @cobweb = Cobweb.new @request
+ end
+
+ it "should crawl 999 pages" do
+ crawl = @cobweb.start("http://www.ancestry.com.au/")
+ @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+ wait_for_crawl_finished crawl[:crawl_id], 720
+ puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
+ end
+ end
+ __END__
+
+ end
+
+ after(:all) do
+
+ @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+ command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
+ IO.popen(command)
+
+ clear_queues
+ end
+
+ end
+
+ def wait_for_crawl_finished(crawl_id, timeout=20)
+ counter = 0
+ start_time = Time.now
+ while(running?(crawl_id) && Time.now < start_time + timeout) do
+ sleep 0.5
+ end
+ if Time.now > start_time + timeout
+ raise "End of crawl not detected"
+ end
+ end
+
+ def running?(crawl_id)
+ @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
+ end
+
+ def clear_queues
+ Resque.queues.each do |queue|
+ Resque.remove_queue(queue)
+ end
+
+ Resque.size("cobweb_process_job").should == 0
+ Resque.size("cobweb_finished_job").should == 0
+ Resque.peek("cobweb_process_job", 0, 200).should be_empty
+ end
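The spec changes above replace Resque queue-size assertions with counters kept in the crawl's Redis namespace. A minimal sketch of reading those counters outside the specs, assuming the cobweb gem is loaded and a local Redis (the crawl id shown is a placeholder):

    require 'redis'
    require 'redis-namespace'

    crawl_id = "abc123"  # placeholder crawl id returned by Cobweb#start
    redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{crawl_id}", :redis => Redis.new)

    redis.get("crawl_job_enqueued_count").to_i       # pages handed to the processing queue
    redis.get("crawl_finished_enqueued_count").to_i  # should end up as 1 per crawl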
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: cobweb
  version: !ruby/object:Gem::Version
- version: 0.0.73
+ version: 0.0.74
  prerelease:
  platform: ruby
  authors:
@@ -9,11 +9,11 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-09-20 00:00:00.000000000 Z
+ date: 2012-10-15 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: resque
- requirement: &70226914018080 !ruby/object:Gem::Requirement
+ requirement: &70347429190520 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70226914018080
+ version_requirements: *70347429190520
  - !ruby/object:Gem::Dependency
  name: redis
- requirement: &70226914017080 !ruby/object:Gem::Requirement
+ requirement: &70347429190020 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70226914017080
+ version_requirements: *70347429190020
  - !ruby/object:Gem::Dependency
  name: nokogiri
- requirement: &70226914016400 !ruby/object:Gem::Requirement
+ requirement: &70347429189540 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70226914016400
+ version_requirements: *70347429189540
  - !ruby/object:Gem::Dependency
  name: addressable
- requirement: &70226914015220 !ruby/object:Gem::Requirement
+ requirement: &70347429188880 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70226914015220
+ version_requirements: *70347429188880
  - !ruby/object:Gem::Dependency
  name: rspec
- requirement: &70226914014640 !ruby/object:Gem::Requirement
+ requirement: &70347429187340 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70226914014640
+ version_requirements: *70347429187340
  - !ruby/object:Gem::Dependency
  name: awesome_print
- requirement: &70226914013860 !ruby/object:Gem::Requirement
+ requirement: &70347429185820 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70226914013860
+ version_requirements: *70347429185820
  - !ruby/object:Gem::Dependency
  name: sinatra
- requirement: &70226914013140 !ruby/object:Gem::Requirement
+ requirement: &70347429185040 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70226914013140
+ version_requirements: *70347429185040
  - !ruby/object:Gem::Dependency
  name: thin
- requirement: &70226914012280 !ruby/object:Gem::Requirement
+ requirement: &70347429184340 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70226914012280
+ version_requirements: *70347429184340
  - !ruby/object:Gem::Dependency
  name: haml
- requirement: &70226914011460 !ruby/object:Gem::Requirement
+ requirement: &70347429183120 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70226914011460
+ version_requirements: *70347429183120
  - !ruby/object:Gem::Dependency
  name: namespaced_redis
- requirement: &70226914010720 !ruby/object:Gem::Requirement
+ requirement: &70347429181840 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
  version: 1.0.2
  type: :runtime
  prerelease: false
- version_requirements: *70226914010720
+ version_requirements: *70347429181840
  - !ruby/object:Gem::Dependency
  name: json
- requirement: &70226914010260 !ruby/object:Gem::Requirement
+ requirement: &70347429180860 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70226914010260
+ version_requirements: *70347429180860
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
  crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -142,13 +142,14 @@ extensions: []
  extra_rdoc_files:
  - README.textile
  files:
+ - spec/cobweb/cobweb_crawl_helper_spec.rb
  - spec/cobweb/cobweb_crawler_spec.rb
  - spec/cobweb/cobweb_job_spec.rb
  - spec/cobweb/cobweb_links_spec.rb
  - spec/cobweb/cobweb_spec.rb
  - spec/cobweb/content_link_parser_spec.rb
- - spec/cobweb/crawl_spec.rb
  - spec/cobweb/robots_spec.rb
+ - spec/cobweb/site_test_spec.rb.tmp
  - spec/samples/robots.txt
  - spec/samples/sample_html_links.html
  - spec/samples/sample_server.rb
@@ -328,7 +329,9 @@ files:
  - lib/cobweb_process_job.rb
  - lib/cobweb_version.rb
  - lib/content_link_parser.rb
+ - lib/crawl.rb
  - lib/crawl_job.rb
+ - lib/crawl_object.rb
  - lib/encoding_safe_process_job.rb
  - lib/hash_util.rb
  - lib/redirect_error.rb