cobweb 0.0.73 → 0.0.74
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +1 -1
- data/lib/cobweb.rb +6 -6
- data/lib/cobweb_crawl_helper.rb +1 -1
- data/lib/cobweb_crawler.rb +2 -2
- data/lib/cobweb_version.rb +1 -1
- data/lib/content_link_parser.rb +2 -3
- data/lib/crawl.rb +263 -0
- data/lib/crawl_job.rb +45 -189
- data/lib/crawl_object.rb +30 -0
- data/lib/hash_util.rb +1 -0
- data/lib/server.rb +2 -2
- data/lib/stats.rb +1 -1
- data/spec/cobweb/{crawl_spec.rb → cobweb_crawl_helper_spec.rb} +0 -0
- data/spec/cobweb/cobweb_job_spec.rb +58 -33
- data/spec/cobweb/content_link_parser_spec.rb +3 -5
- data/spec/cobweb/site_test_spec.rb.tmp +101 -0
- metadata +28 -25
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -4,7 +4,6 @@ require 'resque'
 require "addressable/uri"
 require 'digest/sha1'
 require 'base64'
-require 'namespaced_redis'
 
 Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
   require file
@@ -46,6 +45,7 @@ class Cobweb
     default_text_mime_types_to ["text/*", "application/xhtml+xml"]
     default_obey_robots_to false
     default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
+    default_valid_mime_types_to ["*/*"]
 
   end
 
@@ -65,7 +65,7 @@ class Cobweb
     end
 
     request.merge!(@options)
-    @redis =
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => Redis.new(request[:redis_options]))
     @redis.set("original_base_url", base_url)
     @redis.hset "statistics", "queued_at", DateTime.now
     @redis.set("crawl-counter", 0)
@@ -110,9 +110,9 @@ class Cobweb
 
     # connect to redis
     if options.has_key? :crawl_id
-      redis =
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
     else
-      redis =
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
     end
 
     content = {:base_url => url}
@@ -269,9 +269,9 @@ class Cobweb
 
     # connect to redis
     if options.has_key? :crawl_id
-      redis =
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
     else
-      redis =
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
     end
 
     content = {:base_url => url}
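The main change in cobweb.rb is the move from the namespaced_redis wrapper to Redis::Namespace from the redis-namespace gem, keyed per crawl. A minimal sketch of that pattern (illustrative only, not code from the gem; the crawl id and namespace string are made up) — every key is transparently prefixed, so concurrent crawls sharing one Redis server cannot collide:

# Sketch: the redis-namespace pattern used throughout this release.
# Assumes a local Redis and the redis / redis-namespace gems.
require 'redis'
require 'redis-namespace'

crawl_id = "abc123"                                              # illustrative value
redis = Redis::Namespace.new("cobweb-0.0.74-#{crawl_id}", :redis => Redis.new)

redis.set("crawl-counter", 0)    # stored under "cobweb-0.0.74-abc123:crawl-counter"
redis.incr("crawl-counter")
puts redis.get("crawl-counter")  # => "1"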
data/lib/cobweb_crawl_helper.rb
CHANGED
@@ -15,7 +15,7 @@ class CobwebCrawlHelper
     @stats = Stats.new(data)
   end
 
-  def destroy(options)
+  def destroy(options={})
 
     options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)
     options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
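The only change here is a default value for the options argument, so destroy can now be called with no arguments. Illustrative only (the crawl data shown is made up):

crawl = CobwebCrawlHelper.new(:crawl_id => "abc123")

crawl.destroy                                      # now valid; previously raised ArgumentError
crawl.destroy(:queue_name => "cobweb_crawl_job")   # passing options explicitly still works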
data/lib/cobweb_crawler.rb
CHANGED
@@ -1,7 +1,7 @@
 require 'digest/md5'
 require 'date'
 require 'ap'
-
+require 'redis-namespace'
 
 # CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
 class CobwebCrawler
@@ -20,7 +20,7 @@ class CobwebCrawler
       @options[:crawl_id] = @crawl_id
     end
 
-    @redis =
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
     @options[:internal_urls] = [] if @options[:internal_urls].nil?
     @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
     @debug = @options[:debug]
data/lib/cobweb_version.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
@@ -6,7 +6,7 @@ class ContentLinkParser
 
   # Parses the content and absolutizes the urls based on url. Options can be setup to determine the links that are extracted.
   def initialize(url, content, options = {})
-    @options = options
+    @options = {}.merge(options)
     @url = url
     @doc = Nokogiri::HTML(content)
 
@@ -61,8 +61,7 @@ class ContentLinkParser
       end
       links.uniq
     else
-
-      []
+      super
     end
   end
 
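The effect of swapping the empty-array fallback for super: asking the parser for an unrecognised link type no longer silently returns [], it falls through and raises NoMethodError, which is what the updated content_link_parser_spec.rb later in this diff asserts. A minimal sketch of that dispatch style (illustrative only; LinkDispatcher and its data are made up, not gem code):

# Hypothetical stand-in for a method_missing-based link dispatcher.
class LinkDispatcher
  LINKS = { :links => ["http://example.com/"] }  # illustrative data

  def method_missing(name, *args)
    if LINKS.key?(name)
      LINKS[name]
    else
      super  # unknown link types now raise NoMethodError instead of returning []
    end
  end

  def respond_to_missing?(name, include_private = false)
    LINKS.key?(name) || super
  end
end

LinkDispatcher.new.links          # => ["http://example.com/"]
LinkDispatcher.new.unknown_links  # raises NoMethodError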
data/lib/crawl.rb
ADDED
@@ -0,0 +1,263 @@
+module CobwebModule
+  class Crawl
+
+    def initialize(options={})
+      @options = HashUtil.deep_symbolize_keys(options)
+
+      setup_defaults
+      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", Redis.new(@options[:redis_options]))
+      @stats = Stats.new(@options)
+      @debug = @options[:debug]
+      @first_to_finish = false
+
+    end
+
+    # Returns true if the url requested is already in the crawled queue
+    def already_crawled?(link=@options[:url])
+      @redis.sismember "crawled", link
+    end
+
+    def already_queued?(link)
+      @redis.sismember "queued", link
+    end
+
+    # Returns true if the crawl count is within limits
+    def within_crawl_limits?
+      @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
+    end
+
+    # Returns true if the processed count is within limits
+    def within_process_limits?
+      @options[:crawl_limit].nil? || process_counter < @options[:crawl_limit].to_i
+    end
+
+    # Returns true if the queue count is calculated to be still within limits when complete
+    def within_queue_limits?
+
+      # if we are limiting by page we can't limit the queue size as we don't know the mime type until retrieved
+      if @options[:crawl_limit_by_page]
+        return true
+
+      # if a crawl limit is set, limit queue size to crawled + queue
+      elsif @options[:crawl_limit].to_i > 0
+        (queue_counter + crawl_counter) < @options[:crawl_limit].to_i
+
+      # no crawl limit set so always within queue limit
+      else
+        true
+      end
+    end
+
+    def retrieve
+      unless already_crawled?
+        if within_crawl_limits?
+          @stats.update_status("Retrieving #{@options[:url]}...")
+          @content = Cobweb.new(@options).get(@options[:url], @options)
+          if @options[:url] == @redis.get("original_base_url")
+            @redis.set("crawled_base_url", @content[:base_url])
+          end
+          update_queues
+
+          if content.permitted_type?
+            ## update statistics
+
+            @stats.update_statistics(@content)
+            return true
+          end
+        else
+          decrement_queue_counter
+        end
+      else
+        decrement_queue_counter
+      end
+      false
+    end
+
+    def process_links &block
+
+      # set the base url if this is the first page
+      set_base_url @redis
+
+      @cobweb_links = CobwebLinks.new(@options)
+      if within_queue_limits?
+        internal_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
+        #get rid of duplicate links in the same page.
+        internal_links.uniq!
+        # select the link if its internal
+        internal_links.select! { |link| @cobweb_links.internal?(link) }
+
+        # reject the link if we've crawled it or queued it
+        internal_links.reject! { |link| @redis.sismember("crawled", link) }
+        internal_links.reject! { |link| @redis.sismember("queued", link) }
+
+        internal_links.each do |link|
+          if within_queue_limits? && !already_queued?(link) && !already_crawled?(link)
+            if status != CobwebCrawlHelper::CANCELLED
+              yield link if block_given?
+              unless link.nil?
+                @redis.sadd "queued", link
+                increment_queue_counter
+              end
+            else
+              puts "Cannot enqueue new content as crawl has been cancelled." if @options[:debug]
+            end
+          end
+        end
+      end
+    end
+
+    def content
+      raise "Content is not available" if @content.nil?
+      CobwebModule::CrawlObject.new(@content, @options)
+    end
+
+    def update_queues
+      @redis.multi do
+        #@redis.incr "inprogress"
+        # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
+        @redis.srem "queued", @options[:url]
+        @redis.sadd "crawled", @options[:url]
+        if content.url != @options[:url]
+          @redis.srem "queued", content.url
+          @redis.sadd "crawled", content.url
+        end
+        # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
+        if @options[:crawl_limit_by_page]
+          ap "#{content.mime_type} - #{content.url}"
+          if content.mime_type.match("text/html")
+            increment_crawl_counter
+          end
+        else
+          increment_crawl_counter
+        end
+        decrement_queue_counter
+      end
+    end
+
+    def to_be_processed?
+      !finished? || first_to_finish? || within_process_limits?
+    end
+
+    def process
+      if @options[:crawl_limit_by_page]
+        if content.mime_type.match("text/html")
+          increment_process_counter
+        end
+      else
+        increment_process_counter
+      end
+    end
+
+    def finished?
+      print_counters
+      # if there's nothing left queued or the crawled limit has been reached
+      if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
+        if queue_counter.to_i == 0
+          finished
+          return true
+        end
+      elsif (queue_counter.to_i) == 0 || crawl_counter.to_i >= @options[:crawl_limit].to_i
+        finished
+        return true
+      end
+      false
+    end
+
+    def finished
+      set_first_to_finish if !@redis.exists("first_to_finish")
+      ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if @options[:debug]
+      @stats.end_crawl(@options)
+    end
+
+    def set_first_to_finish
+      @redis.watch("first_to_finish") do
+        if !@redis.exists("first_to_finish")
+          @redis.multi do
+            puts "set first to finish"
+            @first_to_finish = true
+            @redis.set("first_to_finish", 1)
+          end
+        else
+          @redis.unwatch
+        end
+      end
+    end
+
+
+    def first_to_finish?
+      @first_to_finish
+    end
+
+    def crawled_base_url
+      @redis.get("crawled_base_url")
+    end
+
+    def statistics
+      @stats.get_statistics
+    end
+
+    def redis
+      @redis
+    end
+
+    private
+    def setup_defaults
+      @options[:redis_options] = {} unless @options.has_key? :redis_options
+      @options[:crawl_limit_by_page] = false unless @options.has_key? :crawl_limit_by_page
+      @options[:valid_mime_types] = ["*/*"] unless @options.has_key? :valid_mime_types
+    end
+
+    # Increments the queue counter and refreshes crawl counters
+    def increment_queue_counter
+      @redis.incr "queue-counter"
+    end
+    # Increments the crawl counter and refreshes crawl counters
+    def increment_crawl_counter
+      @redis.incr "crawl-counter"
+    end
+    # Increments the process counter
+    def increment_process_counter
+      @redis.incr "process-counter"
+    end
+    # Decrements the queue counter and refreshes crawl counters
+    def decrement_queue_counter
+      @redis.decr "queue-counter"
+    end
+
+    def crawl_counter
+      @redis.get("crawl-counter").to_i
+    end
+    def queue_counter
+      @redis.get("queue-counter").to_i
+    end
+    def process_counter
+      @redis.get("process-counter").to_i
+    end
+
+    def status
+      @stats.get_status
+    end
+
+    def print_counters
+      puts counters
+    end
+
+    def counters
+      "crawl_counter: #{crawl_counter} queue_counter: #{queue_counter} process_counter: #{process_counter} crawl_limit: #{@options[:crawl_limit]}"
+    end
+
+    # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
+    def set_base_url(redis)
+      if redis.get("base_url").nil?
+        unless !defined?(content.redirect_through) || content.redirect_through.empty? || !@options[:first_page_redirect_internal]
+          uri = Addressable::URI.parse(content.redirect_through.last)
+          redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
+        end
+        redis.set("base_url", content.url)
+      end
+    end
+
+
+
+  end
+end
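Crawl#set_first_to_finish above relies on Redis WATCH/MULTI so that, when several Resque workers detect the end of the crawl at the same time, only one of them claims the first_to_finish flag and therefore enqueues the finished job exactly once. A minimal sketch of that check-and-set pattern, assuming the redis gem of this era and a local Redis server (not code from the gem):

require 'redis'

redis = Redis.new

redis.watch("first_to_finish") do
  if redis.exists("first_to_finish")
    redis.unwatch                       # another worker already claimed the flag
  else
    result = redis.multi do
      redis.set("first_to_finish", 1)
    end
    # EXEC returns nil when the watched key changed between WATCH and MULTI,
    # so a nil result means a concurrent worker won the race.
    puts "claimed first_to_finish" unless result.nil?
  end
end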
data/lib/crawl_job.rb
CHANGED
@@ -5,132 +5,58 @@ class CrawlJob
 require "net/https"
 require "uri"
 require "redis"
-
-
+
 @queue = :cobweb_crawl_job
-
+
 # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
 def self.perform(content_request)
-    # change all hash keys to symbols
-    content_request = HashUtil.deep_symbolize_keys(content_request)
-    @content_request = content_request
-    @crawl = CobwebCrawlHelper.new(content_request)
-
-    content_request[:redis_options] = {} unless content_request.has_key? :redis_options
-    content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
-    content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
 
-
-    @
+    # setup the crawl class to manage the crawl of this object
+    @crawl = CobwebModule::Crawl.new(content_request)
 
-
+    # update the counters and then perform the get, returns false if we are outwith limits
+    if @crawl.retrieve
 
-
-
-
-
-
-          content = Cobweb.new(content_request).get(content_request[:url], content_request)
-          if content_request[:url] == @redis.get("original_base_url")
-            @redis.set("crawled_base_url", content[:base_url])
-          end
-          if is_permitted_type(content)
-            begin
-              @redis.incr "inprogress"
-              # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
-              @redis.srem "queued", content_request[:url]
-              @redis.sadd "crawled", content_request[:url]
-              @redis.srem "queued", content[:url]
-              @redis.sadd "crawled", content[:url]
-              # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
-              if content_request[:crawl_limit_by_page]
-                if content[:mime_type].match("text/html")
-                  increment_crawl_started_counter
-                end
-              else
-                increment_crawl_started_counter
-              end
-
-              ## update statistics
-              @stats.update_status("Crawling #{content_request[:url]}...")
-              @stats.update_statistics(content)
-
-              # set the base url if this is the first page
-              set_base_url @redis, content, content_request
-
-              @cobweb_links = CobwebLinks.new(content_request)
-              if within_queue_limits?(content_request[:crawl_limit])
-                internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
-                #get rid of duplicate links in the same page.
-                internal_links.uniq!
-                # select the link if its internal
-                internal_links.select! { |link| @cobweb_links.internal?(link) }
-
-                # reject the link if we've crawled it or queued it
-                internal_links.reject! { |link| @redis.sismember("crawled", link) }
-                internal_links.reject! { |link| @redis.sismember("queued", link) }
-
-                internal_links.each do |link|
-                  puts link
-                  puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
-                  if within_queue_limits?(content_request[:crawl_limit])
-                    if @crawl.status != CobwebCrawlHelper::CANCELLED
-                      enqueue_content(content_request, link)
-                    else
-                      puts "Cannot enqueue new content as crawl has been cancelled." if content_request[:debug]
-                    end
-                  end
-                end
-              end
-
-              # enqueue to processing queue
-              send_to_processing_queue(content, content_request)
+      # if the crawled object is an object type we are interested
+      if @crawl.content.permitted_type?
+
+        # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
+        @crawl.process_links do |link|
 
-
-
-
-            current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
-            enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
-          end
+          # enqueue the links to resque
+          puts "ENQUEUED LINK: #{link}"
+          enqueue_content(content_request, link)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
+        end
+
+
+        if @crawl.to_be_processed?
+          @crawl.process
+
+          # enqueue to processing queue
+          @crawl.redis.incr("crawl_job_enqueued_count")
+          puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
+          send_to_processing_queue(@crawl.content.to_hash, content_request)
+
+
+          #if the enqueue counter has been requested update that
+          if content_request.has_key?(:enqueue_counter_key)
+            enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
+            current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+            enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
           end
         else
-
+          ap "@crawl.finished? #{@crawl.finished?}"
+          ap "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
+          ap "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
         end
-        else
-          puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
       end
-
-      else
-        @redis.srem "queued", content_request[:url]
-        puts "Already crawled #{content_request[:url]}" if content_request[:debug]
       end
-
-
-
-
-
-          finished(content_request)
-        end
-      elsif (queue_counter+crawl_started_counter-crawl_counter)== 0 || crawl_counter >= content_request[:crawl_limit].to_i
+
+      # test queue and crawl sizes to see if we have completed the crawl
+      ap "finished? #{@crawl.finished?}"
+      ap "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
+      if @crawl.finished? && @crawl.first_to_finish?
         finished(content_request)
       end
 
@@ -138,19 +64,12 @@ class CrawlJob
 
   # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
   def self.finished(content_request)
-
-
-
-
-
-
-    additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
-    additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
-
-      Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge(additional_stats))
-    else
-      # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
-    end
+    additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @crawl.crawled_base_url}
+    additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
+    additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
+
+    @crawl.redis.incr("crawl_finished_enqueued_count")
+    Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @crawl.statistics.merge(additional_stats))
   end
 
   # Enqueues the content to the processing queue setup in options
@@ -171,34 +90,6 @@ class CrawlJob
 
   private
 
-  # Helper method to determine if this content is to be processed or not
-  def self.is_permitted_type(content)
-    @content_request[:valid_mime_types].each do |mime_type|
-      return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
-    end
-    false
-  end
-
-  # Returns true if the crawl count is within limits
-  def self.within_crawl_limits?(crawl_limit)
-    crawl_limit.nil? or crawl_counter < crawl_limit.to_i
-  end
-
-  # Returns true if the queue count is calculated to be still within limits when complete
-  def self.within_queue_limits?(crawl_limit)
-    (@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (queue_counter + crawl_counter) < crawl_limit.to_i)
-  end
-
-  # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
-  def self.set_base_url(redis, content, content_request)
-    if redis.get("base_url").nil?
-      unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
-        uri = Addressable::URI.parse(content[:redirect_through].last)
-        redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
-      end
-      redis.set("base_url", content[:url])
-    end
-  end
 
   # Enqueues content to the crawl_job queue
   def self.enqueue_content(content_request, link)
@@ -206,43 +97,8 @@ class CrawlJob
     new_request[:url] = link
     new_request[:parent] = content_request[:url]
     #to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
-    @redis.sadd "queued", link
     Resque.enqueue(CrawlJob, new_request)
-    increment_queue_counter
   end
 
-  # Increments the queue counter and refreshes crawl counters
-  def self.increment_queue_counter
-    @redis.incr "queue-counter"
-  end
-  # Increments the crawl counter and refreshes crawl counters
-  def self.increment_crawl_counter
-    @redis.incr "crawl-counter"
-  end
-  def self.increment_crawl_started_counter
-    @redis.incr "crawl-started-counter"
-  end
-  # Decrements the queue counter and refreshes crawl counters
-  def self.decrement_queue_counter
-    @redis.decr "queue-counter"
-  end
-
-  def self.crawl_counter
-    @redis.get("crawl-counter").to_i
-  end
-  def self.crawl_started_counter
-    @redis.get("crawl-started-counter").to_i
-  end
-  def self.queue_counter
-    @redis.get("queue-counter").to_i
-  end
-
-  def self.print_counters
-    puts counters
-  end
-
-  def self.counters
-    "crawl_counter: #{crawl_counter} crawl_started_counter: #{crawl_started_counter} queue_counter: #{queue_counter}"
-  end
 
 end
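The reworked job also maintains two plain counters, crawl_job_enqueued_count and crawl_finished_enqueued_count, which the updated specs later in this diff read back to verify crawl size and single completion. The following sketch of driving a clustered crawl and inspecting those counters is based on the usage in cobweb_job_spec.rb; it assumes running Resque workers on the cobweb_crawl_job queue, a local Redis, and the spec's sample site, and the values shown are illustrative:

require 'cobweb'

request = {
  :crawl_id    => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
  :crawl_limit => 10,
  :cache       => nil
}

cobweb = Cobweb.new(request)
crawl  = cobweb.start("http://localhost:3532/")   # sample site used by the specs

# Same namespace the job writes its counters into.
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{crawl[:crawl_id]}", :redis => Redis.new)
puts redis.get("crawl_job_enqueued_count")        # pages handed to the processing queue so far
puts redis.get("crawl_finished_enqueued_count")   # "1" once the finished job has been enqueued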
data/lib/crawl_object.rb
ADDED
@@ -0,0 +1,30 @@
+module CobwebModule
+  class CrawlObject
+
+    def initialize(content_hash, options={})
+      @content = HashUtil.deep_symbolize_keys(content_hash)
+      @options = options
+    end
+
+
+    # Helper method to determine if this content is to be processed or not
+    def permitted_type?
+      @options[:valid_mime_types].each do |valid_mime_type|
+        return true if @content[:mime_type].match(Cobweb.escape_pattern_for_regex(valid_mime_type))
+      end
+      false
+    end
+
+    def method_missing(m)
+      if @content.keys.include? m.to_sym
+        @content[m.to_sym]
+      else
+        super
+      end
+    end
+
+    def to_hash
+      @content
+    end
+  end
+end
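CrawlObject is a thin wrapper over the crawled content hash: method_missing exposes each hash key as a reader, permitted_type? does the wildcard mime check that previously lived in CrawlJob, and to_hash hands the raw hash back for enqueueing. An illustrative use (the content values below are made up):

content = {
  :url       => "http://example.com/",
  :mime_type => "text/html",
  :body      => "<html></html>"
}

crawl_object = CobwebModule::CrawlObject.new(content, :valid_mime_types => ["text/*"])

crawl_object.url               # => "http://example.com/" (via method_missing)
crawl_object.permitted_type?   # => true, wildcard mime match against "text/*"
crawl_object.to_hash[:body]    # => "<html></html>"
crawl_object.not_a_key         # raises NoMethodError (falls through to super)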
data/lib/hash_util.rb
CHANGED
data/lib/server.rb
CHANGED
@@ -16,7 +16,7 @@ class Server < Sinatra::Base
     @crawls = []
     @full_redis.smembers("cobweb_crawls").each do |crawl_id|
       version = cobweb_version(crawl_id)
-      redis =
+      redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => Redis.new(redis_options))
       stats = HashUtil.deep_symbolize_keys({
         :cobweb_version => version,
         :crawl_details => redis.hgetall("crawl_details"),
@@ -33,7 +33,7 @@ class Server < Sinatra::Base
   get '/statistics/:crawl_id' do
 
     version = cobweb_version(params[:crawl_id])
-    redis =
+    redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis => Redis.new(redis_options))
 
     @statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
     if @statistics[:status_counts].nil?
data/lib/stats.rb
CHANGED
@@ -8,7 +8,7 @@ class Stats
   def initialize(options)
     options[:redis_options] = {} unless options.has_key? :redis_options
     @full_redis = Redis.new(options[:redis_options])
-    @redis =
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => @full_redis)
   end
 
   # Sets up the crawl in statistics
data/spec/cobweb/{crawl_spec.rb → cobweb_crawl_helper_spec.rb}
RENAMED
File without changes

data/spec/cobweb/cobweb_job_spec.rb
CHANGED
@@ -9,7 +9,7 @@ describe Cobweb, :local_only => true do
     # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
     puts "Starting Workers... Please Wait..."
     `mkdir log`
-    io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=
+    io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
     puts "Workers Started."
 
   end
@@ -17,6 +17,7 @@ describe Cobweb, :local_only => true do
   before(:each) do
     @base_url = "http://localhost:3532/"
     @base_page_count = 77
+
     clear_queues
   end
 
@@ -29,6 +30,7 @@ describe Cobweb, :local_only => true do
        :debug => false,
        :cache => nil
      }
+      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
      @cobweb = Cobweb.new @request
    end
    it "should not crawl anything if nothing has started" do
@@ -37,7 +39,7 @@ describe Cobweb, :local_only => true do
      crawl_obj.destroy
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should == 0
    end
 
    it "should not complete the crawl when cancelled" do
@@ -47,8 +49,8 @@ describe Cobweb, :local_only => true do
      crawl_obj.destroy
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
-
+      @redis.get("crawl_job_enqueued_count").to_i.should > 0
+      @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
    end
 
  end
@@ -61,22 +63,24 @@ describe Cobweb, :local_only => true do
        :debug => false,
        :cache => nil
      }
+      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
+
      @cobweb = Cobweb.new @request
    end
 
    it "should crawl entire site" do
-      ap Resque.size("cobweb_process_job")
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
-
+      @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
+      @redis.get("crawl_finished_enqueued_count").to_i.should == 1
    end
    it "detect crawl finished once" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
+      @redis.get("crawl_finished_enqueued_count").to_i.should == 1
    end
  end
  describe "with limited mime_types" do
@@ -87,6 +91,7 @@ describe Cobweb, :local_only => true do
        :cache => nil,
        :valid_mime_types => ["text/html"]
      }
+      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
      @cobweb = Cobweb.new @request
    end
 
@@ -94,7 +99,7 @@ describe Cobweb, :local_only => true do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should == 8
 
      mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
      mime_types.count.should == 8
@@ -110,6 +115,7 @@ describe Cobweb, :local_only => true do
        :quiet => true,
        :cache => nil
      }
+      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
    end
 
    describe "limit to 1" do
@@ -122,19 +128,19 @@ describe Cobweb, :local_only => true do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
    end
    it "should only crawl 1 page" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should == 1
    end
    it "should notify of crawl finished once" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_finished_enqueued_count").to_i.should == 1
    end
  end
 
@@ -145,6 +151,7 @@ describe Cobweb, :local_only => true do
      @cobweb = Cobweb.new @request
    end
 
+    # the following describes when we want all the assets of a page, and the page itself, but we only want 5 pages
    it "should only use html pages towards the crawl limit" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -165,19 +172,19 @@ describe Cobweb, :local_only => true do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
    end
    it "should notify of crawl finished once" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_finished_enqueued_count").to_i.should == 1
    end
    it "should only crawl 10 objects" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should == 10
    end
  end
 
@@ -191,23 +198,24 @@ describe Cobweb, :local_only => true do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
    end
    it "should notify of crawl finished once" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_finished_enqueued_count").to_i.should == 1
    end
    it "should not crawl 100 pages" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should_not == 100
    end
  end
  end
 
+
  after(:all) do
 
    @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
@@ -220,26 +228,43 @@ describe Cobweb, :local_only => true do
 end
 
 def wait_for_crawl_finished(crawl_id, timeout=20)
-  counter = 0
+  @counter = 0
   start_time = Time.now
   while(running?(crawl_id) && Time.now < start_time + timeout) do
-
-  end
-  if Time.now > start_time + timeout
-    raise "End of crawl not detected"
-  end
+    sleep 0.5
   end
-
-
-  @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
+  if Time.now > start_time + timeout
+    raise "End of crawl not detected"
   end
+end
 
-
-
-
+def running?(crawl_id)
+  status = @stat.get_status
+  result = true
+  if status == CobwebCrawlHelper::STARTING
+    result = true
+  else
+    if status == @last_stat
+      if @counter > 5
+        raise "Static status: #{status}"
+      else
+        @counter += 1
+      end
+      puts "Static Status.. #{6-@counter}"
+    else
+      result = status != CobwebCrawlHelper::FINISHED && status != CobwebCrawlHelper::CANCELLED
    end
+  end
+  @last_stat = @stat.get_status
+  result
+end
 
-
-
-  Resque.
+def clear_queues
+  Resque.queues.each do |queue|
+    Resque.remove_queue(queue)
  end
+
+  Resque.size("cobweb_process_job").should == 0
+  Resque.size("cobweb_finished_job").should == 0
+  Resque.peek("cobweb_process_job", 0, 200).should be_empty
+end
data/spec/cobweb/content_link_parser_spec.rb
CHANGED
@@ -76,11 +76,9 @@ describe ContentLinkParser do
       links.length.should == 3
     end
   end
-  describe "returning unknown link type" do
+  describe "returning unknown link type should raise an error" do
     it "should return an empty array" do
-
-      links.should_not be_nil
-      links.should be_an_instance_of Array
+      lambda {@content_parser.asdfasdfsadf}.should raise_error
     end
   end
 end
@@ -122,7 +120,7 @@ describe ContentLinkParser do
   describe "ignoring default tags" do
     it "should not return any links" do
       parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
-      parser.links.should
+      lambda{parser.links}.should raise_error(NoMethodError)
     end
   end
 end
data/spec/cobweb/site_test_spec.rb.tmp
ADDED
@@ -0,0 +1,101 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+describe Cobweb, :local_only => true do
+
+  before(:all) do
+    #store all existing resque process ids so we don't kill them afterwards
+    @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+
+    # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
+    puts "Starting Workers... Please Wait..."
+    `mkdir log`
+    io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
+    puts "Workers Started."
+
+  end
+
+  before(:each) do
+    @base_url = "http://localhost:3532/"
+    @base_page_count = 77
+    clear_queues
+  end
+
+  describe "with a crawl limit" do
+    before(:each) do
+      @request = {
+        :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
+        :quiet => true,
+        :cache => nil,
+        :use_encoding_safe_process_job => true,
+        :crawl_limit_by_page => true
+      }
+    end
+
+    describe "on ancestry.com.au" do
+      describe "limited to 100" do
+        before(:each) do
+          @request[:crawl_limit] = 100
+          @request[:valid_mime_types] = ["text/html"]
+          @cobweb = Cobweb.new @request
+        end
+
+        it "should crawl 100 pages" do
+          crawl = @cobweb.start("http://www.ancestry.com.au/")
+          @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+          wait_for_crawl_finished crawl[:crawl_id], 180
+          puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
+        end
+      end
+
+      describe "limited to 999" do
+        before(:each) do
+          @request[:crawl_limit] = 999
+          @cobweb = Cobweb.new @request
+        end
+
+        it "should crawl 999 pages" do
+          crawl = @cobweb.start("http://www.ancestry.com.au/")
+          @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+          wait_for_crawl_finished crawl[:crawl_id], 720
+          puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
+        end
+      end
+__END__
+
+    end
+
+    after(:all) do
+
+      @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+      command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
+      IO.popen(command)
+
+      clear_queues
+    end
+
+  end
+
+  def wait_for_crawl_finished(crawl_id, timeout=20)
+    counter = 0
+    start_time = Time.now
+    while(running?(crawl_id) && Time.now < start_time + timeout) do
+      sleep 0.5
+    end
+    if Time.now > start_time + timeout
+      raise "End of crawl not detected"
+    end
+  end
+
+  def running?(crawl_id)
+    @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
+  end
+
+  def clear_queues
+    Resque.queues.each do |queue|
+      Resque.remove_queue(queue)
+    end
+
+    Resque.size("cobweb_process_job").should == 0
+    Resque.size("cobweb_finished_job").should == 0
+    Resque.peek("cobweb_process_job", 0, 200).should be_empty
+  end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.74
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-
+date: 2012-10-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &
+  requirement: &70347429190520 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70347429190520
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &
+  requirement: &70347429190020 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70347429190020
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &
+  requirement: &70347429189540 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70347429189540
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &
+  requirement: &70347429188880 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70347429188880
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &
+  requirement: &70347429187340 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70347429187340
 - !ruby/object:Gem::Dependency
   name: awesome_print
-  requirement: &
+  requirement: &70347429185820 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70347429185820
 - !ruby/object:Gem::Dependency
   name: sinatra
-  requirement: &
+  requirement: &70347429185040 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
        version: '0'
  type: :runtime
  prerelease: false
-  version_requirements: *
+  version_requirements: *70347429185040
 - !ruby/object:Gem::Dependency
   name: thin
-  requirement: &
+  requirement: &70347429184340 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70347429184340
 - !ruby/object:Gem::Dependency
   name: haml
-  requirement: &
+  requirement: &70347429183120 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70347429183120
 - !ruby/object:Gem::Dependency
   name: namespaced_redis
-  requirement: &
+  requirement: &70347429181840 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
        version: 1.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70347429181840
 - !ruby/object:Gem::Dependency
   name: json
-  requirement: &
+  requirement: &70347429180860 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70347429180860
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
   crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
   is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -142,13 +142,14 @@ extensions: []
 extra_rdoc_files:
 - README.textile
 files:
+- spec/cobweb/cobweb_crawl_helper_spec.rb
 - spec/cobweb/cobweb_crawler_spec.rb
 - spec/cobweb/cobweb_job_spec.rb
 - spec/cobweb/cobweb_links_spec.rb
 - spec/cobweb/cobweb_spec.rb
 - spec/cobweb/content_link_parser_spec.rb
-- spec/cobweb/crawl_spec.rb
 - spec/cobweb/robots_spec.rb
+- spec/cobweb/site_test_spec.rb.tmp
 - spec/samples/robots.txt
 - spec/samples/sample_html_links.html
 - spec/samples/sample_server.rb
@@ -328,7 +329,9 @@ files:
 - lib/cobweb_process_job.rb
 - lib/cobweb_version.rb
 - lib/content_link_parser.rb
+- lib/crawl.rb
 - lib/crawl_job.rb
+- lib/crawl_object.rb
 - lib/encoding_safe_process_job.rb
 - lib/hash_util.rb
 - lib/redirect_error.rb