cobweb 0.0.73 → 0.0.74
- data/README.textile +1 -1
- data/lib/cobweb.rb +6 -6
- data/lib/cobweb_crawl_helper.rb +1 -1
- data/lib/cobweb_crawler.rb +2 -2
- data/lib/cobweb_version.rb +1 -1
- data/lib/content_link_parser.rb +2 -3
- data/lib/crawl.rb +263 -0
- data/lib/crawl_job.rb +45 -189
- data/lib/crawl_object.rb +30 -0
- data/lib/hash_util.rb +1 -0
- data/lib/server.rb +2 -2
- data/lib/stats.rb +1 -1
- data/spec/cobweb/{crawl_spec.rb → cobweb_crawl_helper_spec.rb} +0 -0
- data/spec/cobweb/cobweb_job_spec.rb +58 -33
- data/spec/cobweb/content_link_parser_spec.rb +3 -5
- data/spec/cobweb/site_test_spec.rb.tmp +101 -0
- metadata +28 -25
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -4,7 +4,6 @@ require 'resque'
 require "addressable/uri"
 require 'digest/sha1'
 require 'base64'
-require 'namespaced_redis'
 
 Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
   require file
@@ -46,6 +45,7 @@ class Cobweb
     default_text_mime_types_to ["text/*", "application/xhtml+xml"]
     default_obey_robots_to false
     default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
+    default_valid_mime_types_to ["*/*"]
 
   end
 
@@ -65,7 +65,7 @@ class Cobweb
     end
 
     request.merge!(@options)
-    @redis =
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => Redis.new(request[:redis_options]))
     @redis.set("original_base_url", base_url)
     @redis.hset "statistics", "queued_at", DateTime.now
     @redis.set("crawl-counter", 0)
@@ -110,9 +110,9 @@ class Cobweb
 
     # connect to redis
     if options.has_key? :crawl_id
-      redis =
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
     else
-      redis =
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
    end
 
     content = {:base_url => url}
@@ -269,9 +269,9 @@ class Cobweb
 
     # connect to redis
     if options.has_key? :crawl_id
-      redis =
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
     else
-      redis =
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
     end
 
     content = {:base_url => url}
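The recurring edit in this release (here and in cobweb_crawler.rb, server.rb and stats.rb below) replaces the old namespaced_redis wrapper with redis-namespace's Redis::Namespace, scoping every key to the gem version and crawl id. A minimal sketch of the pattern, assuming a local Redis server and an illustrative crawl id that is not taken from the diff:

require 'cobweb'            # provides Cobweb.version
require 'redis'
require 'redis-namespace'

crawl_id = "example-crawl-id"    # hypothetical; real ids are SHA1 digests
plain    = Redis.new             # whatever :redis_options resolve to

# Every key used through this handle is transparently prefixed with
# "cobweb-<version>-<crawl_id>:", keeping concurrent crawls isolated.
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{crawl_id}", :redis => plain)
redis.set("crawl-counter", 0)
redis.incr "crawl-counter"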
data/lib/cobweb_crawl_helper.rb
CHANGED
@@ -15,7 +15,7 @@ class CobwebCrawlHelper
     @stats = Stats.new(data)
   end
 
-  def destroy(options)
+  def destroy(options={})
 
     options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)
     options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
data/lib/cobweb_crawler.rb
CHANGED
@@ -1,7 +1,7 @@
 require 'digest/md5'
 require 'date'
 require 'ap'
-
+require 'redis-namespace'
 
 # CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
 class CobwebCrawler
@@ -20,7 +20,7 @@ class CobwebCrawler
       @options[:crawl_id] = @crawl_id
     end
 
-    @redis =
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
     @options[:internal_urls] = [] if @options[:internal_urls].nil?
     @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
     @debug = @options[:debug]
data/lib/cobweb_version.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
@@ -6,7 +6,7 @@ class ContentLinkParser
 
   # Parses the content and absolutizes the urls based on url. Options can be setup to determine the links that are extracted.
   def initialize(url, content, options = {})
-    @options = options
+    @options = {}.merge(options)
     @url = url
     @doc = Nokogiri::HTML(content)
 
@@ -61,8 +61,7 @@ class ContentLinkParser
       end
       links.uniq
     else
-
-      []
+      super
     end
   end
 
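Two behavioural effects of this ContentLinkParser change: the options hash is copied on initialize ({}.merge(options)) so the caller's hash is no longer mutated when defaults are filled in, and an unrecognised link-type accessor now falls through to super and raises NoMethodError instead of silently returning an empty array. A rough illustration, assuming a parser built from a small inline HTML snippet (the unknown accessor name is made up):

require 'cobweb'

html   = "<html><body><a href='/about'>About</a></body></html>"
parser = ContentLinkParser.new("http://example.com/", html)

parser.links                     # => absolutized hrefs found in the page
begin
  parser.some_unknown_link_type  # hypothetical; not a configured link type
rescue NoMethodError
  puts "unknown link types now raise instead of returning []"
end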
data/lib/crawl.rb
ADDED
@@ -0,0 +1,263 @@
+module CobwebModule
+  class Crawl
+
+    def initialize(options={})
+      @options = HashUtil.deep_symbolize_keys(options)
+
+      setup_defaults
+      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", Redis.new(@options[:redis_options]))
+      @stats = Stats.new(@options)
+      @debug = @options[:debug]
+      @first_to_finish = false
+
+    end
+
+    # Returns true if the url requested is already in the crawled queue
+    def already_crawled?(link=@options[:url])
+      @redis.sismember "crawled", link
+    end
+
+    def already_queued?(link)
+      @redis.sismember "queued", link
+    end
+
+    # Returns true if the crawl count is within limits
+    def within_crawl_limits?
+      @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
+    end
+
+    # Returns true if the processed count is within limits
+    def within_process_limits?
+      @options[:crawl_limit].nil? || process_counter < @options[:crawl_limit].to_i
+    end
+
+    # Returns true if the queue count is calculated to be still within limits when complete
+    def within_queue_limits?
+
+      # if we are limiting by page we can't limit the queue size as we don't know the mime type until retrieved
+      if @options[:crawl_limit_by_page]
+        return true
+
+      # if a crawl limit is set, limit queue size to crawled + queue
+      elsif @options[:crawl_limit].to_i > 0
+        (queue_counter + crawl_counter) < @options[:crawl_limit].to_i
+
+      # no crawl limit set so always within queue limit
+      else
+        true
+      end
+    end
+
+    def retrieve
+      unless already_crawled?
+        if within_crawl_limits?
+          @stats.update_status("Retrieving #{@options[:url]}...")
+          @content = Cobweb.new(@options).get(@options[:url], @options)
+          if @options[:url] == @redis.get("original_base_url")
+            @redis.set("crawled_base_url", @content[:base_url])
+          end
+          update_queues
+
+          if content.permitted_type?
+            ## update statistics
+
+            @stats.update_statistics(@content)
+            return true
+          end
+        else
+          decrement_queue_counter
+        end
+      else
+        decrement_queue_counter
+      end
+      false
+    end
+
+    def process_links &block
+
+      # set the base url if this is the first page
+      set_base_url @redis
+
+      @cobweb_links = CobwebLinks.new(@options)
+      if within_queue_limits?
+        internal_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
+        #get rid of duplicate links in the same page.
+        internal_links.uniq!
+        # select the link if its internal
+        internal_links.select! { |link| @cobweb_links.internal?(link) }
+
+        # reject the link if we've crawled it or queued it
+        internal_links.reject! { |link| @redis.sismember("crawled", link) }
+        internal_links.reject! { |link| @redis.sismember("queued", link) }
+
+        internal_links.each do |link|
+          if within_queue_limits? && !already_queued?(link) && !already_crawled?(link)
+            if status != CobwebCrawlHelper::CANCELLED
+              yield link if block_given?
+              unless link.nil?
+                @redis.sadd "queued", link
+                increment_queue_counter
+              end
+            else
+              puts "Cannot enqueue new content as crawl has been cancelled." if @options[:debug]
+            end
+          end
+        end
+      end
+    end
+
+    def content
+      raise "Content is not available" if @content.nil?
+      CobwebModule::CrawlObject.new(@content, @options)
+    end
+
+    def update_queues
+      @redis.multi do
+        #@redis.incr "inprogress"
+        # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
+        @redis.srem "queued", @options[:url]
+        @redis.sadd "crawled", @options[:url]
+        if content.url != @options[:url]
+          @redis.srem "queued", content.url
+          @redis.sadd "crawled", content.url
+        end
+        # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
+        if @options[:crawl_limit_by_page]
+          ap "#{content.mime_type} - #{content.url}"
+          if content.mime_type.match("text/html")
+            increment_crawl_counter
+          end
+        else
+          increment_crawl_counter
+        end
+        decrement_queue_counter
+      end
+    end
+
+    def to_be_processed?
+      !finished? || first_to_finish? || within_process_limits?
+    end
+
+    def process
+      if @options[:crawl_limit_by_page]
+        if content.mime_type.match("text/html")
+          increment_process_counter
+        end
+      else
+        increment_process_counter
+      end
+    end
+
+    def finished?
+      print_counters
+      # if there's nothing left queued or the crawled limit has been reached
+      if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
+        if queue_counter.to_i == 0
+          finished
+          return true
+        end
+      elsif (queue_counter.to_i) == 0 || crawl_counter.to_i >= @options[:crawl_limit].to_i
+        finished
+        return true
+      end
+      false
+    end
+
+    def finished
+      set_first_to_finish if !@redis.exists("first_to_finish")
+      ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if @options[:debug]
+      @stats.end_crawl(@options)
+    end
+
+    def set_first_to_finish
+      @redis.watch("first_to_finish") do
+        if !@redis.exists("first_to_finish")
+          @redis.multi do
+            puts "set first to finish"
+            @first_to_finish = true
+            @redis.set("first_to_finish", 1)
+          end
+        else
+          @redis.unwatch
+        end
+      end
+    end
+
+
+    def first_to_finish?
+      @first_to_finish
+    end
+
+    def crawled_base_url
+      @redis.get("crawled_base_url")
+    end
+
+    def statistics
+      @stats.get_statistics
+    end
+
+    def redis
+      @redis
+    end
+
+    private
+    def setup_defaults
+      @options[:redis_options] = {} unless @options.has_key? :redis_options
+      @options[:crawl_limit_by_page] = false unless @options.has_key? :crawl_limit_by_page
+      @options[:valid_mime_types] = ["*/*"] unless @options.has_key? :valid_mime_types
+    end
+
+    # Increments the queue counter and refreshes crawl counters
+    def increment_queue_counter
+      @redis.incr "queue-counter"
+    end
+    # Increments the crawl counter and refreshes crawl counters
+    def increment_crawl_counter
+      @redis.incr "crawl-counter"
+    end
+    # Increments the process counter
+    def increment_process_counter
+      @redis.incr "process-counter"
+    end
+    # Decrements the queue counter and refreshes crawl counters
+    def decrement_queue_counter
+      @redis.decr "queue-counter"
+    end
+
+    def crawl_counter
+      @redis.get("crawl-counter").to_i
+    end
+    def queue_counter
+      @redis.get("queue-counter").to_i
+    end
+    def process_counter
+      @redis.get("process-counter").to_i
+    end
+
+    def status
+      @stats.get_status
+    end
+
+    def print_counters
+      puts counters
+    end
+
+    def counters
+      "crawl_counter: #{crawl_counter} queue_counter: #{queue_counter} process_counter: #{process_counter} crawl_limit: #{@options[:crawl_limit]}"
+    end
+
+    # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
+    def set_base_url(redis)
+      if redis.get("base_url").nil?
+        unless !defined?(content.redirect_through) || content.redirect_through.empty? || !@options[:first_page_redirect_internal]
+          uri = Addressable::URI.parse(content.redirect_through.last)
+          redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
+        end
+        redis.set("base_url", content.url)
+      end
+    end
+
+
+
+  end
+end
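The new CobwebModule::Crawl class gathers the per-crawl Redis bookkeeping (the queued/crawled sets and the queue, crawl and process counters) that previously lived in CrawlJob. A rough sketch of the lifecycle it supports, mirroring the refactored perform flow in crawl_job.rb below; the option values are illustrative and a running Redis is assumed:

require 'cobweb'

crawl = CobwebModule::Crawl.new(
  :crawl_id    => "example-crawl-id",     # hypothetical id for illustration
  :url         => "http://example.com/",
  :crawl_limit => 10
)

if crawl.retrieve                         # fetches the page, updates the queued/crawled sets and counters
  if crawl.content.permitted_type?
    crawl.process_links { |link| puts "would enqueue #{link}" }
  end
  crawl.process if crawl.to_be_processed?
end

crawl.finished?                           # true once nothing is queued or the crawl limit is reached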
data/lib/crawl_job.rb
CHANGED
@@ -5,132 +5,58 @@ class CrawlJob
 require "net/https"
 require "uri"
 require "redis"
-
-
+
  @queue = :cobweb_crawl_job
-
+
  # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
  def self.perform(content_request)
-    # change all hash keys to symbols
-    content_request = HashUtil.deep_symbolize_keys(content_request)
-    @content_request = content_request
-    @crawl = CobwebCrawlHelper.new(content_request)
-
-    content_request[:redis_options] = {} unless content_request.has_key? :redis_options
-    content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
-    content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
 
-
-    @
+    # setup the crawl class to manage the crawl of this object
+    @crawl = CobwebModule::Crawl.new(content_request)
 
-
+    # update the counters and then perform the get, returns false if we are outwith limits
+    if @crawl.retrieve
 
-
-
-
-
-
-      content = Cobweb.new(content_request).get(content_request[:url], content_request)
-      if content_request[:url] == @redis.get("original_base_url")
-        @redis.set("crawled_base_url", content[:base_url])
-      end
-      if is_permitted_type(content)
-        begin
-          @redis.incr "inprogress"
-          # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
-          @redis.srem "queued", content_request[:url]
-          @redis.sadd "crawled", content_request[:url]
-          @redis.srem "queued", content[:url]
-          @redis.sadd "crawled", content[:url]
-          # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
-          if content_request[:crawl_limit_by_page]
-            if content[:mime_type].match("text/html")
-              increment_crawl_started_counter
-            end
-          else
-            increment_crawl_started_counter
-          end
-
-          ## update statistics
-          @stats.update_status("Crawling #{content_request[:url]}...")
-          @stats.update_statistics(content)
-
-          # set the base url if this is the first page
-          set_base_url @redis, content, content_request
-
-          @cobweb_links = CobwebLinks.new(content_request)
-          if within_queue_limits?(content_request[:crawl_limit])
-            internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
-            #get rid of duplicate links in the same page.
-            internal_links.uniq!
-            # select the link if its internal
-            internal_links.select! { |link| @cobweb_links.internal?(link) }
-
-            # reject the link if we've crawled it or queued it
-            internal_links.reject! { |link| @redis.sismember("crawled", link) }
-            internal_links.reject! { |link| @redis.sismember("queued", link) }
-
-            internal_links.each do |link|
-              puts link
-              puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
-              if within_queue_limits?(content_request[:crawl_limit])
-                if @crawl.status != CobwebCrawlHelper::CANCELLED
-                  enqueue_content(content_request, link)
-                else
-                  puts "Cannot enqueue new content as crawl has been cancelled." if content_request[:debug]
-                end
-              end
-            end
-          end
-
-          # enqueue to processing queue
-          send_to_processing_queue(content, content_request)
+      # if the crawled object is an object type we are interested
+      if @crawl.content.permitted_type?
+
+        # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
+        @crawl.process_links do |link|
 
-
-
-
-            current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
-            enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
-          end
+          # enqueue the links to resque
+          puts "ENQUEUED LINK: #{link}"
+          enqueue_content(content_request, link)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
+        end
+
+
+        if @crawl.to_be_processed?
+          @crawl.process
+
+          # enqueue to processing queue
+          @crawl.redis.incr("crawl_job_enqueued_count")
+          puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
+          send_to_processing_queue(@crawl.content.to_hash, content_request)
+
+
+          #if the enqueue counter has been requested update that
+          if content_request.has_key?(:enqueue_counter_key)
+            enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
+            current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+            enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
          end
        else
-
+          ap "@crawl.finished? #{@crawl.finished?}"
+          ap "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
+          ap "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
        end
-      else
-        puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
      end
-
-    else
-      @redis.srem "queued", content_request[:url]
-      puts "Already crawled #{content_request[:url]}" if content_request[:debug]
    end
-
-
-
-
-
-        finished(content_request)
-      end
-    elsif (queue_counter+crawl_started_counter-crawl_counter)== 0 || crawl_counter >= content_request[:crawl_limit].to_i
+
+    # test queue and crawl sizes to see if we have completed the crawl
+    ap "finished? #{@crawl.finished?}"
+    ap "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
+    if @crawl.finished? && @crawl.first_to_finish?
      finished(content_request)
    end
 
@@ -138,19 +64,12 @@ class CrawlJob
 
  # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
  def self.finished(content_request)
-
-
-
-
-
-
-    additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
-    additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
-
-    Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge(additional_stats))
-    else
-      # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
-    end
+    additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @crawl.crawled_base_url}
+    additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
+    additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
+
+    @crawl.redis.incr("crawl_finished_enqueued_count")
+    Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @crawl.statistics.merge(additional_stats))
  end
 
  # Enqueues the content to the processing queue setup in options
@@ -171,34 +90,6 @@ class CrawlJob
 
  private
 
-  # Helper method to determine if this content is to be processed or not
-  def self.is_permitted_type(content)
-    @content_request[:valid_mime_types].each do |mime_type|
-      return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
-    end
-    false
-  end
-
-  # Returns true if the crawl count is within limits
-  def self.within_crawl_limits?(crawl_limit)
-    crawl_limit.nil? or crawl_counter < crawl_limit.to_i
-  end
-
-  # Returns true if the queue count is calculated to be still within limits when complete
-  def self.within_queue_limits?(crawl_limit)
-    (@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (queue_counter + crawl_counter) < crawl_limit.to_i)
-  end
-
-  # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
-  def self.set_base_url(redis, content, content_request)
-    if redis.get("base_url").nil?
-      unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
-        uri = Addressable::URI.parse(content[:redirect_through].last)
-        redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
-      end
-      redis.set("base_url", content[:url])
-    end
-  end
 
  # Enqueues content to the crawl_job queue
  def self.enqueue_content(content_request, link)
@@ -206,43 +97,8 @@ class CrawlJob
    new_request[:url] = link
    new_request[:parent] = content_request[:url]
    #to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
-    @redis.sadd "queued", link
    Resque.enqueue(CrawlJob, new_request)
-    increment_queue_counter
  end
 
-  # Increments the queue counter and refreshes crawl counters
-  def self.increment_queue_counter
-    @redis.incr "queue-counter"
-  end
-  # Increments the crawl counter and refreshes crawl counters
-  def self.increment_crawl_counter
-    @redis.incr "crawl-counter"
-  end
-  def self.increment_crawl_started_counter
-    @redis.incr "crawl-started-counter"
-  end
-  # Decrements the queue counter and refreshes crawl counters
-  def self.decrement_queue_counter
-    @redis.decr "queue-counter"
-  end
-
-  def self.crawl_counter
-    @redis.get("crawl-counter").to_i
-  end
-  def self.crawl_started_counter
-    @redis.get("crawl-started-counter").to_i
-  end
-  def self.queue_counter
-    @redis.get("queue-counter").to_i
-  end
-
-  def self.print_counters
-    puts counters
-  end
-
-  def self.counters
-    "crawl_counter: #{crawl_counter} crawl_started_counter: #{crawl_started_counter} queue_counter: #{queue_counter}"
-  end
 
 end
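Alongside the refactor, perform now increments two diagnostic counters in the crawl's namespace, crawl_job_enqueued_count and crawl_finished_enqueued_count; the spec changes further down read them back to check how many pages were sent for processing and that exactly one finished job was enqueued. A small sketch of reading them, under the same namespacing assumption as above (the crawl id is hypothetical):

require 'cobweb'
require 'redis'
require 'redis-namespace'

crawl_id = "example-crawl-id"   # hypothetical
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{crawl_id}", :redis => Redis.new)

puts "pages enqueued for processing: #{redis.get('crawl_job_enqueued_count').to_i}"
puts "finished jobs enqueued:        #{redis.get('crawl_finished_enqueued_count').to_i}"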
data/lib/crawl_object.rb
ADDED
@@ -0,0 +1,30 @@
+module CobwebModule
+  class CrawlObject
+
+    def initialize(content_hash, options={})
+      @content = HashUtil.deep_symbolize_keys(content_hash)
+      @options = options
+    end
+
+
+    # Helper method to determine if this content is to be processed or not
+    def permitted_type?
+      @options[:valid_mime_types].each do |valid_mime_type|
+        return true if @content[:mime_type].match(Cobweb.escape_pattern_for_regex(valid_mime_type))
+      end
+      false
+    end
+
+    def method_missing(m)
+      if @content.keys.include? m.to_sym
+        @content[m.to_sym]
+      else
+        super
+      end
+    end
+
+    def to_hash
+      @content
+    end
+  end
+end
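CrawlObject wraps the content hash returned by a fetch and exposes its keys as readers through method_missing, so callers can write content.mime_type rather than content[:mime_type]; keys that are not present still raise NoMethodError. A minimal sketch with a hand-built hash (the values are made up, the keys mirror what Cobweb#get returns):

require 'cobweb'

content = CobwebModule::CrawlObject.new(
  { "mime_type" => "text/html", "url" => "http://example.com/" },   # illustrative values
  :valid_mime_types => ["text/*"]
)

content.mime_type          # => "text/html"  (hash lookup via method_missing)
content.permitted_type?    # => true, "text/html" matches the "text/*" pattern
content.to_hash            # the symbolized content hash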
data/lib/hash_util.rb
CHANGED
data/lib/server.rb
CHANGED
@@ -16,7 +16,7 @@ class Server < Sinatra::Base
    @crawls = []
    @full_redis.smembers("cobweb_crawls").each do |crawl_id|
      version = cobweb_version(crawl_id)
-      redis =
+      redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => Redis.new(redis_options))
      stats = HashUtil.deep_symbolize_keys({
        :cobweb_version => version,
        :crawl_details => redis.hgetall("crawl_details"),
@@ -33,7 +33,7 @@ class Server < Sinatra::Base
  get '/statistics/:crawl_id' do
 
    version = cobweb_version(params[:crawl_id])
-    redis =
+    redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis => Redis.new(redis_options))
 
    @statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
    if @statistics[:status_counts].nil?
data/lib/stats.rb
CHANGED
@@ -8,7 +8,7 @@ class Stats
  def initialize(options)
    options[:redis_options] = {} unless options.has_key? :redis_options
    @full_redis = Redis.new(options[:redis_options])
-    @redis =
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => @full_redis)
  end
 
  # Sets up the crawl in statistics
data/spec/cobweb/{crawl_spec.rb → cobweb_crawl_helper_spec.rb}
RENAMED
File without changes

data/spec/cobweb/cobweb_job_spec.rb
CHANGED
@@ -9,7 +9,7 @@ describe Cobweb, :local_only => true do
    # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
    puts "Starting Workers... Please Wait..."
    `mkdir log`
-    io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=
+    io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
    puts "Workers Started."
 
  end
@@ -17,6 +17,7 @@ describe Cobweb, :local_only => true do
  before(:each) do
    @base_url = "http://localhost:3532/"
    @base_page_count = 77
+
    clear_queues
  end
 
@@ -29,6 +30,7 @@ describe Cobweb, :local_only => true do
        :debug => false,
        :cache => nil
      }
+      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
      @cobweb = Cobweb.new @request
    end
    it "should not crawl anything if nothing has started" do
@@ -37,7 +39,7 @@ describe Cobweb, :local_only => true do
      crawl_obj.destroy
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should == 0
    end
 
    it "should not complete the crawl when cancelled" do
@@ -47,8 +49,8 @@ describe Cobweb, :local_only => true do
      crawl_obj.destroy
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
-
+      @redis.get("crawl_job_enqueued_count").to_i.should > 0
+      @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
    end
 
  end
@@ -61,22 +63,24 @@ describe Cobweb, :local_only => true do
        :debug => false,
        :cache => nil
      }
+      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
+
      @cobweb = Cobweb.new @request
    end
 
    it "should crawl entire site" do
-      ap Resque.size("cobweb_process_job")
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
-
+      @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
+      @redis.get("crawl_finished_enqueued_count").to_i.should == 1
    end
    it "detect crawl finished once" do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
+      @redis.get("crawl_finished_enqueued_count").to_i.should == 1
    end
  end
  describe "with limited mime_types" do
@@ -87,6 +91,7 @@ describe Cobweb, :local_only => true do
        :cache => nil,
        :valid_mime_types => ["text/html"]
      }
+      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
      @cobweb = Cobweb.new @request
    end
 
@@ -94,7 +99,7 @@ describe Cobweb, :local_only => true do
      crawl = @cobweb.start(@base_url)
      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
      wait_for_crawl_finished crawl[:crawl_id]
-
+      @redis.get("crawl_job_enqueued_count").to_i.should == 8
 
      mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
      mime_types.count.should == 8
@@ -110,6 +115,7 @@ describe Cobweb, :local_only => true do
        :quiet => true,
        :cache => nil
      }
+      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
    end
 
    describe "limit to 1" do
@@ -122,19 +128,19 @@ describe Cobweb, :local_only => true do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-
+        @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
      end
      it "should only crawl 1 page" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-
+        @redis.get("crawl_job_enqueued_count").to_i.should == 1
      end
      it "should notify of crawl finished once" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-
+        @redis.get("crawl_finished_enqueued_count").to_i.should == 1
      end
    end
 
@@ -145,6 +151,7 @@ describe Cobweb, :local_only => true do
        @cobweb = Cobweb.new @request
      end
 
+      # the following describes when we want all the assets of a page, and the page itself, but we only want 5 pages
      it "should only use html pages towards the crawl limit" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -165,19 +172,19 @@ describe Cobweb, :local_only => true do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-
+        @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
      end
      it "should notify of crawl finished once" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-
+        @redis.get("crawl_finished_enqueued_count").to_i.should == 1
      end
      it "should only crawl 10 objects" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-
+        @redis.get("crawl_job_enqueued_count").to_i.should == 10
      end
    end
 
@@ -191,23 +198,24 @@ describe Cobweb, :local_only => true do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-
+        @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
      end
      it "should notify of crawl finished once" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-
+        @redis.get("crawl_finished_enqueued_count").to_i.should == 1
      end
      it "should not crawl 100 pages" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-
+        @redis.get("crawl_job_enqueued_count").to_i.should_not == 100
      end
    end
  end
 
+
  after(:all) do
 
    @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
@@ -220,26 +228,43 @@ describe Cobweb, :local_only => true do
  end
 
  def wait_for_crawl_finished(crawl_id, timeout=20)
-    counter = 0
+    @counter = 0
    start_time = Time.now
    while(running?(crawl_id) && Time.now < start_time + timeout) do
-
-    end
-    if Time.now > start_time + timeout
-      raise "End of crawl not detected"
-    end
+      sleep 0.5
    end
-
-
-    @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
+    if Time.now > start_time + timeout
+      raise "End of crawl not detected"
    end
+  end
 
-
-
-
+  def running?(crawl_id)
+    status = @stat.get_status
+    result = true
+    if status == CobwebCrawlHelper::STARTING
+      result = true
+    else
+      if status == @last_stat
+        if @counter > 5
+          raise "Static status: #{status}"
+        else
+          @counter += 1
+        end
+        puts "Static Status.. #{6-@counter}"
+      else
+        result = status != CobwebCrawlHelper::FINISHED && status != CobwebCrawlHelper::CANCELLED
      end
+    end
+    @last_stat = @stat.get_status
+    result
+  end
 
-
-
-    Resque.
+  def clear_queues
+    Resque.queues.each do |queue|
+      Resque.remove_queue(queue)
    end
+
+    Resque.size("cobweb_process_job").should == 0
+    Resque.size("cobweb_finished_job").should == 0
+    Resque.peek("cobweb_process_job", 0, 200).should be_empty
+  end
data/spec/cobweb/content_link_parser_spec.rb
CHANGED
@@ -76,11 +76,9 @@ describe ContentLinkParser do
      links.length.should == 3
    end
  end
-  describe "returning unknown link type" do
+  describe "returning unknown link type should raise an error" do
    it "should return an empty array" do
-
-      links.should_not be_nil
-      links.should be_an_instance_of Array
+      lambda {@content_parser.asdfasdfsadf}.should raise_error
    end
  end
 end
@@ -122,7 +120,7 @@ describe ContentLinkParser do
  describe "ignoring default tags" do
    it "should not return any links" do
      parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
-      parser.links.should
+      lambda{parser.links}.should raise_error(NoMethodError)
    end
  end
 end
data/spec/cobweb/site_test_spec.rb.tmp
ADDED
@@ -0,0 +1,101 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+describe Cobweb, :local_only => true do
+
+  before(:all) do
+    #store all existing resque process ids so we don't kill them afterwards
+    @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+
+    # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
+    puts "Starting Workers... Please Wait..."
+    `mkdir log`
+    io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
+    puts "Workers Started."
+
+  end
+
+  before(:each) do
+    @base_url = "http://localhost:3532/"
+    @base_page_count = 77
+    clear_queues
+  end
+
+  describe "with a crawl limit" do
+    before(:each) do
+      @request = {
+        :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
+        :quiet => true,
+        :cache => nil,
+        :use_encoding_safe_process_job => true,
+        :crawl_limit_by_page => true
+      }
+    end
+
+    describe "on ancestry.com.au" do
+      describe "limited to 100" do
+        before(:each) do
+          @request[:crawl_limit] = 100
+          @request[:valid_mime_types] = ["text/html"]
+          @cobweb = Cobweb.new @request
+        end
+
+        it "should crawl 100 pages" do
+          crawl = @cobweb.start("http://www.ancestry.com.au/")
+          @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+          wait_for_crawl_finished crawl[:crawl_id], 180
+          puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
+        end
+      end
+
+      describe "limited to 999" do
+        before(:each) do
+          @request[:crawl_limit] = 999
+          @cobweb = Cobweb.new @request
+        end
+
+        it "should crawl 999 pages" do
+          crawl = @cobweb.start("http://www.ancestry.com.au/")
+          @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+          wait_for_crawl_finished crawl[:crawl_id], 720
+          puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
+        end
+      end
+__END__
+
+    end
+
+  after(:all) do
+
+    @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+    command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
+    IO.popen(command)
+
+    clear_queues
+  end
+
+end
+
+def wait_for_crawl_finished(crawl_id, timeout=20)
+  counter = 0
+  start_time = Time.now
+  while(running?(crawl_id) && Time.now < start_time + timeout) do
+    sleep 0.5
+  end
+  if Time.now > start_time + timeout
+    raise "End of crawl not detected"
+  end
+end
+
+def running?(crawl_id)
+  @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
+end
+
+def clear_queues
+  Resque.queues.each do |queue|
+    Resque.remove_queue(queue)
+  end
+
+  Resque.size("cobweb_process_job").should == 0
+  Resque.size("cobweb_finished_job").should == 0
+  Resque.peek("cobweb_process_job", 0, 200).should be_empty
+end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.74
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-
+date: 2012-10-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &
+  requirement: &70347429190520 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70347429190520
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &
+  requirement: &70347429190020 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
      version: '0'
  type: :runtime
  prerelease: false
-  version_requirements: *
+  version_requirements: *70347429190020
 - !ruby/object:Gem::Dependency
  name: nokogiri
-  requirement: &
+  requirement: &70347429189540 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
      version: '0'
  type: :runtime
  prerelease: false
-  version_requirements: *
+  version_requirements: *70347429189540
 - !ruby/object:Gem::Dependency
  name: addressable
-  requirement: &
+  requirement: &70347429188880 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
      version: '0'
  type: :runtime
  prerelease: false
-  version_requirements: *
+  version_requirements: *70347429188880
 - !ruby/object:Gem::Dependency
  name: rspec
-  requirement: &
+  requirement: &70347429187340 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
      version: '0'
  type: :runtime
  prerelease: false
-  version_requirements: *
+  version_requirements: *70347429187340
 - !ruby/object:Gem::Dependency
  name: awesome_print
-  requirement: &
+  requirement: &70347429185820 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
      version: '0'
  type: :runtime
  prerelease: false
-  version_requirements: *
+  version_requirements: *70347429185820
 - !ruby/object:Gem::Dependency
  name: sinatra
-  requirement: &
+  requirement: &70347429185040 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
      version: '0'
  type: :runtime
  prerelease: false
-  version_requirements: *
+  version_requirements: *70347429185040
 - !ruby/object:Gem::Dependency
  name: thin
-  requirement: &
+  requirement: &70347429184340 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
      version: '0'
  type: :runtime
  prerelease: false
-  version_requirements: *
+  version_requirements: *70347429184340
 - !ruby/object:Gem::Dependency
  name: haml
-  requirement: &
+  requirement: &70347429183120 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
      version: '0'
  type: :runtime
  prerelease: false
-  version_requirements: *
+  version_requirements: *70347429183120
 - !ruby/object:Gem::Dependency
  name: namespaced_redis
-  requirement: &
+  requirement: &70347429181840 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
      version: 1.0.2
  type: :runtime
  prerelease: false
-  version_requirements: *
+  version_requirements: *70347429181840
 - !ruby/object:Gem::Dependency
  name: json
-  requirement: &
+  requirement: &70347429180860 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
      version: '0'
  type: :runtime
  prerelease: false
-  version_requirements: *
+  version_requirements: *70347429180860
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
  crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -142,13 +142,14 @@ extensions: []
 extra_rdoc_files:
 - README.textile
 files:
+- spec/cobweb/cobweb_crawl_helper_spec.rb
 - spec/cobweb/cobweb_crawler_spec.rb
 - spec/cobweb/cobweb_job_spec.rb
 - spec/cobweb/cobweb_links_spec.rb
 - spec/cobweb/cobweb_spec.rb
 - spec/cobweb/content_link_parser_spec.rb
-- spec/cobweb/crawl_spec.rb
 - spec/cobweb/robots_spec.rb
+- spec/cobweb/site_test_spec.rb.tmp
 - spec/samples/robots.txt
 - spec/samples/sample_html_links.html
 - spec/samples/sample_server.rb
@@ -328,7 +329,9 @@ files:
 - lib/cobweb_process_job.rb
 - lib/cobweb_version.rb
 - lib/content_link_parser.rb
+- lib/crawl.rb
 - lib/crawl_job.rb
+- lib/crawl_object.rb
 - lib/encoding_safe_process_job.rb
 - lib/hash_util.rb
 - lib/redirect_error.rb