cobweb 0.0.58 → 0.0.59
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +8 -6
- data/lib/cobweb.rb +11 -9
- data/lib/cobweb_crawler.rb +0 -2
- data/lib/cobweb_links.rb +2 -11
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl_job.rb +94 -48
- data/lib/robots.rb +2 -9
- data/spec/cobweb/cobweb_job_spec.rb +64 -17
- data/spec/samples/sample_site/index.html +0 -5
- metadata +22 -22
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
 
-h1. Cobweb v0.0.58
+h1. Cobweb v0.0.59
 !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
 
 h2. Intro
@@ -38,11 +38,6 @@ h3. Data Returned
 ** :related - url's from link tags
 ** :scripts - url's from script tags
 ** :styles - url's from within link tags with rel of stylesheet and from url() directives with stylesheets
-* :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to seperate crawls
-* :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
-* :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
-* :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
-* :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
 
 The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as i have time!)
 
@@ -70,6 +65,13 @@ Creates a new crawler object based on a base_url
 ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
 ** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [], although your first url's scheme, host and domain are added)
 ** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
+** :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to seperate crawls
+** :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
+** :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
+** :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
+** :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
+** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
+** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to ['*/*']
 
 bq. crawler = CobWeb.new(:follow_redirects => false)
 
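The two options added at the end of that list, :crawl_limit_by_page and :valid_mime_types, work together with the existing :crawl_limit. A minimal sketch of a crawl request that uses them, assuming the option names documented above and the Cobweb.new / start calls already shown in this diff's README and specs (the target URL is a placeholder):

    require 'cobweb'

    # Hypothetical crawl request using the options documented in this release.
    crawler = Cobweb.new(
      :crawl_limit         => 100,
      :crawl_limit_by_page => true,         # only text/html objects count towards the limit
      :valid_mime_types    => ["text/*"],   # wildcard mime filter, defaults to ["*/*"]
      :internal_urls       => ["http://test.com/*"]
    )
    crawler.start("http://test.com")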
data/lib/cobweb.rb
CHANGED
@@ -180,6 +180,7 @@ class Cobweb
         content[:character_set] = charset
       end
       content[:length] = response.content_length
+      content[:text_content] = text_content?(content[:mime_type])
       if text_content?(content[:mime_type])
         if response["Content-Encoding"]=="gzip"
           content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
@@ -389,21 +390,22 @@ class Cobweb
 
   end
 
+  # escapes characters with meaning in regular expressions and adds wildcard expression
+  def self.escape_pattern_for_regex(pattern)
+    pattern = pattern.gsub(".", "\\.")
+    pattern = pattern.gsub("?", "\\?")
+    pattern = pattern.gsub("+", "\\+")
+    pattern = pattern.gsub("*", ".*?")
+    pattern
+  end
+
   private
   # checks if the mime_type is textual
   def text_content?(content_type)
     @options[:text_mime_types].each do |mime_type|
-      return true if content_type.match(escape_pattern_for_regex(mime_type))
+      return true if content_type.match(Cobweb.escape_pattern_for_regex(mime_type))
     end
     false
   end
-
-  # escapes characters with meaning in regular expressions and adds wildcard expression
-  def escape_pattern_for_regex(pattern)
-    pattern = pattern.gsub(".", "\\.")
-    pattern = pattern.gsub("?", "\\?")
-    pattern = pattern.gsub("*", ".*?")
-    pattern
-  end
 
 end
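The escaping helper becomes a public class method, Cobweb.escape_pattern_for_regex, so CobwebLinks, Robots and CrawlJob can share it, and it now also escapes '+'. A short illustration of the transformation, following the gsub chain in the hunk above (the input URL is a placeholder):

    # Behaviour inferred from the diff above: escape regex metacharacters, turn '*' into a wildcard.
    Cobweb.escape_pattern_for_regex("http://test.com/*")
    # => "http://test\\.com/.*?"   (the dot is escaped, '*' becomes a non-greedy '.*?')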
data/lib/cobweb_crawler.rb
CHANGED
@@ -85,8 +85,6 @@ class CobwebCrawler
 
       @stats.update_statistics(content, crawl_counter, queue_counter)
       @stats.update_status("Completed #{url}.")
-      puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @debug
-
       yield content, @stats.get_statistics if block_given?
 
     rescue => e
data/lib/cobweb_links.rb
CHANGED
@@ -12,8 +12,8 @@ class CobwebLinks
     @options[:external_urls] = [] unless @options.has_key? :external_urls
     @options[:debug] = false unless @options.has_key? :debug
 
-    @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{escape_pattern_for_regex(pattern)}")}
-    @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{escape_pattern_for_regex(pattern)}")}
+    @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
+    @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern)}")}
 
   end
 
@@ -52,15 +52,6 @@ class CobwebLinks
     @internal_patterns.select{|pattern| link.match(pattern)}.empty? || !@external_patterns.select{|pattern| link.match(pattern)}.empty?
   end
 
-  private
-  # escapes characters with meaning in regular expressions and adds wildcard expression
-  def escape_pattern_for_regex(pattern)
-    pattern = pattern.gsub(".", "\\.")
-    pattern = pattern.gsub("?", "\\?")
-    pattern = pattern.gsub("*", ".*?")
-    ap pattern if @options[:debug]
-    pattern
-  end
 end
 
 # Exception raised for :internal_urls missing from CobwebLinks
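CobwebLinks keeps the same matching behaviour but now builds its anchored patterns from the shared class method. A sketch of how the wildcard patterns are exercised, assuming the :internal_urls and :external_urls options shown above; internal? is the method CrawlJob calls later in this diff, while the external? call is an assumption based on the matching expression in the second hunk:

    links = CobwebLinks.new(
      :internal_urls => ["http://test.com/*", "http://blog.test.com/*"],
      :external_urls => ["http://test.com/external/*"]
    )

    links.internal?("http://blog.test.com/post/1")        # matches an internal pattern
    links.external?("http://test.com/external/page.html") # external_urls override internal_urls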
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl_job.rb
CHANGED
@@ -11,83 +11,105 @@ class CrawlJob
 
   # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
   def self.perform(content_request)
-
     # change all hash keys to symbols
     content_request = HashUtil.deep_symbolize_keys(content_request)
+    @content_request = content_request
 
     content_request[:redis_options] = {} unless content_request.has_key? :redis_options
+    content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
+    content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
+
     @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
     @stats = Stats.new(content_request)
 
     @debug = content_request[:debug]
 
-
+    decrement_queue_counter
 
     # check we haven't crawled this url before
     unless @redis.sismember "crawled", content_request[:url]
-      @redis.srem "queued", content_request[:url]
-      decrement_queue_counter
-      @redis.sadd "crawled", content_request[:url]
-      increment_crawl_counter
 
-
-      if within_crawl_limits?(content_request[:crawl_limit])
+      content = Cobweb.new(content_request).get(content_request[:url], content_request)
 
-
-
-
-
-
-
-
-
+      if is_permitted_type(content)
+        # if there is no limit or we're still under it lets get the url
+        if within_crawl_limits?(content_request[:crawl_limit])
+          #update the queued and crawled lists if we are within the crawl limits.
+          @redis.srem "queued", content_request[:url]
+          @redis.sadd "crawled", content_request[:url]
+          # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
+          if content_request[:crawl_limit_by_page]
+            if content[:mime_type].match("text/html")
+              increment_crawl_counter
+              increment_crawl_started_counter
+            end
+          else
+            increment_crawl_counter
+            increment_crawl_started_counter
+          end
 
-
-
-
-
-      #
-
+          ## update statistics
+          @stats.update_status("Crawling #{content_request[:url]}...")
+          @stats.update_statistics(content)
+
+          # set the base url if this is the first page
+          set_base_url @redis, content, content_request
+
+          @cobweb_links = CobwebLinks.new(content_request)
+          if within_queue_limits?(content_request[:crawl_limit])
+            internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
+
+            # select the link if its internal
+            internal_links.select!{|link| @cobweb_links.internal?(link)}
 
-
-
-
+            # reject the link if we've crawled it or queued it
+            internal_links.reject!{|link| @redis.sismember("crawled", link)}
+            internal_links.reject!{|link| @redis.sismember("queued", link)}
 
-
-
+            internal_links.each do |link|
+              enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
+            end
          end
-      end
 
-
-
+          # enqueue to processing queue
+          send_to_processing_queue(content, content_request)
 
-
-
-
-
-
-      end
-
-      # if there's nothing left queued or the crawled limit has been reached
-      if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
-        if @redis.scard("queued") == 0
-          finished(content_request)
+          #if the enqueue counter has been requested update that
+          if content_request.has_key? :enqueue_counter_key
+            enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
+            current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+            enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
          end
-
-
+
+          # update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
+          # really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
+          #increment_crawl_counter
+          puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
        end
+      else
+        puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
      end
+
    else
      @redis.srem "queued", content_request[:url]
-      decrement_queue_counter
      puts "Already crawled #{content_request[:url]}" if content_request[:debug]
    end
-
+
+    # if there's nothing left queued or the crawled limit has been reached
+    if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
+      if @queue_counter == 0
+        finished(content_request)
+      end
+    elsif @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
+      finished(content_request)
+    end
+
  end
 
  # Sets the crawl status to 'Crawl Stopped' and enqueues the crawl finished job
  def self.finished(content_request)
    # finished
+    ap "FINISHED"
    @stats.end_crawl(content_request)
    Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
  end
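The heart of the rewritten perform method is the counting rule: when :crawl_limit_by_page is set, only responses whose mime type matches text/html move the crawl counters, while every permitted object is still sent for processing. A condensed sketch of just that decision, using the option and key names from the hunk above; count_towards_limit? is a hypothetical helper name, not part of the gem:

    # Condensed from the perform hunk above; not the full job, only the counting rule.
    def count_towards_limit?(content, content_request)
      return true unless content_request[:crawl_limit_by_page]  # default: every crawled object counts
      !!content[:mime_type].match("text/html")                  # by-page mode: only html pages count
    end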
@@ -95,7 +117,10 @@ class CrawlJob
  # Enqueues the content to the processing queue setup in options
  def self.send_to_processing_queue(content, content_request)
    content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
-    if content_request[:
+    if content_request[:direct_call_process_job]
+      clazz = const_get(content_request[:processing_queue])
+      clazz.perform(content_to_send)
+    elsif content_request[:use_encoding_safe_process_job]
      content_to_send[:body] = Base64.encode64(content[:body])
      content_to_send[:processing_queue] = content_request[:processing_queue]
      Resque.enqueue(EncodingSafeProcessJob, content_to_send)
@@ -103,19 +128,28 @@ class CrawlJob
      Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
    end
    puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}" if content_request[:debug]
-    puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
  end
 
  private
 
+  # Helper method to determine if this content is to be processed or not
+  def self.is_permitted_type(content)
+    @content_request[:valid_mime_types].each do |mime_type|
+      return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
+    end
+    false
+  end
+
  # Returns true if the crawl count is within limits
  def self.within_crawl_limits?(crawl_limit)
+    refresh_counters
    crawl_limit.nil? or @crawl_counter <= crawl_limit.to_i
+    crawl_limit.nil? or @crawl_started_counter < crawl_limit.to_i
  end
 
  # Returns true if the queue count is calculated to be still within limits when complete
  def self.within_queue_limits?(crawl_limit)
-    within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @
+    @content_request[:crawl_limit_by_page] || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_started_counter) < crawl_limit.to_i)
  end
 
  # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
@@ -149,6 +183,10 @@ class CrawlJob
    @redis.incr "crawl-counter"
    refresh_counters
  end
+  def self.increment_crawl_started_counter
+    @redis.incr "crawl-started-counter"
+    refresh_counters
+  end
  # Decrements the queue counter and refreshes crawl counters
  def self.decrement_queue_counter
    @redis.decr "queue-counter"
@@ -157,12 +195,20 @@ class CrawlJob
  # Refreshes the crawl counters
  def self.refresh_counters
    @crawl_counter = @redis.get("crawl-counter").to_i
+    @crawl_started_counter = @redis.get("crawl-started-counter").to_i
    @queue_counter = @redis.get("queue-counter").to_i
  end
+
+  def self.print_counters
+    puts "@crawl_counter: #{@crawl_counter} @crawl_started_counter: #{@crawl_started_counter} @queue_counter: #{@queue_counter}"
+  end
+
  # Sets the crawl counters based on the crawled and queued queues
  def self.reset_counters
+    @redis.set("crawl-started-counter", @redis.smembers("crawled").count)
    @redis.set("crawl-counter", @redis.smembers("crawled").count)
    @redis.set("queue-counter", @redis.smembers("queued").count)
+    @crawl_started_counter = @redis.get("crawl-started-counter").to_i
    @crawl_counter = @redis.get("crawl-counter").to_i
    @queue_counter = @redis.get("queue-counter").to_i
  end
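send_to_processing_queue also gains a :direct_call_process_job branch that resolves the processing class with const_get and calls its perform method in-process instead of enqueuing through Resque. A hedged sketch of a crawl opting into it; MyProcessJob and the option values are placeholders, only the option names come from the hunks above:

    class MyProcessJob
      @queue = :cobweb_process_job

      def self.perform(content)
        puts "processed #{content[:url]} (#{content[:mime_type]})"
      end
    end

    Cobweb.new(
      :processing_queue        => "MyProcessJob",  # looked up via const_get
      :direct_call_process_job => true             # perform is called inline, no Resque round-trip
    ).start("http://test.com")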
data/lib/robots.rb
CHANGED
@@ -28,10 +28,10 @@ class Robots
  def allowed?(url)
    uri = URI.parse(url)
    @params[:allow].each do |pattern|
-      return true if uri.path.match(escape_pattern_for_regex(pattern))
+      return true if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
    end
    @params[:disallow].each do |pattern|
-      return false if uri.path.match(escape_pattern_for_regex(pattern))
+      return false if uri.path.match(Cobweb.escape_pattern_for_regex(pattern))
    end
    true
  end
@@ -45,13 +45,6 @@ class Robots
  end
 
  private
-  # escapes characters with meaning in regular expressions and adds wildcard expression
-  def escape_pattern_for_regex(pattern)
-    pattern = pattern.gsub(".", "\\.")
-    pattern = pattern.gsub("?", "\\?")
-    pattern = pattern.gsub("*", ".*?")
-    pattern
-  end
 
  def parse_data(data)
    user_agents = {}
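Robots#allowed? now runs each allow and disallow pattern through the shared helper before matching it against the request path. A small illustration of that matching, assuming a hypothetical Disallow rule of /private/*; only the escaping and path-matching calls come from the hunks above:

    require 'uri'

    pattern = Cobweb.escape_pattern_for_regex("/private/*")               # => "/private/.*?"
    URI.parse("http://test.com/private/page.html").path.match(pattern) ? :blocked : :allowed
    # => :blocked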
data/spec/cobweb/cobweb_job_spec.rb
CHANGED
@@ -46,7 +46,32 @@ describe Cobweb, :local_only => true do
      Resque.size("cobweb_finished_job").should == 1
    end
  end
-
+  describe "with limited mime_types" do
+    before(:each) do
+      @request = {
+        :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
+        :quiet => true,
+        :cache => nil,
+        :valid_mime_types => ["text/html"]
+      }
+      @cobweb = Cobweb.new @request
+    end
+
+    it "should only crawl html pages" do
+      crawl = @cobweb.start(@base_url)
+      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+      wait_for_crawl_finished crawl[:crawl_id]
+      Resque.size("cobweb_process_job").should == 8
+
+      mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
+      mime_types.count.should == 8
+      mime_types.map{|m| m.should == "text/html"}
+      mime_types.select{|m| m=="text/html"}.count.should == 8
+
+
+    end
+
+  end
  describe "with a crawl limit" do
    before(:each) do
      @request = {
@@ -54,12 +79,12 @@ describe Cobweb, :local_only => true do
        :quiet => true,
        :cache => nil
      }
-      @cobweb = Cobweb.new @request
    end
 
    describe "limit to 1" do
      before(:each) do
        @request[:crawl_limit] = 1
+        @cobweb = Cobweb.new @request
      end
 
      it "should not crawl the entire site" do
@@ -82,11 +107,30 @@ describe Cobweb, :local_only => true do
      end
 
    end
+
+    describe "for pages only" do
+      before(:each) do
+        @request[:crawl_limit_by_page] = true
+        @request[:crawl_limit] = 5
+        @cobweb = Cobweb.new @request
+      end
+
+      it "should only use html pages towards the crawl limit" do
+        crawl = @cobweb.start(@base_url)
+        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+        wait_for_crawl_finished crawl[:crawl_id]
+        mime_types = Resque.peek("cobweb_process_job", 0, 200).map{|job| job["args"][0]["mime_type"]}
+        mime_types.count.should == 70
+        mime_types.select{|m| m=="text/html"}.count.should == 5
+      end
+    end
 
-    describe "limit to
+    describe "limit to 10" do
      before(:each) do
-        @request[:crawl_limit] =
+        @request[:crawl_limit] = 10
+        @cobweb = Cobweb.new @request
      end
+
      it "should not crawl the entire site" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -99,21 +143,21 @@ describe Cobweb, :local_only => true do
        wait_for_crawl_finished crawl[:crawl_id]
        Resque.size("cobweb_finished_job").should == 1
      end
-      it "should only crawl
+      it "should only crawl 10 objects" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
-        Resque.size("cobweb_process_job").should ==
-      end
-
+        Resque.size("cobweb_process_job").should == 10
+      end
    end
 
    describe "limit to 100" do
      before(:each) do
        @request[:crawl_limit] = 100
+        @cobweb = Cobweb.new @request
      end
-
-      it "should crawl the entire site" do
+
+      it "should crawl the entire sample site" do
        crawl = @cobweb.start(@base_url)
        @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
        wait_for_crawl_finished crawl[:crawl_id]
@@ -138,19 +182,21 @@ describe Cobweb, :local_only => true do
    @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
    command = "kill #{(@all_processes - @existing_processes).join(" ")}"
    IO.popen(command)
+
+    clear_queues
  end
 
 end
 
 def wait_for_crawl_finished(crawl_id, timeout=20)
-  counter = 0
-
-
-
+  counter = 0
+  start_time = Time.now
+  while(running?(crawl_id) && Time.now < start_time + timeout) do
+    sleep 0.5
  end
-  if
+  if Time.now > start_time + timeout
    raise "End of crawl not detected"
-  end
+  end
 end
 
 def running?(crawl_id)
@@ -161,9 +207,10 @@ def clear_queues
  Resque.queues.each do |queue|
    Resque.remove_queue(queue)
  end
+  puts "Cleared"
 
  Resque.size("cobweb_process_job").should == 0
-  Resque.size("cobweb_finished_job").should == 0
+  Resque.size("cobweb_finished_job").should == 0
 end
 
 
data/spec/samples/sample_site/index.html
CHANGED
@@ -71,11 +71,6 @@
        </ul>
      </li>
      <li><a href="typography.html">Typography</a></li>
-      <li><a href="boxgrid.html">Boxes Grid</a></li>
-      <li><a href="forms.html">Forms</a></li>
-      <li><a href="gallery.html">Gallery</a></li>
-      <li><a href="tables.html">Tables</a></li>
-      <li><a href="more.html">More</a></li>
    </ul>
    <div class="search">
      <form action="" method="post">
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.58
+  version: 0.0.59
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-
+date: 2012-07-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &
+  requirement: &70204470213880 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70204470213880
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &
+  requirement: &70204470212220 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70204470212220
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &
+  requirement: &70204470211500 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70204470211500
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &
+  requirement: &70204470210300 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70204470210300
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &
+  requirement: &70204470208860 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70204470208860
 - !ruby/object:Gem::Dependency
   name: awesome_print
-  requirement: &
+  requirement: &70204470223880 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70204470223880
 - !ruby/object:Gem::Dependency
   name: sinatra
-  requirement: &
+  requirement: &70204470223280 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70204470223280
 - !ruby/object:Gem::Dependency
   name: thin
-  requirement: &
+  requirement: &70204470222720 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70204470222720
 - !ruby/object:Gem::Dependency
   name: haml
-  requirement: &
+  requirement: &70204470222160 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
        version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70204470222160
 - !ruby/object:Gem::Dependency
   name: namespaced_redis
-  requirement: &
+  requirement: &70204470221480 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
        version: 1.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70204470221480
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
   crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
   is also a standalone crawler that has a sophisticated statistics monitoring interface