cobweb 0.0.44 → 0.0.45
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +15 -1
- data/lib/cobweb.rb +68 -34
- data/lib/cobweb_crawler.rb +10 -36
- data/lib/cobweb_links.rb +48 -0
- data/lib/cobweb_version.rb +6 -0
- data/lib/content_link_parser.rb +6 -2
- data/lib/crawl_job.rb +8 -34
- data/spec/cobweb/cobweb_links_spec.rb +103 -0
- data/spec/cobweb/cobweb_spec.rb +4 -4
- data/spec/cobweb/content_link_parser_spec.rb +12 -2
- data/spec/samples/sample_html_links.html +4 -1
- data/spec/spec_helper.rb +8 -3
- metadata +26 -23
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb v0.0.44
|
2
|
+
h1. Cobweb v0.0.45
|
3
3
|
|
4
4
|
h2. Intro
|
5
5
|
|
@@ -101,6 +101,20 @@ h3. Contributing/Testing
|
|
101
101
|
|
102
102
|
Continuous integration testing is performed by the excellent Travis: http://travis-ci.org/#!/stewartmckee/cobweb
|
103
103
|
|
104
|
+
h2. Todo
|
105
|
+
|
106
|
+
* Tidy up classes with link parsing
|
107
|
+
* Refactoring of code to simplify design
|
108
|
+
* Remove requirement of redis from standalone crawler
|
109
|
+
* Add redis settings to standalone crawler (ie to connect to remote redis)
|
110
|
+
* Add ability to start and stop crawls from web interface
|
111
|
+
* Allow crawler to start as web interface only (ie not run crawls at start)
|
112
|
+
* Fix content encoding issue requiring separate process job
|
113
|
+
|
114
|
+
h3. Big changes
|
115
|
+
|
116
|
+
* Refactor into a module and refactor class names to remove cobweb and increase simplicity
|
117
|
+
|
104
118
|
h2. License
|
105
119
|
|
106
120
|
h3. The MIT License
|
data/lib/cobweb.rb
CHANGED
@@ -20,7 +20,7 @@ class Cobweb
|
|
20
20
|
# investigate using event machine for single threaded crawling
|
21
21
|
|
22
22
|
def self.version
|
23
|
-
|
23
|
+
CobwebVersion.version
|
24
24
|
end
|
25
25
|
|
26
26
|
def method_missing(method_sym, *arguments, &block)
|
@@ -56,9 +56,9 @@ class Cobweb
|
|
56
56
|
:url => base_url
|
57
57
|
}
|
58
58
|
|
59
|
-
if @options[:internal_urls].empty?
|
59
|
+
if @options[:internal_urls].nil? || @options[:internal_urls].empty?
|
60
60
|
uri = Addressable::URI.parse(base_url)
|
61
|
-
@options[:internal_urls]
|
61
|
+
@options[:internal_urls] = [[uri.scheme, "://", uri.host, "/*"].join]
|
62
62
|
end
|
63
63
|
|
64
64
|
request.merge!(@options)
|
@@ -79,6 +79,7 @@ class Cobweb
|
|
79
79
|
def get(url, options = @options)
|
80
80
|
raise "url cannot be nil" if url.nil?
|
81
81
|
uri = Addressable::URI.parse(url)
|
82
|
+
uri.normalize!
|
82
83
|
uri.fragment=nil
|
83
84
|
url = uri.to_s
|
84
85
|
|
@@ -104,9 +105,6 @@ class Cobweb
|
|
104
105
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
105
106
|
content = deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
|
106
107
|
else
|
107
|
-
# this url is valid for processing so lets get on with it
|
108
|
-
#TODO the @http here is different from in head. Should it be? - in head we are using a method-scoped variable.
|
109
|
-
|
110
108
|
# retrieve data
|
111
109
|
unless @http && @http.address == uri.host && @http.port == uri.inferred_port
|
112
110
|
puts "Creating connection to #{uri.host}..." unless @options[:quiet]
|
@@ -122,7 +120,11 @@ class Cobweb
|
|
122
120
|
@http.open_timeout = @options[:timeout].to_i
|
123
121
|
begin
|
124
122
|
print "Retrieving #{url }... " unless @options[:quiet]
|
125
|
-
|
123
|
+
request_options={}
|
124
|
+
if options[:cookies]
|
125
|
+
request_options[ 'Cookie']= options[:cookies]
|
126
|
+
end
|
127
|
+
request = Net::HTTP::Get.new uri.request_uri, request_options
|
126
128
|
|
127
129
|
response = @http.request request
|
128
130
|
|
@@ -135,14 +137,11 @@ class Cobweb
|
|
135
137
|
# decrement redirect limit
|
136
138
|
redirect_limit = redirect_limit - 1
|
137
139
|
|
138
|
-
# raise exception if we're being redirected to somewhere we've been redirected to in this content request
|
139
|
-
#raise RedirectError("Loop detected in redirect for - #{url}") if content[:redirect_through].include? url
|
140
|
-
|
141
|
-
# raise exception if redirect limit has reached 0
|
142
140
|
raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
|
141
|
+
cookies = get_cookies(response)
|
143
142
|
|
144
143
|
# get the content from redirect location
|
145
|
-
content = get(url, options.merge(:redirect_limit => redirect_limit))
|
144
|
+
content = get(url, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
|
146
145
|
content[:url] = uri.to_s
|
147
146
|
content[:redirect_through] = [] if content[:redirect_through].nil?
|
148
147
|
content[:redirect_through].insert(0, url)
|
@@ -186,7 +185,7 @@ class Cobweb
|
|
186
185
|
redis.expire unique_id, @options[:cache].to_i
|
187
186
|
end
|
188
187
|
rescue RedirectError => e
|
189
|
-
puts "ERROR: #{e.message}"
|
188
|
+
puts "ERROR RedirectError: #{e.message}"
|
190
189
|
|
191
190
|
## generate a blank content
|
192
191
|
content = {}
|
@@ -201,7 +200,7 @@ class Cobweb
|
|
201
200
|
content[:links] = {}
|
202
201
|
|
203
202
|
rescue SocketError => e
|
204
|
-
puts "ERROR:
|
203
|
+
puts "ERROR SocketError: #{e.message}"
|
205
204
|
|
206
205
|
## generate a blank content
|
207
206
|
content = {}
|
@@ -233,10 +232,20 @@ class Cobweb
|
|
233
232
|
end
|
234
233
|
content
|
235
234
|
end
|
236
|
-
|
235
|
+
|
236
|
+
def get_cookies(response)
|
237
|
+
all_cookies = response.get_fields('set-cookie')
|
238
|
+
cookies_array = Array.new
|
239
|
+
all_cookies.each { |cookie|
|
240
|
+
cookies_array.push(cookie.split('; ')[0])
|
241
|
+
}
|
242
|
+
cookies = cookies_array.join('; ')
|
243
|
+
end
|
244
|
+
|
237
245
|
def head(url, options = @options)
|
238
246
|
raise "url cannot be nil" if url.nil?
|
239
247
|
uri = Addressable::URI.parse(url)
|
248
|
+
uri.normalize!
|
240
249
|
uri.fragment=nil
|
241
250
|
url = uri.to_s
|
242
251
|
|
@@ -255,37 +264,47 @@ class Cobweb
|
|
255
264
|
redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
|
256
265
|
end
|
257
266
|
|
258
|
-
content = {}
|
267
|
+
content = {:base_url => url}
|
259
268
|
|
260
269
|
# check if it has already been cached
|
261
270
|
if redis.get("head-#{unique_id}") and @options[:cache]
|
262
271
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
263
272
|
content = deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
|
264
273
|
else
|
265
|
-
print "Retrieving #{url }... " unless @options[:quiet]
|
266
|
-
|
267
274
|
# retrieve data
|
268
|
-
http
|
275
|
+
unless @http && @http.address == uri.host && @http.port == uri.inferred_port
|
276
|
+
puts "Creating connection to #{uri.host}..." unless @options[:quiet]
|
277
|
+
@http = Net::HTTP.new(uri.host, uri.inferred_port)
|
278
|
+
end
|
269
279
|
if uri.scheme == "https"
|
270
|
-
http.use_ssl = true
|
271
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
272
|
-
end
|
280
|
+
@http.use_ssl = true
|
281
|
+
@http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
282
|
+
end
|
273
283
|
|
274
284
|
request_time = Time.now.to_f
|
275
|
-
http.read_timeout = @options[:timeout].to_i
|
276
|
-
http.open_timeout = @options[:timeout].to_i
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
285
|
+
@http.read_timeout = @options[:timeout].to_i
|
286
|
+
@http.open_timeout = @options[:timeout].to_i
|
287
|
+
begin
|
288
|
+
print "Retrieving #{url }... " unless @options[:quiet]
|
289
|
+
request_options={}
|
290
|
+
if options[:cookies]
|
291
|
+
request_options[ 'Cookie']= options[:cookies]
|
292
|
+
end
|
293
|
+
request = Net::HTTP::Head.new uri.request_uri, request_options
|
294
|
+
|
295
|
+
response = @http.request request
|
281
296
|
|
282
297
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
283
298
|
puts "redirected... " unless @options[:quiet]
|
299
|
+
|
284
300
|
url = UriHelper.join_no_fragment(uri, response['location'])
|
301
|
+
|
285
302
|
redirect_limit = redirect_limit - 1
|
286
|
-
|
287
|
-
|
288
|
-
|
303
|
+
|
304
|
+
raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
|
305
|
+
cookies = get_cookies(response)
|
306
|
+
|
307
|
+
content = head(url, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
|
289
308
|
content[:url] = uri.to_s
|
290
309
|
content[:redirect_through] = [] if content[:redirect_through].nil?
|
291
310
|
content[:redirect_through].insert(0, url)
|
@@ -293,7 +312,7 @@ class Cobweb
|
|
293
312
|
content[:url] = uri.to_s
|
294
313
|
content[:status_code] = response.code.to_i
|
295
314
|
unless response.content_type.nil?
|
296
|
-
content[:mime_type] = response.content_type.split(";")[0].strip
|
315
|
+
content[:mime_type] = response.content_type.split(";")[0].strip
|
297
316
|
if response["Content-Type"].include? ";"
|
298
317
|
charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
|
299
318
|
charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
|
@@ -310,8 +329,23 @@ class Cobweb
|
|
310
329
|
puts "Not storing in cache as cache disabled" if @options[:debug]
|
311
330
|
end
|
312
331
|
end
|
332
|
+
rescue RedirectError => e
|
333
|
+
puts "ERROR RedirectError: #{e.message}"
|
334
|
+
|
335
|
+
## generate a blank content
|
336
|
+
content = {}
|
337
|
+
content[:url] = uri.to_s
|
338
|
+
content[:response_time] = Time.now.to_f - request_time
|
339
|
+
content[:status_code] = 0
|
340
|
+
content[:length] = 0
|
341
|
+
content[:body] = ""
|
342
|
+
content[:error] = e.message
|
343
|
+
content[:mime_type] = "error/dnslookup"
|
344
|
+
content[:headers] = {}
|
345
|
+
content[:links] = {}
|
346
|
+
|
313
347
|
rescue SocketError => e
|
314
|
-
puts "ERROR: #{e.message}"
|
348
|
+
puts "ERROR SocketError: #{e.message}"
|
315
349
|
|
316
350
|
## generate a blank content
|
317
351
|
content = {}
|
@@ -326,7 +360,7 @@ class Cobweb
|
|
326
360
|
content[:links] = {}
|
327
361
|
|
328
362
|
rescue Timeout::Error => e
|
329
|
-
puts "ERROR: #{e.message}"
|
363
|
+
puts "ERROR Timeout::Error: #{e.message}"
|
330
364
|
|
331
365
|
## generate a blank content
|
332
366
|
content = {}
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -62,17 +62,17 @@ class CobwebCrawler
|
|
62
62
|
|
63
63
|
@redis.sadd "crawled", url.to_s
|
64
64
|
@redis.incr "crawl-counter"
|
65
|
-
|
66
|
-
internal_links =
|
65
|
+
|
66
|
+
internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
|
67
67
|
|
68
|
+
# select the link if its internal (eliminate external before expensive lookups in queued and crawled)
|
69
|
+
cobweb_links = CobwebLinks.new(@options)
|
70
|
+
internal_links = internal_links.select{|link| cobweb_links.internal?(link)}
|
71
|
+
|
68
72
|
# reject the link if we've crawled it or queued it
|
69
73
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
70
74
|
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
71
|
-
|
72
|
-
|
73
|
-
# select the link if its internal
|
74
|
-
internal_links.select!{|link| internal_link?(link)}
|
75
|
-
|
75
|
+
|
76
76
|
internal_links.each do |link|
|
77
77
|
puts "Added #{link.to_s} to queue" if @debug
|
78
78
|
@redis.sadd "queued", link
|
@@ -85,10 +85,11 @@ class CobwebCrawler
|
|
85
85
|
@stats.update_statistics(content, crawl_counter, queue_counter)
|
86
86
|
@stats.update_status("Completed #{url}.")
|
87
87
|
puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @debug
|
88
|
-
|
89
|
-
yield content, @
|
88
|
+
|
89
|
+
yield content, @stats.get_statistics if block_given?
|
90
90
|
|
91
91
|
rescue => e
|
92
|
+
raise e if ENVIRONMENT == "test"
|
92
93
|
puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
|
93
94
|
ap e
|
94
95
|
ap e.backtrace
|
@@ -105,33 +106,6 @@ class CobwebCrawler
|
|
105
106
|
@stats.get_statistics
|
106
107
|
end
|
107
108
|
|
108
|
-
|
109
|
-
def internal_link?(link)
|
110
|
-
puts "Checking internal link for: #{link}" if @debug
|
111
|
-
valid_link = true
|
112
|
-
internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
|
113
|
-
puts "Matching against #{pattern.source}" if @debug
|
114
|
-
if link.match(pattern)
|
115
|
-
puts "Matched as internal" if @debug
|
116
|
-
return true
|
117
|
-
end
|
118
|
-
end
|
119
|
-
puts "Didn't match any pattern so marked as not internal" if @debug
|
120
|
-
false
|
121
|
-
end
|
122
|
-
|
123
|
-
def internal_patterns
|
124
|
-
@internal_patterns ||= @redis.smembers("internal_urls")
|
125
|
-
end
|
126
|
-
|
127
|
-
def all_links_from_content(content)
|
128
|
-
links = content[:links].keys.map{|key| content[:links][key]}.flatten
|
129
|
-
links.reject!{|link| link.cobweb_starts_with?("javascript:")}
|
130
|
-
links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
|
131
|
-
links.select!{|link| link.scheme == "http" || link.scheme == "https"}
|
132
|
-
links.uniq
|
133
|
-
links
|
134
|
-
end
|
135
109
|
end
|
136
110
|
|
137
111
|
class String
|
data/lib/cobweb_links.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
class CobwebLinks
|
2
|
+
|
3
|
+
# processes links supplied to it
|
4
|
+
def initialize(options={})
|
5
|
+
@options = options
|
6
|
+
|
7
|
+
raise InternalUrlsMissingError, ":internal_urls is required" unless @options.has_key? :internal_urls
|
8
|
+
raise InvalidUrlsError, ":internal_urls must be an array" unless @options[:internal_urls].kind_of? Array
|
9
|
+
raise InvalidUrlsError, ":external_urls must be an array" unless !@options.has_key?(:external_urls) || @options[:external_urls].kind_of?(Array)
|
10
|
+
@options[:external_urls] = [] unless @options.has_key? :external_urls
|
11
|
+
@options[:debug] = false unless @options.has_key? :debug
|
12
|
+
|
13
|
+
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{pattern.gsub(".", "\\.").gsub("*", ".*?")}")}
|
14
|
+
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{pattern.gsub(".", "\\.").gsub("*", ".*?")}")}
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
def internal?(link)
|
19
|
+
if @options[:debug]
|
20
|
+
puts "--------------------------------"
|
21
|
+
puts "Link: #{link}"
|
22
|
+
puts "Internal matches"
|
23
|
+
ap @internal_patterns.select{|pattern| link.match(pattern)}
|
24
|
+
puts "External matches"
|
25
|
+
ap @external_patterns.select{|pattern| link.match(pattern)}
|
26
|
+
end
|
27
|
+
!@internal_patterns.select{|pattern| link.match(pattern)}.empty? && @external_patterns.select{|pattern| link.match(pattern)}.empty?
|
28
|
+
end
|
29
|
+
|
30
|
+
def external?(link)
|
31
|
+
if @options[:debug]
|
32
|
+
puts "--------------------------------"
|
33
|
+
puts "Link: #{link}"
|
34
|
+
puts "Internal matches"
|
35
|
+
ap @internal_patterns.select{|pattern| link.match(pattern)}
|
36
|
+
puts "External matches"
|
37
|
+
ap @external_patterns.select{|pattern| link.match(pattern)}
|
38
|
+
end
|
39
|
+
@internal_patterns.select{|pattern| link.match(pattern)}.empty? || !@external_patterns.select{|pattern| link.match(pattern)}.empty?
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
44
|
+
class InternalUrlsMissingError < Exception
|
45
|
+
end
|
46
|
+
class InvalidUrlsError < Exception
|
47
|
+
end
|
48
|
+
|
data/lib/content_link_parser.rb
CHANGED
@@ -37,9 +37,13 @@ class ContentLinkParser
|
|
37
37
|
data
|
38
38
|
end
|
39
39
|
|
40
|
-
def all_links
|
40
|
+
def all_links(options = {})
|
41
|
+
options[:valid_schemes] = [:http, :https] unless options.has_key? :valid_schemes
|
41
42
|
data = link_data
|
42
|
-
data.keys.map{|key| data[key]}.flatten.uniq
|
43
|
+
data = data.keys.map{|key| data[key]}.flatten.uniq
|
44
|
+
links = data.select{|link| options[:valid_schemes].include? link.split(':')[0].to_sym}
|
45
|
+
links = links.map{|link| UriHelper.join_no_fragment(@url, link).to_s }
|
46
|
+
links
|
43
47
|
end
|
44
48
|
|
45
49
|
def method_missing(m)
|
data/lib/crawl_job.rb
CHANGED
@@ -38,16 +38,17 @@ class CrawlJob
|
|
38
38
|
# set the base url if this is the first page
|
39
39
|
set_base_url @redis, content, content_request
|
40
40
|
|
41
|
+
@cobweb_links = CobwebLinks.new(content_request)
|
41
42
|
if within_queue_limits?(content_request[:crawl_limit])
|
42
|
-
internal_links =
|
43
|
+
internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
|
44
|
+
|
45
|
+
# select the link if its internal
|
46
|
+
internal_links.select!{|link| @cobweb_links.internal?(link)}
|
43
47
|
|
44
48
|
# reject the link if we've crawled it or queued it
|
45
49
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
46
50
|
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
47
|
-
|
48
|
-
# select the link if its internal
|
49
|
-
internal_links.select!{|link| internal_link?(link)}
|
50
|
-
|
51
|
+
|
51
52
|
internal_links.each do |link|
|
52
53
|
enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
|
53
54
|
end
|
@@ -83,11 +84,11 @@ class CrawlJob
|
|
83
84
|
def self.finished(content_request)
|
84
85
|
# finished
|
85
86
|
@stats.end_crawl(content_request)
|
86
|
-
Resque.enqueue(const_get(content_request[:crawl_finished_queue]),
|
87
|
+
Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
|
87
88
|
end
|
88
89
|
|
89
90
|
def self.send_to_processing_queue(content, content_request)
|
90
|
-
content_to_send = content.merge({:internal_urls =>
|
91
|
+
content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
|
91
92
|
if content_request[:use_encoding_safe_process_job]
|
92
93
|
content_to_send[:body] = Base64.encode64(content[:body])
|
93
94
|
content_to_send[:processing_queue] = content_request[:processing_queue]
|
@@ -119,33 +120,6 @@ class CrawlJob
|
|
119
120
|
end
|
120
121
|
end
|
121
122
|
|
122
|
-
def self.internal_link?(link)
|
123
|
-
puts "Checking internal link for: #{link}" if @debug
|
124
|
-
valid_link = true
|
125
|
-
internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
|
126
|
-
puts "Matching against #{pattern.source}" if @debug
|
127
|
-
if link.match(pattern)
|
128
|
-
puts "Matched as internal" if @debug
|
129
|
-
return true
|
130
|
-
end
|
131
|
-
end
|
132
|
-
puts "Didn't match any pattern so marked as not internal" if @debug
|
133
|
-
false
|
134
|
-
end
|
135
|
-
|
136
|
-
def self.internal_patterns
|
137
|
-
@internal_patterns ||= @redis.smembers("internal_urls")
|
138
|
-
end
|
139
|
-
|
140
|
-
def self.all_links_from_content(content)
|
141
|
-
links = content[:links].keys.map{|key| content[:links][key]}.flatten
|
142
|
-
links.reject!{|link| link.starts_with?("javascript:")}
|
143
|
-
links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
|
144
|
-
links.select!{|link| link.scheme == "http" || link.scheme == "https"}
|
145
|
-
links.uniq
|
146
|
-
links
|
147
|
-
end
|
148
|
-
|
149
123
|
def self.enqueue_content(content_request, link)
|
150
124
|
new_request = content_request.clone
|
151
125
|
new_request[:url] = link
|
@@ -0,0 +1,103 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../lib/cobweb_links')
|
3
|
+
|
4
|
+
describe CobwebLinks do
|
5
|
+
|
6
|
+
before(:each) do
|
7
|
+
|
8
|
+
@base_url = "http://www.baseurl.com/"
|
9
|
+
|
10
|
+
@default_headers = {"Cache-Control" => "private, max-age=0",
|
11
|
+
"Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
|
12
|
+
"Expires" => "-1",
|
13
|
+
"Content-Type" => "text/html; charset=UTF-8",
|
14
|
+
"Content-Encoding" => "gzip",
|
15
|
+
"Transfer-Encoding" => "chunked",
|
16
|
+
"Server" => "gws",
|
17
|
+
"X-XSS-Protection" => "1; mode=block"}
|
18
|
+
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
it "should generate a cobweb_links object" do
|
23
|
+
CobwebLinks.new(:internal_urls => [""]).should be_an_instance_of CobwebLinks
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should raise error with no internal links" do
|
27
|
+
expect {CobwebLinks.new()}.to raise_error(InternalUrlsMissingError)
|
28
|
+
end
|
29
|
+
it "should not raise error with missing external links" do
|
30
|
+
expect {CobwebLinks.new(:internal_urls => ["http://domain_one.com/"])}.to_not raise_error(InternalUrlsMissingError)
|
31
|
+
end
|
32
|
+
it "should raise error with invalid internal links" do
|
33
|
+
expect {CobwebLinks.new(:internal_urls => "")}.to raise_error(InvalidUrlsError)
|
34
|
+
end
|
35
|
+
it "should raise error with invalid external links" do
|
36
|
+
expect {CobwebLinks.new(:internal_urls => [], :external_urls => "")}.to raise_error(InvalidUrlsError)
|
37
|
+
end
|
38
|
+
|
39
|
+
|
40
|
+
describe "internal and external links" do
|
41
|
+
it "should only return internal links" do
|
42
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
|
43
|
+
cobweb_links.internal?("http://domain_one.com/pageone.html").should be_true
|
44
|
+
cobweb_links.internal?("http://domain_one.com/pagetwo.html").should be_true
|
45
|
+
end
|
46
|
+
it "should not return external links" do
|
47
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
|
48
|
+
cobweb_links.external?("http://domain_one.com/pageone.html").should be_false
|
49
|
+
cobweb_links.external?("http://domain_two.com/pageone.html").should be_true
|
50
|
+
cobweb_links.external?("http://external.com/pageone.html").should be_true
|
51
|
+
end
|
52
|
+
it "should override internal links with external links" do
|
53
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_one.com/blog"])
|
54
|
+
cobweb_links.internal?("http://domain_one.com/pageone.html").should be_true
|
55
|
+
cobweb_links.external?("http://domain_one.com/pageone.html").should be_false
|
56
|
+
cobweb_links.internal?("http://domain_one.com/blog/pageone.html").should be_false
|
57
|
+
cobweb_links.external?("http://domain_one.com/blog/pageone.html").should be_true
|
58
|
+
cobweb_links.internal?("http://domain_two.com/blog/pageone.html").should be_false
|
59
|
+
cobweb_links.external?("http://domain_two.com/blog/pageone.html").should be_true
|
60
|
+
end
|
61
|
+
end
|
62
|
+
it "should only match from beginning of url" do
|
63
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://www.domain_two.com/"])
|
64
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
|
65
|
+
cobweb_links.internal?("http://www.domain_two.com/pageone.html").should be_false
|
66
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html?url=http://www.domain_two.com/pageone.html").should be_true
|
67
|
+
cobweb_links.internal?("http://www.domain_two.com/pageone.html?url=http://www.domain_one.com/pageone.html").should be_false
|
68
|
+
end
|
69
|
+
|
70
|
+
describe "using wildcards" do
|
71
|
+
it "should match internal links with wildcards" do
|
72
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.com/"], :external_urls => ["http://blog.domain_one.com/"])
|
73
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
|
74
|
+
cobweb_links.internal?("http://images.domain_one.com/logo.png").should be_true
|
75
|
+
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
|
76
|
+
end
|
77
|
+
it "should match external links with wildcards" do
|
78
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://*.domain_one.com/"])
|
79
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_false
|
80
|
+
cobweb_links.internal?("http://images.domain_one.com/logo.png").should be_false
|
81
|
+
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
|
82
|
+
end
|
83
|
+
it "should allow multiple wildcards" do
|
84
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.*.domain_one.com/"])
|
85
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_false
|
86
|
+
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
|
87
|
+
cobweb_links.internal?("http://www.marketing.domain_one.com/pageone.html").should be_true
|
88
|
+
cobweb_links.internal?("http://blog.designers.domain_one.com/pagetwo.html").should be_true
|
89
|
+
end
|
90
|
+
it "should allow multiple country tlds with wildcards" do
|
91
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.*/", "http://*.domain_one.*.*/"])
|
92
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
|
93
|
+
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_true
|
94
|
+
cobweb_links.internal?("http://www.domain_one.co.uk/pageone.html").should be_true
|
95
|
+
cobweb_links.internal?("http://blog.domain_one.co.uk/pageone.html").should be_true
|
96
|
+
cobweb_links.internal?("http://www.domain_one.com.au/pageone.html").should be_true
|
97
|
+
cobweb_links.internal?("http://blog.domain_one.com.au/pageone.html").should be_true
|
98
|
+
cobweb_links.internal?("http://www.domain_one.ie/pageone.html").should be_true
|
99
|
+
cobweb_links.internal?("http://blog.domain_one.ie/pageone.html").should be_true
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
end
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -186,22 +186,22 @@ describe Cobweb do
|
|
186
186
|
describe "location setting" do
|
187
187
|
it "Get should strip fragments" do
|
188
188
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
189
|
-
Net::HTTP::Get.should_receive(:new).with("/")
|
189
|
+
Net::HTTP::Get.should_receive(:new).with("/", {})
|
190
190
|
@cobweb.get("http://www.google.com/#ignore")
|
191
191
|
end
|
192
192
|
it "head should strip fragments" do
|
193
193
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
194
|
-
Net::HTTP::Head.should_receive(:new).with("/").and_return(@mock_http_request)
|
194
|
+
Net::HTTP::Head.should_receive(:new).with("/", {}).and_return(@mock_http_request)
|
195
195
|
@cobweb.head("http://www.google.com/#ignore")
|
196
196
|
end
|
197
197
|
it "get should not strip path" do
|
198
198
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
199
|
-
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff")
|
199
|
+
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {})
|
200
200
|
@cobweb.get("http://www.google.com/path/to/stuff#ignore")
|
201
201
|
end
|
202
202
|
it "get should not strip query string" do
|
203
203
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
204
|
-
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string")
|
204
|
+
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {})
|
205
205
|
@cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
|
206
206
|
end
|
207
207
|
end
|
@@ -29,7 +29,7 @@ describe ContentLinkParser do
|
|
29
29
|
end
|
30
30
|
it "should return the correct links" do
|
31
31
|
links = @content_parser.links
|
32
|
-
links.length.should ==
|
32
|
+
links.length.should == 7
|
33
33
|
end
|
34
34
|
end
|
35
35
|
describe "returning image links" do
|
@@ -92,7 +92,17 @@ describe ContentLinkParser do
|
|
92
92
|
link_data.should be_an_instance_of Hash
|
93
93
|
|
94
94
|
link_data.keys.length.should == 5
|
95
|
-
link_data[:links].length.should ==
|
95
|
+
link_data[:links].length.should == 7
|
96
|
+
end
|
97
|
+
|
98
|
+
it "should return all http and https links by default" do
|
99
|
+
links = @content_parser.all_links
|
100
|
+
links.count.should == 9
|
101
|
+
end
|
102
|
+
|
103
|
+
it "should return only valid_schemes supplied" do
|
104
|
+
links = @content_parser.all_links(:valid_schemes => [:https])
|
105
|
+
links.count.should == 1
|
96
106
|
end
|
97
107
|
end
|
98
108
|
|
@@ -23,7 +23,10 @@
|
|
23
23
|
|
24
24
|
<body bgcolor="#FFFFFF"><!-- #BeginLibraryItem "/Library/navtop.lbi" --></p>
|
25
25
|
|
26
|
-
<a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
|
26
|
+
<a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
|
27
|
+
<a href="mailto:stewart@theizone.co.uk">Click Here to email </a>
|
28
|
+
<a href="javascript:alert('javascript clicked');">click here for javscript</a>
|
29
|
+
<a href="https://sampleurl-a.com/">Click Here for SSL link to URL 1</a>
|
27
30
|
<frameset><frame src="http://sampleurl-frame.com/"></frame></frameset>
|
28
31
|
|
29
32
|
<map id="testmap"><area href="http://sampleurl-area"></area>></map>
|
data/spec/spec_helper.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
|
2
2
|
require 'mock_redis'
|
3
3
|
|
4
|
+
ENVIRONMENT = "test"
|
5
|
+
|
4
6
|
RSpec.configure do |config|
|
5
7
|
config.before(:each) {
|
8
|
+
|
6
9
|
redis_mock = double("redis")
|
7
10
|
redis_mock.stub(:new).and_return(@redis_mock_object)
|
8
11
|
|
9
12
|
#redis_mock.flushdb
|
10
13
|
|
11
|
-
|
12
14
|
@default_headers = {"Cache-Control" => "private, max-age=0",
|
13
15
|
"Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
|
14
16
|
"Expires" => "-1",
|
@@ -39,8 +41,8 @@ RSpec.configure do |config|
|
|
39
41
|
|
40
42
|
Net::HTTP.stub!(:new).and_return(@mock_http_client)
|
41
43
|
Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
|
42
|
-
Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
|
43
|
-
Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)
|
44
|
+
Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
|
45
|
+
Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
|
44
46
|
|
45
47
|
Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
|
46
48
|
|
@@ -60,6 +62,7 @@ RSpec.configure do |config|
|
|
60
62
|
@mock_http_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
|
61
63
|
@mock_http_response.stub!(:content_length).and_return(1024)
|
62
64
|
@mock_http_response.stub!(:body).and_return("asdf")
|
65
|
+
@mock_http_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
|
63
66
|
@mock_http_response.stub!(:to_hash).and_return(@default_headers)
|
64
67
|
|
65
68
|
@mock_http_redirect_response.stub!(:code).and_return(301)
|
@@ -69,6 +72,7 @@ RSpec.configure do |config|
|
|
69
72
|
@mock_http_redirect_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
|
70
73
|
@mock_http_redirect_response.stub!(:content_length).and_return(2048)
|
71
74
|
@mock_http_redirect_response.stub!(:body).and_return("redirected body")
|
75
|
+
@mock_http_redirect_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
|
72
76
|
@mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)
|
73
77
|
|
74
78
|
@mock_http_redirect_response2.stub!(:code).and_return(301)
|
@@ -78,6 +82,7 @@ RSpec.configure do |config|
|
|
78
82
|
@mock_http_redirect_response2.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
|
79
83
|
@mock_http_redirect_response2.stub!(:content_length).and_return(2048)
|
80
84
|
@mock_http_redirect_response2.stub!(:body).and_return("redirected body")
|
85
|
+
@mock_http_redirect_response2.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
|
81
86
|
@mock_http_redirect_response2.stub!(:to_hash).and_return(@default_headers)
|
82
87
|
}
|
83
88
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.44
|
4
|
+
version: 0.0.45
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-05-
|
12
|
+
date: 2012-05-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70107297058680 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70107297058680
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70107297056880 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70107297056880
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70107297056360 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70107297056360
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70107297055840 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70107297055840
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70107297055120 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70107297055120
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70107297054400 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70107297054400
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70107297053900 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70107297053900
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70107297053360 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70107297053360
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70107297052800 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,18 +109,18 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70107297052800
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70107297052240 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
119
119
|
- !ruby/object:Gem::Version
|
120
|
-
version:
|
120
|
+
version: 1.0.2
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70107297052240
|
124
124
|
description: Web Crawler that uses resque background job engine to allow you to cluster
|
125
125
|
your crawl.
|
126
126
|
email: stewart@rockwellcottage.com
|
@@ -131,6 +131,7 @@ extra_rdoc_files:
|
|
131
131
|
files:
|
132
132
|
- spec/cobweb/cobweb_crawler_spec.rb
|
133
133
|
- spec/cobweb/cobweb_job_spec.rb
|
134
|
+
- spec/cobweb/cobweb_links_spec.rb
|
134
135
|
- spec/cobweb/cobweb_spec.rb
|
135
136
|
- spec/cobweb/content_link_parser_spec.rb
|
136
137
|
- spec/samples/sample_html_links.html
|
@@ -139,7 +140,9 @@ files:
|
|
139
140
|
- lib/cobweb.rb
|
140
141
|
- lib/cobweb_crawler.rb
|
141
142
|
- lib/cobweb_finished_job.rb
|
143
|
+
- lib/cobweb_links.rb
|
142
144
|
- lib/cobweb_process_job.rb
|
145
|
+
- lib/cobweb_version.rb
|
143
146
|
- lib/content_link_parser.rb
|
144
147
|
- lib/crawl_job.rb
|
145
148
|
- lib/encoding_safe_process_job.rb
|