cobweb 0.0.44 → 0.0.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +15 -1
- data/lib/cobweb.rb +68 -34
- data/lib/cobweb_crawler.rb +10 -36
- data/lib/cobweb_links.rb +48 -0
- data/lib/cobweb_version.rb +6 -0
- data/lib/content_link_parser.rb +6 -2
- data/lib/crawl_job.rb +8 -34
- data/spec/cobweb/cobweb_links_spec.rb +103 -0
- data/spec/cobweb/cobweb_spec.rb +4 -4
- data/spec/cobweb/content_link_parser_spec.rb +12 -2
- data/spec/samples/sample_html_links.html +4 -1
- data/spec/spec_helper.rb +8 -3
- metadata +26 -23
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb v0.0.44
|
2
|
+
h1. Cobweb v0.0.45
|
3
3
|
|
4
4
|
h2. Intro
|
5
5
|
|
@@ -101,6 +101,20 @@ h3. Contributing/Testing
|
|
101
101
|
|
102
102
|
Continuous integration testing is performed by the excellent Travis: http://travis-ci.org/#!/stewartmckee/cobweb
|
103
103
|
|
104
|
+
h2. Todo
|
105
|
+
|
106
|
+
* Tidy up classes with link parsing
|
107
|
+
* Refactoring of code to simplify design
|
108
|
+
* Remove requirement of redis from standalone crawler
|
109
|
+
* Add redis settings to standalone crawler (ie to connect to remote redis)
|
110
|
+
* Add ability to start and stop crawls from web interface
|
111
|
+
* Allow crawler to start as web interface only (ie not run crawls at start)
|
112
|
+
* Fix content encoding issue requiring separate process job
|
113
|
+
|
114
|
+
h3. Big changes
|
115
|
+
|
116
|
+
* Refactor into a module and refactor class names to remove cobweb and increase simplicity
|
117
|
+
|
104
118
|
h2. License
|
105
119
|
|
106
120
|
h3. The MIT License
|
data/lib/cobweb.rb
CHANGED
@@ -20,7 +20,7 @@ class Cobweb
|
|
20
20
|
# investigate using event machine for single threaded crawling
|
21
21
|
|
22
22
|
def self.version
|
23
|
-
|
23
|
+
CobwebVersion.version
|
24
24
|
end
|
25
25
|
|
26
26
|
def method_missing(method_sym, *arguments, &block)
|
@@ -56,9 +56,9 @@ class Cobweb
|
|
56
56
|
:url => base_url
|
57
57
|
}
|
58
58
|
|
59
|
-
if @options[:internal_urls].empty?
|
59
|
+
if @options[:internal_urls].nil? || @options[:internal_urls].empty?
|
60
60
|
uri = Addressable::URI.parse(base_url)
|
61
|
-
@options[:internal_urls]
|
61
|
+
@options[:internal_urls] = [[uri.scheme, "://", uri.host, "/*"].join]
|
62
62
|
end
|
63
63
|
|
64
64
|
request.merge!(@options)
|
@@ -79,6 +79,7 @@ class Cobweb
|
|
79
79
|
def get(url, options = @options)
|
80
80
|
raise "url cannot be nil" if url.nil?
|
81
81
|
uri = Addressable::URI.parse(url)
|
82
|
+
uri.normalize!
|
82
83
|
uri.fragment=nil
|
83
84
|
url = uri.to_s
|
84
85
|
|
@@ -104,9 +105,6 @@ class Cobweb
|
|
104
105
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
105
106
|
content = deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
|
106
107
|
else
|
107
|
-
# this url is valid for processing so lets get on with it
|
108
|
-
#TODO the @http here is different from in head. Should it be? - in head we are using a method-scoped variable.
|
109
|
-
|
110
108
|
# retrieve data
|
111
109
|
unless @http && @http.address == uri.host && @http.port == uri.inferred_port
|
112
110
|
puts "Creating connection to #{uri.host}..." unless @options[:quiet]
|
@@ -122,7 +120,11 @@ class Cobweb
|
|
122
120
|
@http.open_timeout = @options[:timeout].to_i
|
123
121
|
begin
|
124
122
|
print "Retrieving #{url }... " unless @options[:quiet]
|
125
|
-
|
123
|
+
request_options={}
|
124
|
+
if options[:cookies]
|
125
|
+
request_options[ 'Cookie']= options[:cookies]
|
126
|
+
end
|
127
|
+
request = Net::HTTP::Get.new uri.request_uri, request_options
|
126
128
|
|
127
129
|
response = @http.request request
|
128
130
|
|
@@ -135,14 +137,11 @@ class Cobweb
|
|
135
137
|
# decrement redirect limit
|
136
138
|
redirect_limit = redirect_limit - 1
|
137
139
|
|
138
|
-
# raise exception if we're being redirected to somewhere we've been redirected to in this content request
|
139
|
-
#raise RedirectError("Loop detected in redirect for - #{url}") if content[:redirect_through].include? url
|
140
|
-
|
141
|
-
# raise exception if redirect limit has reached 0
|
142
140
|
raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
|
141
|
+
cookies = get_cookies(response)
|
143
142
|
|
144
143
|
# get the content from redirect location
|
145
|
-
content = get(url, options.merge(:redirect_limit => redirect_limit))
|
144
|
+
content = get(url, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
|
146
145
|
content[:url] = uri.to_s
|
147
146
|
content[:redirect_through] = [] if content[:redirect_through].nil?
|
148
147
|
content[:redirect_through].insert(0, url)
|
@@ -186,7 +185,7 @@ class Cobweb
|
|
186
185
|
redis.expire unique_id, @options[:cache].to_i
|
187
186
|
end
|
188
187
|
rescue RedirectError => e
|
189
|
-
puts "ERROR: #{e.message}"
|
188
|
+
puts "ERROR RedirectError: #{e.message}"
|
190
189
|
|
191
190
|
## generate a blank content
|
192
191
|
content = {}
|
@@ -201,7 +200,7 @@ class Cobweb
|
|
201
200
|
content[:links] = {}
|
202
201
|
|
203
202
|
rescue SocketError => e
|
204
|
-
puts "ERROR:
|
203
|
+
puts "ERROR SocketError: #{e.message}"
|
205
204
|
|
206
205
|
## generate a blank content
|
207
206
|
content = {}
|
@@ -233,10 +232,20 @@ class Cobweb
|
|
233
232
|
end
|
234
233
|
content
|
235
234
|
end
|
236
|
-
|
235
|
+
|
236
|
+
def get_cookies(response)
|
237
|
+
all_cookies = response.get_fields('set-cookie')
|
238
|
+
cookies_array = Array.new
|
239
|
+
all_cookies.each { |cookie|
|
240
|
+
cookies_array.push(cookie.split('; ')[0])
|
241
|
+
}
|
242
|
+
cookies = cookies_array.join('; ')
|
243
|
+
end
|
244
|
+
|
237
245
|
def head(url, options = @options)
|
238
246
|
raise "url cannot be nil" if url.nil?
|
239
247
|
uri = Addressable::URI.parse(url)
|
248
|
+
uri.normalize!
|
240
249
|
uri.fragment=nil
|
241
250
|
url = uri.to_s
|
242
251
|
|
@@ -255,37 +264,47 @@ class Cobweb
|
|
255
264
|
redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
|
256
265
|
end
|
257
266
|
|
258
|
-
content = {}
|
267
|
+
content = {:base_url => url}
|
259
268
|
|
260
269
|
# check if it has already been cached
|
261
270
|
if redis.get("head-#{unique_id}") and @options[:cache]
|
262
271
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
263
272
|
content = deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
|
264
273
|
else
|
265
|
-
print "Retrieving #{url }... " unless @options[:quiet]
|
266
|
-
|
267
274
|
# retrieve data
|
268
|
-
http
|
275
|
+
unless @http && @http.address == uri.host && @http.port == uri.inferred_port
|
276
|
+
puts "Creating connection to #{uri.host}..." unless @options[:quiet]
|
277
|
+
@http = Net::HTTP.new(uri.host, uri.inferred_port)
|
278
|
+
end
|
269
279
|
if uri.scheme == "https"
|
270
|
-
http.use_ssl = true
|
271
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
272
|
-
end
|
280
|
+
@http.use_ssl = true
|
281
|
+
@http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
282
|
+
end
|
273
283
|
|
274
284
|
request_time = Time.now.to_f
|
275
|
-
http.read_timeout = @options[:timeout].to_i
|
276
|
-
http.open_timeout = @options[:timeout].to_i
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
285
|
+
@http.read_timeout = @options[:timeout].to_i
|
286
|
+
@http.open_timeout = @options[:timeout].to_i
|
287
|
+
begin
|
288
|
+
print "Retrieving #{url }... " unless @options[:quiet]
|
289
|
+
request_options={}
|
290
|
+
if options[:cookies]
|
291
|
+
request_options[ 'Cookie']= options[:cookies]
|
292
|
+
end
|
293
|
+
request = Net::HTTP::Head.new uri.request_uri, request_options
|
294
|
+
|
295
|
+
response = @http.request request
|
281
296
|
|
282
297
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
283
298
|
puts "redirected... " unless @options[:quiet]
|
299
|
+
|
284
300
|
url = UriHelper.join_no_fragment(uri, response['location'])
|
301
|
+
|
285
302
|
redirect_limit = redirect_limit - 1
|
286
|
-
|
287
|
-
|
288
|
-
|
303
|
+
|
304
|
+
raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
|
305
|
+
cookies = get_cookies(response)
|
306
|
+
|
307
|
+
content = head(url, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
|
289
308
|
content[:url] = uri.to_s
|
290
309
|
content[:redirect_through] = [] if content[:redirect_through].nil?
|
291
310
|
content[:redirect_through].insert(0, url)
|
@@ -293,7 +312,7 @@ class Cobweb
|
|
293
312
|
content[:url] = uri.to_s
|
294
313
|
content[:status_code] = response.code.to_i
|
295
314
|
unless response.content_type.nil?
|
296
|
-
content[:mime_type] = response.content_type.split(";")[0].strip
|
315
|
+
content[:mime_type] = response.content_type.split(";")[0].strip
|
297
316
|
if response["Content-Type"].include? ";"
|
298
317
|
charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
|
299
318
|
charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
|
@@ -310,8 +329,23 @@ class Cobweb
|
|
310
329
|
puts "Not storing in cache as cache disabled" if @options[:debug]
|
311
330
|
end
|
312
331
|
end
|
332
|
+
rescue RedirectError => e
|
333
|
+
puts "ERROR RedirectError: #{e.message}"
|
334
|
+
|
335
|
+
## generate a blank content
|
336
|
+
content = {}
|
337
|
+
content[:url] = uri.to_s
|
338
|
+
content[:response_time] = Time.now.to_f - request_time
|
339
|
+
content[:status_code] = 0
|
340
|
+
content[:length] = 0
|
341
|
+
content[:body] = ""
|
342
|
+
content[:error] = e.message
|
343
|
+
content[:mime_type] = "error/dnslookup"
|
344
|
+
content[:headers] = {}
|
345
|
+
content[:links] = {}
|
346
|
+
|
313
347
|
rescue SocketError => e
|
314
|
-
puts "ERROR: #{e.message}"
|
348
|
+
puts "ERROR SocketError: #{e.message}"
|
315
349
|
|
316
350
|
## generate a blank content
|
317
351
|
content = {}
|
@@ -326,7 +360,7 @@ class Cobweb
|
|
326
360
|
content[:links] = {}
|
327
361
|
|
328
362
|
rescue Timeout::Error => e
|
329
|
-
puts "ERROR: #{e.message}"
|
363
|
+
puts "ERROR Timeout::Error: #{e.message}"
|
330
364
|
|
331
365
|
## generate a blank content
|
332
366
|
content = {}
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -62,17 +62,17 @@ class CobwebCrawler
|
|
62
62
|
|
63
63
|
@redis.sadd "crawled", url.to_s
|
64
64
|
@redis.incr "crawl-counter"
|
65
|
-
|
66
|
-
internal_links =
|
65
|
+
|
66
|
+
internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
|
67
67
|
|
68
|
+
# select the link if its internal (eliminate external before expensive lookups in queued and crawled)
|
69
|
+
cobweb_links = CobwebLinks.new(@options)
|
70
|
+
internal_links = internal_links.select{|link| cobweb_links.internal?(link)}
|
71
|
+
|
68
72
|
# reject the link if we've crawled it or queued it
|
69
73
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
70
74
|
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
71
|
-
|
72
|
-
|
73
|
-
# select the link if its internal
|
74
|
-
internal_links.select!{|link| internal_link?(link)}
|
75
|
-
|
75
|
+
|
76
76
|
internal_links.each do |link|
|
77
77
|
puts "Added #{link.to_s} to queue" if @debug
|
78
78
|
@redis.sadd "queued", link
|
@@ -85,10 +85,11 @@ class CobwebCrawler
|
|
85
85
|
@stats.update_statistics(content, crawl_counter, queue_counter)
|
86
86
|
@stats.update_status("Completed #{url}.")
|
87
87
|
puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @debug
|
88
|
-
|
89
|
-
yield content, @
|
88
|
+
|
89
|
+
yield content, @stats.get_statistics if block_given?
|
90
90
|
|
91
91
|
rescue => e
|
92
|
+
raise e if ENVIRONMENT == "test"
|
92
93
|
puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
|
93
94
|
ap e
|
94
95
|
ap e.backtrace
|
@@ -105,33 +106,6 @@ class CobwebCrawler
|
|
105
106
|
@stats.get_statistics
|
106
107
|
end
|
107
108
|
|
108
|
-
|
109
|
-
def internal_link?(link)
|
110
|
-
puts "Checking internal link for: #{link}" if @debug
|
111
|
-
valid_link = true
|
112
|
-
internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
|
113
|
-
puts "Matching against #{pattern.source}" if @debug
|
114
|
-
if link.match(pattern)
|
115
|
-
puts "Matched as internal" if @debug
|
116
|
-
return true
|
117
|
-
end
|
118
|
-
end
|
119
|
-
puts "Didn't match any pattern so marked as not internal" if @debug
|
120
|
-
false
|
121
|
-
end
|
122
|
-
|
123
|
-
def internal_patterns
|
124
|
-
@internal_patterns ||= @redis.smembers("internal_urls")
|
125
|
-
end
|
126
|
-
|
127
|
-
def all_links_from_content(content)
|
128
|
-
links = content[:links].keys.map{|key| content[:links][key]}.flatten
|
129
|
-
links.reject!{|link| link.cobweb_starts_with?("javascript:")}
|
130
|
-
links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
|
131
|
-
links.select!{|link| link.scheme == "http" || link.scheme == "https"}
|
132
|
-
links.uniq
|
133
|
-
links
|
134
|
-
end
|
135
109
|
end
|
136
110
|
|
137
111
|
class String
|
data/lib/cobweb_links.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
class CobwebLinks
|
2
|
+
|
3
|
+
# processes links supplied to it
|
4
|
+
def initialize(options={})
|
5
|
+
@options = options
|
6
|
+
|
7
|
+
raise InternalUrlsMissingError, ":internal_urls is required" unless @options.has_key? :internal_urls
|
8
|
+
raise InvalidUrlsError, ":internal_urls must be an array" unless @options[:internal_urls].kind_of? Array
|
9
|
+
raise InvalidUrlsError, ":external_urls must be an array" unless !@options.has_key?(:external_urls) || @options[:external_urls].kind_of?(Array)
|
10
|
+
@options[:external_urls] = [] unless @options.has_key? :external_urls
|
11
|
+
@options[:debug] = false unless @options.has_key? :debug
|
12
|
+
|
13
|
+
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{pattern.gsub(".", "\\.").gsub("*", ".*?")}")}
|
14
|
+
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{pattern.gsub(".", "\\.").gsub("*", ".*?")}")}
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
def internal?(link)
|
19
|
+
if @options[:debug]
|
20
|
+
puts "--------------------------------"
|
21
|
+
puts "Link: #{link}"
|
22
|
+
puts "Internal matches"
|
23
|
+
ap @internal_patterns.select{|pattern| link.match(pattern)}
|
24
|
+
puts "External matches"
|
25
|
+
ap @external_patterns.select{|pattern| link.match(pattern)}
|
26
|
+
end
|
27
|
+
!@internal_patterns.select{|pattern| link.match(pattern)}.empty? && @external_patterns.select{|pattern| link.match(pattern)}.empty?
|
28
|
+
end
|
29
|
+
|
30
|
+
def external?(link)
|
31
|
+
if @options[:debug]
|
32
|
+
puts "--------------------------------"
|
33
|
+
puts "Link: #{link}"
|
34
|
+
puts "Internal matches"
|
35
|
+
ap @internal_patterns.select{|pattern| link.match(pattern)}
|
36
|
+
puts "External matches"
|
37
|
+
ap @external_patterns.select{|pattern| link.match(pattern)}
|
38
|
+
end
|
39
|
+
@internal_patterns.select{|pattern| link.match(pattern)}.empty? || !@external_patterns.select{|pattern| link.match(pattern)}.empty?
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
44
|
+
class InternalUrlsMissingError < Exception
|
45
|
+
end
|
46
|
+
class InvalidUrlsError < Exception
|
47
|
+
end
|
48
|
+
|
data/lib/content_link_parser.rb
CHANGED
@@ -37,9 +37,13 @@ class ContentLinkParser
|
|
37
37
|
data
|
38
38
|
end
|
39
39
|
|
40
|
-
def all_links
|
40
|
+
def all_links(options = {})
|
41
|
+
options[:valid_schemes] = [:http, :https] unless options.has_key? :valid_schemes
|
41
42
|
data = link_data
|
42
|
-
data.keys.map{|key| data[key]}.flatten.uniq
|
43
|
+
data = data.keys.map{|key| data[key]}.flatten.uniq
|
44
|
+
links = data.select{|link| options[:valid_schemes].include? link.split(':')[0].to_sym}
|
45
|
+
links = links.map{|link| UriHelper.join_no_fragment(@url, link).to_s }
|
46
|
+
links
|
43
47
|
end
|
44
48
|
|
45
49
|
def method_missing(m)
|
data/lib/crawl_job.rb
CHANGED
@@ -38,16 +38,17 @@ class CrawlJob
|
|
38
38
|
# set the base url if this is the first page
|
39
39
|
set_base_url @redis, content, content_request
|
40
40
|
|
41
|
+
@cobweb_links = CobwebLinks.new(content_request)
|
41
42
|
if within_queue_limits?(content_request[:crawl_limit])
|
42
|
-
internal_links =
|
43
|
+
internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
|
44
|
+
|
45
|
+
# select the link if its internal
|
46
|
+
internal_links.select!{|link| @cobweb_links.internal?(link)}
|
43
47
|
|
44
48
|
# reject the link if we've crawled it or queued it
|
45
49
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
46
50
|
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
47
|
-
|
48
|
-
# select the link if its internal
|
49
|
-
internal_links.select!{|link| internal_link?(link)}
|
50
|
-
|
51
|
+
|
51
52
|
internal_links.each do |link|
|
52
53
|
enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
|
53
54
|
end
|
@@ -83,11 +84,11 @@ class CrawlJob
|
|
83
84
|
def self.finished(content_request)
|
84
85
|
# finished
|
85
86
|
@stats.end_crawl(content_request)
|
86
|
-
Resque.enqueue(const_get(content_request[:crawl_finished_queue]),
|
87
|
+
Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
|
87
88
|
end
|
88
89
|
|
89
90
|
def self.send_to_processing_queue(content, content_request)
|
90
|
-
content_to_send = content.merge({:internal_urls =>
|
91
|
+
content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
|
91
92
|
if content_request[:use_encoding_safe_process_job]
|
92
93
|
content_to_send[:body] = Base64.encode64(content[:body])
|
93
94
|
content_to_send[:processing_queue] = content_request[:processing_queue]
|
@@ -119,33 +120,6 @@ class CrawlJob
|
|
119
120
|
end
|
120
121
|
end
|
121
122
|
|
122
|
-
def self.internal_link?(link)
|
123
|
-
puts "Checking internal link for: #{link}" if @debug
|
124
|
-
valid_link = true
|
125
|
-
internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
|
126
|
-
puts "Matching against #{pattern.source}" if @debug
|
127
|
-
if link.match(pattern)
|
128
|
-
puts "Matched as internal" if @debug
|
129
|
-
return true
|
130
|
-
end
|
131
|
-
end
|
132
|
-
puts "Didn't match any pattern so marked as not internal" if @debug
|
133
|
-
false
|
134
|
-
end
|
135
|
-
|
136
|
-
def self.internal_patterns
|
137
|
-
@internal_patterns ||= @redis.smembers("internal_urls")
|
138
|
-
end
|
139
|
-
|
140
|
-
def self.all_links_from_content(content)
|
141
|
-
links = content[:links].keys.map{|key| content[:links][key]}.flatten
|
142
|
-
links.reject!{|link| link.starts_with?("javascript:")}
|
143
|
-
links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
|
144
|
-
links.select!{|link| link.scheme == "http" || link.scheme == "https"}
|
145
|
-
links.uniq
|
146
|
-
links
|
147
|
-
end
|
148
|
-
|
149
123
|
def self.enqueue_content(content_request, link)
|
150
124
|
new_request = content_request.clone
|
151
125
|
new_request[:url] = link
|
@@ -0,0 +1,103 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../lib/cobweb_links')
|
3
|
+
|
4
|
+
describe CobwebLinks do
|
5
|
+
|
6
|
+
before(:each) do
|
7
|
+
|
8
|
+
@base_url = "http://www.baseurl.com/"
|
9
|
+
|
10
|
+
@default_headers = {"Cache-Control" => "private, max-age=0",
|
11
|
+
"Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
|
12
|
+
"Expires" => "-1",
|
13
|
+
"Content-Type" => "text/html; charset=UTF-8",
|
14
|
+
"Content-Encoding" => "gzip",
|
15
|
+
"Transfer-Encoding" => "chunked",
|
16
|
+
"Server" => "gws",
|
17
|
+
"X-XSS-Protection" => "1; mode=block"}
|
18
|
+
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
it "should generate a cobweb_links object" do
|
23
|
+
CobwebLinks.new(:internal_urls => [""]).should be_an_instance_of CobwebLinks
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should raise error with no internal links" do
|
27
|
+
expect {CobwebLinks.new()}.to raise_error(InternalUrlsMissingError)
|
28
|
+
end
|
29
|
+
it "should not raise error with missing external links" do
|
30
|
+
expect {CobwebLinks.new(:internal_urls => ["http://domain_one.com/"])}.to_not raise_error(InternalUrlsMissingError)
|
31
|
+
end
|
32
|
+
it "should raise error with invalid internal links" do
|
33
|
+
expect {CobwebLinks.new(:internal_urls => "")}.to raise_error(InvalidUrlsError)
|
34
|
+
end
|
35
|
+
it "should raise error with invalid external links" do
|
36
|
+
expect {CobwebLinks.new(:internal_urls => [], :external_urls => "")}.to raise_error(InvalidUrlsError)
|
37
|
+
end
|
38
|
+
|
39
|
+
|
40
|
+
describe "internal and external links" do
|
41
|
+
it "should only return internal links" do
|
42
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
|
43
|
+
cobweb_links.internal?("http://domain_one.com/pageone.html").should be_true
|
44
|
+
cobweb_links.internal?("http://domain_one.com/pagetwo.html").should be_true
|
45
|
+
end
|
46
|
+
it "should not return external links" do
|
47
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
|
48
|
+
cobweb_links.external?("http://domain_one.com/pageone.html").should be_false
|
49
|
+
cobweb_links.external?("http://domain_two.com/pageone.html").should be_true
|
50
|
+
cobweb_links.external?("http://external.com/pageone.html").should be_true
|
51
|
+
end
|
52
|
+
it "should override internal links with external links" do
|
53
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_one.com/blog"])
|
54
|
+
cobweb_links.internal?("http://domain_one.com/pageone.html").should be_true
|
55
|
+
cobweb_links.external?("http://domain_one.com/pageone.html").should be_false
|
56
|
+
cobweb_links.internal?("http://domain_one.com/blog/pageone.html").should be_false
|
57
|
+
cobweb_links.external?("http://domain_one.com/blog/pageone.html").should be_true
|
58
|
+
cobweb_links.internal?("http://domain_two.com/blog/pageone.html").should be_false
|
59
|
+
cobweb_links.external?("http://domain_two.com/blog/pageone.html").should be_true
|
60
|
+
end
|
61
|
+
end
|
62
|
+
it "should only match from beginning of url" do
|
63
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://www.domain_two.com/"])
|
64
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
|
65
|
+
cobweb_links.internal?("http://www.domain_two.com/pageone.html").should be_false
|
66
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html?url=http://www.domain_two.com/pageone.html").should be_true
|
67
|
+
cobweb_links.internal?("http://www.domain_two.com/pageone.html?url=http://www.domain_one.com/pageone.html").should be_false
|
68
|
+
end
|
69
|
+
|
70
|
+
describe "using wildcards" do
|
71
|
+
it "should match internal links with wildcards" do
|
72
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.com/"], :external_urls => ["http://blog.domain_one.com/"])
|
73
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
|
74
|
+
cobweb_links.internal?("http://images.domain_one.com/logo.png").should be_true
|
75
|
+
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
|
76
|
+
end
|
77
|
+
it "should match external links with wildcards" do
|
78
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://*.domain_one.com/"])
|
79
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_false
|
80
|
+
cobweb_links.internal?("http://images.domain_one.com/logo.png").should be_false
|
81
|
+
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
|
82
|
+
end
|
83
|
+
it "should allow multiple wildcards" do
|
84
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.*.domain_one.com/"])
|
85
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_false
|
86
|
+
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
|
87
|
+
cobweb_links.internal?("http://www.marketing.domain_one.com/pageone.html").should be_true
|
88
|
+
cobweb_links.internal?("http://blog.designers.domain_one.com/pagetwo.html").should be_true
|
89
|
+
end
|
90
|
+
it "should allow multiple country tlds with wildcards" do
|
91
|
+
cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.*/", "http://*.domain_one.*.*/"])
|
92
|
+
cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
|
93
|
+
cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_true
|
94
|
+
cobweb_links.internal?("http://www.domain_one.co.uk/pageone.html").should be_true
|
95
|
+
cobweb_links.internal?("http://blog.domain_one.co.uk/pageone.html").should be_true
|
96
|
+
cobweb_links.internal?("http://www.domain_one.com.au/pageone.html").should be_true
|
97
|
+
cobweb_links.internal?("http://blog.domain_one.com.au/pageone.html").should be_true
|
98
|
+
cobweb_links.internal?("http://www.domain_one.ie/pageone.html").should be_true
|
99
|
+
cobweb_links.internal?("http://blog.domain_one.ie/pageone.html").should be_true
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
end
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -186,22 +186,22 @@ describe Cobweb do
|
|
186
186
|
describe "location setting" do
|
187
187
|
it "Get should strip fragments" do
|
188
188
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
189
|
-
Net::HTTP::Get.should_receive(:new).with("/")
|
189
|
+
Net::HTTP::Get.should_receive(:new).with("/", {})
|
190
190
|
@cobweb.get("http://www.google.com/#ignore")
|
191
191
|
end
|
192
192
|
it "head should strip fragments" do
|
193
193
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
194
|
-
Net::HTTP::Head.should_receive(:new).with("/").and_return(@mock_http_request)
|
194
|
+
Net::HTTP::Head.should_receive(:new).with("/", {}).and_return(@mock_http_request)
|
195
195
|
@cobweb.head("http://www.google.com/#ignore")
|
196
196
|
end
|
197
197
|
it "get should not strip path" do
|
198
198
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
199
|
-
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff")
|
199
|
+
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {})
|
200
200
|
@cobweb.get("http://www.google.com/path/to/stuff#ignore")
|
201
201
|
end
|
202
202
|
it "get should not strip query string" do
|
203
203
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
204
|
-
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string")
|
204
|
+
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {})
|
205
205
|
@cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
|
206
206
|
end
|
207
207
|
end
|
@@ -29,7 +29,7 @@ describe ContentLinkParser do
|
|
29
29
|
end
|
30
30
|
it "should return the correct links" do
|
31
31
|
links = @content_parser.links
|
32
|
-
links.length.should ==
|
32
|
+
links.length.should == 7
|
33
33
|
end
|
34
34
|
end
|
35
35
|
describe "returning image links" do
|
@@ -92,7 +92,17 @@ describe ContentLinkParser do
|
|
92
92
|
link_data.should be_an_instance_of Hash
|
93
93
|
|
94
94
|
link_data.keys.length.should == 5
|
95
|
-
link_data[:links].length.should ==
|
95
|
+
link_data[:links].length.should == 7
|
96
|
+
end
|
97
|
+
|
98
|
+
it "should return all http and https links by default" do
|
99
|
+
links = @content_parser.all_links
|
100
|
+
links.count.should == 9
|
101
|
+
end
|
102
|
+
|
103
|
+
it "should return only valid_schemes supplied" do
|
104
|
+
links = @content_parser.all_links(:valid_schemes => [:https])
|
105
|
+
links.count.should == 1
|
96
106
|
end
|
97
107
|
end
|
98
108
|
|
@@ -23,7 +23,10 @@
|
|
23
23
|
|
24
24
|
<body bgcolor="#FFFFFF"><!-- #BeginLibraryItem "/Library/navtop.lbi" --></p>
|
25
25
|
|
26
|
-
<a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
|
26
|
+
<a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
|
27
|
+
<a href="mailto:stewart@theizone.co.uk">Click Here to email </a>
|
28
|
+
<a href="javascript:alert('javascript clicked');">click here for javscript</a>
|
29
|
+
<a href="https://sampleurl-a.com/">Click Here for SSL link to URL 1</a>
|
27
30
|
<frameset><frame src="http://sampleurl-frame.com/"></frame></frameset>
|
28
31
|
|
29
32
|
<map id="testmap"><area href="http://sampleurl-area"></area>></map>
|
data/spec/spec_helper.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
|
2
2
|
require 'mock_redis'
|
3
3
|
|
4
|
+
ENVIRONMENT = "test"
|
5
|
+
|
4
6
|
RSpec.configure do |config|
|
5
7
|
config.before(:each) {
|
8
|
+
|
6
9
|
redis_mock = double("redis")
|
7
10
|
redis_mock.stub(:new).and_return(@redis_mock_object)
|
8
11
|
|
9
12
|
#redis_mock.flushdb
|
10
13
|
|
11
|
-
|
12
14
|
@default_headers = {"Cache-Control" => "private, max-age=0",
|
13
15
|
"Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
|
14
16
|
"Expires" => "-1",
|
@@ -39,8 +41,8 @@ RSpec.configure do |config|
|
|
39
41
|
|
40
42
|
Net::HTTP.stub!(:new).and_return(@mock_http_client)
|
41
43
|
Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
|
42
|
-
Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
|
43
|
-
Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)
|
44
|
+
Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
|
45
|
+
Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
|
44
46
|
|
45
47
|
Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
|
46
48
|
|
@@ -60,6 +62,7 @@ RSpec.configure do |config|
|
|
60
62
|
@mock_http_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
|
61
63
|
@mock_http_response.stub!(:content_length).and_return(1024)
|
62
64
|
@mock_http_response.stub!(:body).and_return("asdf")
|
65
|
+
@mock_http_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
|
63
66
|
@mock_http_response.stub!(:to_hash).and_return(@default_headers)
|
64
67
|
|
65
68
|
@mock_http_redirect_response.stub!(:code).and_return(301)
|
@@ -69,6 +72,7 @@ RSpec.configure do |config|
|
|
69
72
|
@mock_http_redirect_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
|
70
73
|
@mock_http_redirect_response.stub!(:content_length).and_return(2048)
|
71
74
|
@mock_http_redirect_response.stub!(:body).and_return("redirected body")
|
75
|
+
@mock_http_redirect_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
|
72
76
|
@mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)
|
73
77
|
|
74
78
|
@mock_http_redirect_response2.stub!(:code).and_return(301)
|
@@ -78,6 +82,7 @@ RSpec.configure do |config|
|
|
78
82
|
@mock_http_redirect_response2.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
|
79
83
|
@mock_http_redirect_response2.stub!(:content_length).and_return(2048)
|
80
84
|
@mock_http_redirect_response2.stub!(:body).and_return("redirected body")
|
85
|
+
@mock_http_redirect_response2.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
|
81
86
|
@mock_http_redirect_response2.stub!(:to_hash).and_return(@default_headers)
|
82
87
|
}
|
83
88
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.44
|
4
|
+
version: 0.0.45
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-05-
|
12
|
+
date: 2012-05-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70107297058680 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70107297058680
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70107297056880 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70107297056880
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70107297056360 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70107297056360
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70107297055840 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70107297055840
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70107297055120 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70107297055120
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70107297054400 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70107297054400
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70107297053900 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70107297053900
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70107297053360 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70107297053360
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70107297052800 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,18 +109,18 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70107297052800
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70107297052240 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
119
119
|
- !ruby/object:Gem::Version
|
120
|
-
version:
|
120
|
+
version: 1.0.2
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70107297052240
|
124
124
|
description: Web Crawler that uses resque background job engine to allow you to cluster
|
125
125
|
your crawl.
|
126
126
|
email: stewart@rockwellcottage.com
|
@@ -131,6 +131,7 @@ extra_rdoc_files:
|
|
131
131
|
files:
|
132
132
|
- spec/cobweb/cobweb_crawler_spec.rb
|
133
133
|
- spec/cobweb/cobweb_job_spec.rb
|
134
|
+
- spec/cobweb/cobweb_links_spec.rb
|
134
135
|
- spec/cobweb/cobweb_spec.rb
|
135
136
|
- spec/cobweb/content_link_parser_spec.rb
|
136
137
|
- spec/samples/sample_html_links.html
|
@@ -139,7 +140,9 @@ files:
|
|
139
140
|
- lib/cobweb.rb
|
140
141
|
- lib/cobweb_crawler.rb
|
141
142
|
- lib/cobweb_finished_job.rb
|
143
|
+
- lib/cobweb_links.rb
|
142
144
|
- lib/cobweb_process_job.rb
|
145
|
+
- lib/cobweb_version.rb
|
143
146
|
- lib/content_link_parser.rb
|
144
147
|
- lib/crawl_job.rb
|
145
148
|
- lib/encoding_safe_process_job.rb
|