cobweb 0.0.44 → 0.0.45

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.44
2
+ h1. Cobweb v0.0.45
3
3
 
4
4
  h2. Intro
5
5
 
@@ -101,6 +101,20 @@ h3. Contributing/Testing
101
101
 
102
102
  Continuous integration testing is performed by the excellent Travis: http://travis-ci.org/#!/stewartmckee/cobweb
103
103
 
104
+ h2. Todo
105
+
106
+ * Tidy up classes with link parsing
107
+ * Refactoring of code to simplify design
108
+ * Remove requirement of redis from standalone crawler
109
+ * Add redis settings to standalone crawler (i.e. to connect to a remote redis)
110
+ * Add ability to start and stop crawls from web interface
111
+ * Allow crawler to start as web interface only (i.e. without running crawls at start)
112
+ * Fix content encoding issue requiring separate process job
113
+
114
+ h3. Big changes
115
+
116
+ * Refactor into a module and refactor class names to remove cobweb and increase simplicity
117
+
104
118
  h2. License
105
119
 
106
120
  h3. The MIT License
@@ -20,7 +20,7 @@ class Cobweb
20
20
  # investigate using event machine for single threaded crawling
21
21
 
22
22
  def self.version
23
- "0.0.44"
23
+ CobwebVersion.version
24
24
  end
25
25
 
26
26
  def method_missing(method_sym, *arguments, &block)
@@ -56,9 +56,9 @@ class Cobweb
56
56
  :url => base_url
57
57
  }
58
58
 
59
- if @options[:internal_urls].empty?
59
+ if @options[:internal_urls].nil? || @options[:internal_urls].empty?
60
60
  uri = Addressable::URI.parse(base_url)
61
- @options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
61
+ @options[:internal_urls] = [[uri.scheme, "://", uri.host, "/*"].join]
62
62
  end
63
63
 
64
64
  request.merge!(@options)
@@ -79,6 +79,7 @@ class Cobweb
79
79
  def get(url, options = @options)
80
80
  raise "url cannot be nil" if url.nil?
81
81
  uri = Addressable::URI.parse(url)
82
+ uri.normalize!
82
83
  uri.fragment=nil
83
84
  url = uri.to_s
84
85
 
@@ -104,9 +105,6 @@ class Cobweb
104
105
  puts "Cache hit for #{url}" unless @options[:quiet]
105
106
  content = deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
106
107
  else
107
- # this url is valid for processing so lets get on with it
108
- #TODO the @http here is different from in head. Should it be? - in head we are using a method-scoped variable.
109
-
110
108
  # retrieve data
111
109
  unless @http && @http.address == uri.host && @http.port == uri.inferred_port
112
110
  puts "Creating connection to #{uri.host}..." unless @options[:quiet]
@@ -122,7 +120,11 @@ class Cobweb
122
120
  @http.open_timeout = @options[:timeout].to_i
123
121
  begin
124
122
  print "Retrieving #{url }... " unless @options[:quiet]
125
- request = Net::HTTP::Get.new uri.request_uri
123
+ request_options={}
124
+ if options[:cookies]
125
+ request_options[ 'Cookie']= options[:cookies]
126
+ end
127
+ request = Net::HTTP::Get.new uri.request_uri, request_options
126
128
 
127
129
  response = @http.request request
128
130
 
@@ -135,14 +137,11 @@ class Cobweb
135
137
  # decrement redirect limit
136
138
  redirect_limit = redirect_limit - 1
137
139
 
138
- # raise exception if we're being redirected to somewhere we've been redirected to in this content request
139
- #raise RedirectError("Loop detected in redirect for - #{url}") if content[:redirect_through].include? url
140
-
141
- # raise exception if redirect limit has reached 0
142
140
  raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
141
+ cookies = get_cookies(response)
143
142
 
144
143
  # get the content from redirect location
145
- content = get(url, options.merge(:redirect_limit => redirect_limit))
144
+ content = get(url, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
146
145
  content[:url] = uri.to_s
147
146
  content[:redirect_through] = [] if content[:redirect_through].nil?
148
147
  content[:redirect_through].insert(0, url)
@@ -186,7 +185,7 @@ class Cobweb
186
185
  redis.expire unique_id, @options[:cache].to_i
187
186
  end
188
187
  rescue RedirectError => e
189
- puts "ERROR: #{e.message}"
188
+ puts "ERROR RedirectError: #{e.message}"
190
189
 
191
190
  ## generate a blank content
192
191
  content = {}
@@ -201,7 +200,7 @@ class Cobweb
201
200
  content[:links] = {}
202
201
 
203
202
  rescue SocketError => e
204
- puts "ERROR: SocketError#{e.message}"
203
+ puts "ERROR SocketError: #{e.message}"
205
204
 
206
205
  ## generate a blank content
207
206
  content = {}
@@ -233,10 +232,20 @@ class Cobweb
233
232
  end
234
233
  content
235
234
  end
236
-
235
+
236
# Builds a value suitable for a "Cookie" request header from the
# "Set-Cookie" header fields of +response+.
#
# Only the leading "name=value" part of each Set-Cookie field is kept;
# everything after the first ";" (expires, path, ...) is discarded.
#
# response - any object responding to get_fields(name), returning an Array
#            of header field strings or nil.
#
# Returns a String such as "session=abc; query=ruby", or nil when the
# response carries no usable Set-Cookie field (so callers testing
# `if options[:cookies]` send no Cookie header at all).
def get_cookies(response)
  # get_fields returns nil, not [], when the header is absent
  set_cookie_fields = response.get_fields('set-cookie') || []

  # split on ";" and strip, so both "a=1; Path=/" and "a=1;Path=/" work
  pairs = set_cookie_fields.map { |field| field.to_s.split(';').first.to_s.strip }
  pairs = pairs.reject { |pair| pair.empty? }

  pairs.empty? ? nil : pairs.join('; ')
end
244
+
237
245
  def head(url, options = @options)
238
246
  raise "url cannot be nil" if url.nil?
239
247
  uri = Addressable::URI.parse(url)
248
+ uri.normalize!
240
249
  uri.fragment=nil
241
250
  url = uri.to_s
242
251
 
@@ -255,37 +264,47 @@ class Cobweb
255
264
  redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
256
265
  end
257
266
 
258
- content = {}
267
+ content = {:base_url => url}
259
268
 
260
269
  # check if it has already been cached
261
270
  if redis.get("head-#{unique_id}") and @options[:cache]
262
271
  puts "Cache hit for #{url}" unless @options[:quiet]
263
272
  content = deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
264
273
  else
265
- print "Retrieving #{url }... " unless @options[:quiet]
266
-
267
274
  # retrieve data
268
- http = Net::HTTP.new(uri.host, uri.inferred_port)
275
+ unless @http && @http.address == uri.host && @http.port == uri.inferred_port
276
+ puts "Creating connection to #{uri.host}..." unless @options[:quiet]
277
+ @http = Net::HTTP.new(uri.host, uri.inferred_port)
278
+ end
269
279
  if uri.scheme == "https"
270
- http.use_ssl = true
271
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
272
- end
280
+ @http.use_ssl = true
281
+ @http.verify_mode = OpenSSL::SSL::VERIFY_NONE
282
+ end
273
283
 
274
284
  request_time = Time.now.to_f
275
- http.read_timeout = @options[:timeout].to_i
276
- http.open_timeout = @options[:timeout].to_i
277
-
278
- begin
279
- request = Net::HTTP::Head.new uri.request_uri
280
- response = http.request request
285
+ @http.read_timeout = @options[:timeout].to_i
286
+ @http.open_timeout = @options[:timeout].to_i
287
+ begin
288
+ print "Retrieving #{url }... " unless @options[:quiet]
289
+ request_options={}
290
+ if options[:cookies]
291
+ request_options[ 'Cookie']= options[:cookies]
292
+ end
293
+ request = Net::HTTP::Head.new uri.request_uri, request_options
294
+
295
+ response = @http.request request
281
296
 
282
297
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
283
298
  puts "redirected... " unless @options[:quiet]
299
+
284
300
  url = UriHelper.join_no_fragment(uri, response['location'])
301
+
285
302
  redirect_limit = redirect_limit - 1
286
- options = options.clone
287
- options[:redirect_limit]=redirect_limit
288
- content = head(url, options)
303
+
304
+ raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
305
+ cookies = get_cookies(response)
306
+
307
+ content = head(url, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
289
308
  content[:url] = uri.to_s
290
309
  content[:redirect_through] = [] if content[:redirect_through].nil?
291
310
  content[:redirect_through].insert(0, url)
@@ -293,7 +312,7 @@ class Cobweb
293
312
  content[:url] = uri.to_s
294
313
  content[:status_code] = response.code.to_i
295
314
  unless response.content_type.nil?
296
- content[:mime_type] = response.content_type.split(";")[0].strip
315
+ content[:mime_type] = response.content_type.split(";")[0].strip
297
316
  if response["Content-Type"].include? ";"
298
317
  charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
299
318
  charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
@@ -310,8 +329,23 @@ class Cobweb
310
329
  puts "Not storing in cache as cache disabled" if @options[:debug]
311
330
  end
312
331
  end
332
+ rescue RedirectError => e
333
+ puts "ERROR RedirectError: #{e.message}"
334
+
335
+ ## generate a blank content
336
+ content = {}
337
+ content[:url] = uri.to_s
338
+ content[:response_time] = Time.now.to_f - request_time
339
+ content[:status_code] = 0
340
+ content[:length] = 0
341
+ content[:body] = ""
342
+ content[:error] = e.message
343
+ content[:mime_type] = "error/dnslookup"
344
+ content[:headers] = {}
345
+ content[:links] = {}
346
+
313
347
  rescue SocketError => e
314
- puts "ERROR: #{e.message}"
348
+ puts "ERROR SocketError: #{e.message}"
315
349
 
316
350
  ## generate a blank content
317
351
  content = {}
@@ -326,7 +360,7 @@ class Cobweb
326
360
  content[:links] = {}
327
361
 
328
362
  rescue Timeout::Error => e
329
- puts "ERROR: #{e.message}"
363
+ puts "ERROR Timeout::Error: #{e.message}"
330
364
 
331
365
  ## generate a blank content
332
366
  content = {}
@@ -62,17 +62,17 @@ class CobwebCrawler
62
62
 
63
63
  @redis.sadd "crawled", url.to_s
64
64
  @redis.incr "crawl-counter"
65
-
66
- internal_links = all_links_from_content(content).map{|link| link.to_s}
65
+
66
+ internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
67
67
 
68
+ # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
69
+ cobweb_links = CobwebLinks.new(@options)
70
+ internal_links = internal_links.select{|link| cobweb_links.internal?(link)}
71
+
68
72
  # reject the link if we've crawled it or queued it
69
73
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
70
74
  internal_links.reject!{|link| @redis.sismember("queued", link)}
71
-
72
-
73
- # select the link if its internal
74
- internal_links.select!{|link| internal_link?(link)}
75
-
75
+
76
76
  internal_links.each do |link|
77
77
  puts "Added #{link.to_s} to queue" if @debug
78
78
  @redis.sadd "queued", link
@@ -85,10 +85,11 @@ class CobwebCrawler
85
85
  @stats.update_statistics(content, crawl_counter, queue_counter)
86
86
  @stats.update_status("Completed #{url}.")
87
87
  puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @debug
88
-
89
- yield content, @statistic if block_given?
88
+
89
+ yield content, @stats.get_statistics if block_given?
90
90
 
91
91
  rescue => e
92
+ raise e if ENVIRONMENT == "test"
92
93
  puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
93
94
  ap e
94
95
  ap e.backtrace
@@ -105,33 +106,6 @@ class CobwebCrawler
105
106
  @stats.get_statistics
106
107
  end
107
108
 
108
-
109
- def internal_link?(link)
110
- puts "Checking internal link for: #{link}" if @debug
111
- valid_link = true
112
- internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
113
- puts "Matching against #{pattern.source}" if @debug
114
- if link.match(pattern)
115
- puts "Matched as internal" if @debug
116
- return true
117
- end
118
- end
119
- puts "Didn't match any pattern so marked as not internal" if @debug
120
- false
121
- end
122
-
123
- def internal_patterns
124
- @internal_patterns ||= @redis.smembers("internal_urls")
125
- end
126
-
127
- def all_links_from_content(content)
128
- links = content[:links].keys.map{|key| content[:links][key]}.flatten
129
- links.reject!{|link| link.cobweb_starts_with?("javascript:")}
130
- links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
131
- links.select!{|link| link.scheme == "http" || link.scheme == "https"}
132
- links.uniq
133
- links
134
- end
135
109
  end
136
110
 
137
111
  class String
@@ -0,0 +1,48 @@
1
# Classifies links as internal or external to a crawl using URL patterns.
#
# A pattern is a literal URL prefix in which "*" is a wildcard; every other
# character (including ".", "?", "+") is matched literally. A link is
# internal when it matches at least one internal pattern and no external
# pattern; otherwise it is external.
class CobwebLinks

  # options - Hash of settings:
  #           :internal_urls - Array of pattern Strings (required)
  #           :external_urls - Array of pattern Strings (optional, default [])
  #           :debug         - when truthy, internal?/external? print the
  #                            patterns that matched (default false)
  #
  # Raises InternalUrlsMissingError if :internal_urls is absent.
  # Raises InvalidUrlsError if :internal_urls or :external_urls is not an Array.
  def initialize(options={})
    raise InternalUrlsMissingError, ":internal_urls is required" unless options.has_key? :internal_urls
    raise InvalidUrlsError, ":internal_urls must be an array" unless options[:internal_urls].kind_of? Array
    raise InvalidUrlsError, ":external_urls must be an array" unless !options.has_key?(:external_urls) || options[:external_urls].kind_of?(Array)

    # merge into a fresh hash so the defaults are not written into the
    # caller's hash
    @options = {:external_urls => [], :debug => false}.merge(options)

    @internal_patterns = @options[:internal_urls].map{|pattern| compile_pattern(pattern)}
    @external_patterns = @options[:external_urls].map{|pattern| compile_pattern(pattern)}
  end

  # Returns true when link matches an internal pattern and no external one.
  def internal?(link)
    print_debug_matches(link) if @options[:debug]
    matches_any?(@internal_patterns, link) && !matches_any?(@external_patterns, link)
  end

  # Returns true when link matches no internal pattern or matches an
  # external one (the exact negation of internal?).
  def external?(link)
    print_debug_matches(link) if @options[:debug]
    !matches_any?(@internal_patterns, link) || matches_any?(@external_patterns, link)
  end

  private

  # Turns a wildcard pattern into a Regexp anchored at the start of the string.
  # Regexp.escape neutralises every metacharacter; the escaped "*" is then
  # turned back into a lazy wildcard. \A is used rather than ^ because ^ also
  # matches after any newline inside the link.
  def compile_pattern(pattern)
    Regexp.new("\\A" + Regexp.escape(pattern).gsub("\\*", ".*?"))
  end

  # True when at least one of patterns matches link.
  def matches_any?(patterns, link)
    patterns.any?{|pattern| link.match(pattern)}
  end

  # Prints the patterns matching link (debug aid shared by both predicates).
  def print_debug_matches(link)
    puts "--------------------------------"
    puts "Link: #{link}"
    puts "Internal matches"
    ap @internal_patterns.select{|pattern| link.match(pattern)}
    puts "External matches"
    ap @external_patterns.select{|pattern| link.match(pattern)}
  end

end

# Raised when the :internal_urls option is not supplied.
# Derives from StandardError so a plain `rescue => e` catches it.
class InternalUrlsMissingError < StandardError
end

# Raised when :internal_urls or :external_urls is not an Array.
class InvalidUrlsError < StandardError
end
48
+
@@ -0,0 +1,6 @@
1
# Single home of the cobweb gem's version number.
class CobwebVersion
  # The release number, available as a constant for callers that prefer
  # CobwebVersion::VERSION over the method.
  VERSION = "0.0.45".freeze

  # Returns the version as a String.
  #
  # A copy is returned so callers still receive a fresh, unfrozen String,
  # as they did when the literal lived inside this method.
  def self.version
    VERSION.dup
  end

end
@@ -37,9 +37,13 @@ class ContentLinkParser
37
37
  data
38
38
  end
39
39
 
40
# Gathers the links from every category of link_data, keeps those whose
# scheme is allowed, and returns each one after passing it through
# UriHelper.join_no_fragment(@url, link) and to_s.
#
# options - Hash of settings:
#           :valid_schemes - Array of schemes (Symbols or Strings) to keep;
#                            defaults to [:http, :https]
#
# Returns an Array of Strings.
#
# NOTE(review): the scheme check runs before the join, so hrefs without a
# scheme (relative links) are dropped - confirm this is intended.
def all_links(options = {})
  # read the option without writing the default back into the caller's hash;
  # compare as strings so page content is never converted into symbols
  valid_schemes = options.fetch(:valid_schemes, [:http, :https]).map{|scheme| scheme.to_s}

  candidates = link_data.values.flatten.uniq

  # to_s / first guard against nil and empty hrefs, whose split result is []
  links = candidates.select{|link| valid_schemes.include?(link.to_s.split(':').first)}
  links.map{|link| UriHelper.join_no_fragment(@url, link).to_s }
end
44
48
 
45
49
  def method_missing(m)
@@ -38,16 +38,17 @@ class CrawlJob
38
38
  # set the base url if this is the first page
39
39
  set_base_url @redis, content, content_request
40
40
 
41
+ @cobweb_links = CobwebLinks.new(content_request)
41
42
  if within_queue_limits?(content_request[:crawl_limit])
42
- internal_links = all_links_from_content(content).map{|link| link.to_s}
43
+ internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
44
+
45
+ # select the link if its internal
46
+ internal_links.select!{|link| @cobweb_links.internal?(link)}
43
47
 
44
48
  # reject the link if we've crawled it or queued it
45
49
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
46
50
  internal_links.reject!{|link| @redis.sismember("queued", link)}
47
-
48
- # select the link if its internal
49
- internal_links.select!{|link| internal_link?(link)}
50
-
51
+
51
52
  internal_links.each do |link|
52
53
  enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
53
54
  end
@@ -83,11 +84,11 @@ class CrawlJob
83
84
  def self.finished(content_request)
84
85
  # finished
85
86
  @stats.end_crawl(content_request)
86
- Resque.enqueue(const_get(content_request[:crawl_finished_queue]), Stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
87
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
87
88
  end
88
89
 
89
90
  def self.send_to_processing_queue(content, content_request)
90
- content_to_send = content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
91
+ content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
91
92
  if content_request[:use_encoding_safe_process_job]
92
93
  content_to_send[:body] = Base64.encode64(content[:body])
93
94
  content_to_send[:processing_queue] = content_request[:processing_queue]
@@ -119,33 +120,6 @@ class CrawlJob
119
120
  end
120
121
  end
121
122
 
122
- def self.internal_link?(link)
123
- puts "Checking internal link for: #{link}" if @debug
124
- valid_link = true
125
- internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
126
- puts "Matching against #{pattern.source}" if @debug
127
- if link.match(pattern)
128
- puts "Matched as internal" if @debug
129
- return true
130
- end
131
- end
132
- puts "Didn't match any pattern so marked as not internal" if @debug
133
- false
134
- end
135
-
136
- def self.internal_patterns
137
- @internal_patterns ||= @redis.smembers("internal_urls")
138
- end
139
-
140
- def self.all_links_from_content(content)
141
- links = content[:links].keys.map{|key| content[:links][key]}.flatten
142
- links.reject!{|link| link.starts_with?("javascript:")}
143
- links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
144
- links.select!{|link| link.scheme == "http" || link.scheme == "https"}
145
- links.uniq
146
- links
147
- end
148
-
149
123
  def self.enqueue_content(content_request, link)
150
124
  new_request = content_request.clone
151
125
  new_request[:url] = link
@@ -0,0 +1,103 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require File.expand_path(File.dirname(__FILE__) + '/../../lib/cobweb_links')
3
+
4
# Specs for CobwebLinks: construction/validation of options, and the
# internal?/external? classification with literal and wildcard patterns.
describe CobwebLinks do

  it "should generate a cobweb_links object" do
    CobwebLinks.new(:internal_urls => [""]).should be_an_instance_of CobwebLinks
  end

  it "should raise error with no internal links" do
    expect {CobwebLinks.new()}.to raise_error(InternalUrlsMissingError)
  end
  it "should not raise error with missing external links" do
    # bare raise_error: the negated matcher with a specific class would also
    # pass when some other error is raised
    expect {CobwebLinks.new(:internal_urls => ["http://domain_one.com/"])}.to_not raise_error
  end
  it "should raise error with invalid internal links" do
    expect {CobwebLinks.new(:internal_urls => "")}.to raise_error(InvalidUrlsError)
  end
  it "should raise error with invalid external links" do
    expect {CobwebLinks.new(:internal_urls => [], :external_urls => "")}.to raise_error(InvalidUrlsError)
  end


  describe "internal and external links" do
    it "should only return internal links" do
      cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
      cobweb_links.internal?("http://domain_one.com/pageone.html").should be_true
      cobweb_links.internal?("http://domain_one.com/pagetwo.html").should be_true
    end
    it "should not return external links" do
      cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
      cobweb_links.external?("http://domain_one.com/pageone.html").should be_false
      cobweb_links.external?("http://domain_two.com/pageone.html").should be_true
      cobweb_links.external?("http://external.com/pageone.html").should be_true
    end
    it "should override internal links with external links" do
      cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_one.com/blog"])
      cobweb_links.internal?("http://domain_one.com/pageone.html").should be_true
      cobweb_links.external?("http://domain_one.com/pageone.html").should be_false
      cobweb_links.internal?("http://domain_one.com/blog/pageone.html").should be_false
      cobweb_links.external?("http://domain_one.com/blog/pageone.html").should be_true
      cobweb_links.internal?("http://domain_two.com/blog/pageone.html").should be_false
      cobweb_links.external?("http://domain_two.com/blog/pageone.html").should be_true
    end
  end
  it "should only match from beginning of url" do
    cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://www.domain_two.com/"])
    cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
    cobweb_links.internal?("http://www.domain_two.com/pageone.html").should be_false
    cobweb_links.internal?("http://www.domain_one.com/pageone.html?url=http://www.domain_two.com/pageone.html").should be_true
    cobweb_links.internal?("http://www.domain_two.com/pageone.html?url=http://www.domain_one.com/pageone.html").should be_false
  end

  describe "using wildcards" do
    it "should match internal links with wildcards" do
      cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.com/"], :external_urls => ["http://blog.domain_one.com/"])
      cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
      cobweb_links.internal?("http://images.domain_one.com/logo.png").should be_true
      cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
    end
    it "should match external links with wildcards" do
      cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://*.domain_one.com/"])
      cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_false
      cobweb_links.internal?("http://images.domain_one.com/logo.png").should be_false
      cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
    end
    it "should allow multiple wildcards" do
      cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.*.domain_one.com/"])
      cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_false
      cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
      cobweb_links.internal?("http://www.marketing.domain_one.com/pageone.html").should be_true
      cobweb_links.internal?("http://blog.designers.domain_one.com/pagetwo.html").should be_true
    end
    it "should allow multiple country tlds with wildcards" do
      cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.*/", "http://*.domain_one.*.*/"])
      cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
      cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_true
      cobweb_links.internal?("http://www.domain_one.co.uk/pageone.html").should be_true
      cobweb_links.internal?("http://blog.domain_one.co.uk/pageone.html").should be_true
      cobweb_links.internal?("http://www.domain_one.com.au/pageone.html").should be_true
      cobweb_links.internal?("http://blog.domain_one.com.au/pageone.html").should be_true
      cobweb_links.internal?("http://www.domain_one.ie/pageone.html").should be_true
      cobweb_links.internal?("http://blog.domain_one.ie/pageone.html").should be_true
    end
  end

end
@@ -186,22 +186,22 @@ describe Cobweb do
186
186
  describe "location setting" do
187
187
  it "Get should strip fragments" do
188
188
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
189
- Net::HTTP::Get.should_receive(:new).with("/")
189
+ Net::HTTP::Get.should_receive(:new).with("/", {})
190
190
  @cobweb.get("http://www.google.com/#ignore")
191
191
  end
192
192
  it "head should strip fragments" do
193
193
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
194
- Net::HTTP::Head.should_receive(:new).with("/").and_return(@mock_http_request)
194
+ Net::HTTP::Head.should_receive(:new).with("/", {}).and_return(@mock_http_request)
195
195
  @cobweb.head("http://www.google.com/#ignore")
196
196
  end
197
197
  it "get should not strip path" do
198
198
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
199
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff")
199
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {})
200
200
  @cobweb.get("http://www.google.com/path/to/stuff#ignore")
201
201
  end
202
202
  it "get should not strip query string" do
203
203
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
204
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string")
204
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {})
205
205
  @cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
206
206
  end
207
207
  end
@@ -29,7 +29,7 @@ describe ContentLinkParser do
29
29
  end
30
30
  it "should return the correct links" do
31
31
  links = @content_parser.links
32
- links.length.should == 4
32
+ links.length.should == 7
33
33
  end
34
34
  end
35
35
  describe "returning image links" do
@@ -92,7 +92,17 @@ describe ContentLinkParser do
92
92
  link_data.should be_an_instance_of Hash
93
93
 
94
94
  link_data.keys.length.should == 5
95
- link_data[:links].length.should == 4
95
+ link_data[:links].length.should == 7
96
+ end
97
+
98
+ it "should return all http and https links by default" do
99
+ links = @content_parser.all_links
100
+ links.count.should == 9
101
+ end
102
+
103
+ it "should return only valid_schemes supplied" do
104
+ links = @content_parser.all_links(:valid_schemes => [:https])
105
+ links.count.should == 1
96
106
  end
97
107
  end
98
108
 
@@ -23,7 +23,10 @@
23
23
 
24
24
  <body bgcolor="#FFFFFF"><!-- #BeginLibraryItem "/Library/navtop.lbi" --></p>
25
25
 
26
- <a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
26
+ <a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
27
+ <a href="mailto:stewart@theizone.co.uk">Click Here to email </a>
28
+ <a href="javascript:alert('javascript clicked');">click here for javscript</a>
29
+ <a href="https://sampleurl-a.com/">Click Here for SSL link to URL 1</a>
27
30
  <frameset><frame src="http://sampleurl-frame.com/"></frame></frameset>
28
31
 
29
32
  <map id="testmap"><area href="http://sampleurl-area"></area>></map>
@@ -1,14 +1,16 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
2
2
  require 'mock_redis'
3
3
 
4
+ ENVIRONMENT = "test"
5
+
4
6
  RSpec.configure do |config|
5
7
  config.before(:each) {
8
+
6
9
  redis_mock = double("redis")
7
10
  redis_mock.stub(:new).and_return(@redis_mock_object)
8
11
 
9
12
  #redis_mock.flushdb
10
13
 
11
-
12
14
  @default_headers = {"Cache-Control" => "private, max-age=0",
13
15
  "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
14
16
  "Expires" => "-1",
@@ -39,8 +41,8 @@ RSpec.configure do |config|
39
41
 
40
42
  Net::HTTP.stub!(:new).and_return(@mock_http_client)
41
43
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
42
- Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
43
- Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)
44
+ Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
45
+ Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
44
46
 
45
47
  Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
46
48
 
@@ -60,6 +62,7 @@ RSpec.configure do |config|
60
62
  @mock_http_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
61
63
  @mock_http_response.stub!(:content_length).and_return(1024)
62
64
  @mock_http_response.stub!(:body).and_return("asdf")
65
+ @mock_http_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
63
66
  @mock_http_response.stub!(:to_hash).and_return(@default_headers)
64
67
 
65
68
  @mock_http_redirect_response.stub!(:code).and_return(301)
@@ -69,6 +72,7 @@ RSpec.configure do |config|
69
72
  @mock_http_redirect_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
70
73
  @mock_http_redirect_response.stub!(:content_length).and_return(2048)
71
74
  @mock_http_redirect_response.stub!(:body).and_return("redirected body")
75
+ @mock_http_redirect_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
72
76
  @mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)
73
77
 
74
78
  @mock_http_redirect_response2.stub!(:code).and_return(301)
@@ -78,6 +82,7 @@ RSpec.configure do |config|
78
82
  @mock_http_redirect_response2.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
79
83
  @mock_http_redirect_response2.stub!(:content_length).and_return(2048)
80
84
  @mock_http_redirect_response2.stub!(:body).and_return("redirected body")
85
+ @mock_http_redirect_response2.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
81
86
  @mock_http_redirect_response2.stub!(:to_hash).and_return(@default_headers)
82
87
  }
83
88
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.44
4
+ version: 0.0.45
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-03 00:00:00.000000000 Z
12
+ date: 2012-05-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70139600146360 !ruby/object:Gem::Requirement
16
+ requirement: &70107297058680 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70139600146360
24
+ version_requirements: *70107297058680
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70139600145940 !ruby/object:Gem::Requirement
27
+ requirement: &70107297056880 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70139600145940
35
+ version_requirements: *70107297056880
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70139600145520 !ruby/object:Gem::Requirement
38
+ requirement: &70107297056360 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70139600145520
46
+ version_requirements: *70107297056360
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70139600145100 !ruby/object:Gem::Requirement
49
+ requirement: &70107297055840 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70139600145100
57
+ version_requirements: *70107297055840
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70139600144680 !ruby/object:Gem::Requirement
60
+ requirement: &70107297055120 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70139600144680
68
+ version_requirements: *70107297055120
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70139600144260 !ruby/object:Gem::Requirement
71
+ requirement: &70107297054400 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70139600144260
79
+ version_requirements: *70107297054400
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70139600143840 !ruby/object:Gem::Requirement
82
+ requirement: &70107297053900 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70139600143840
90
+ version_requirements: *70107297053900
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70139600143420 !ruby/object:Gem::Requirement
93
+ requirement: &70107297053360 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70139600143420
101
+ version_requirements: *70107297053360
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70139600143000 !ruby/object:Gem::Requirement
104
+ requirement: &70107297052800 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,18 +109,18 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70139600143000
112
+ version_requirements: *70107297052800
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70139600142580 !ruby/object:Gem::Requirement
115
+ requirement: &70107297052240 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
119
119
  - !ruby/object:Gem::Version
120
- version: '0'
120
+ version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70139600142580
123
+ version_requirements: *70107297052240
124
124
  description: Web Crawler that uses resque background job engine to allow you to cluster
125
125
  your crawl.
126
126
  email: stewart@rockwellcottage.com
@@ -131,6 +131,7 @@ extra_rdoc_files:
131
131
  files:
132
132
  - spec/cobweb/cobweb_crawler_spec.rb
133
133
  - spec/cobweb/cobweb_job_spec.rb
134
+ - spec/cobweb/cobweb_links_spec.rb
134
135
  - spec/cobweb/cobweb_spec.rb
135
136
  - spec/cobweb/content_link_parser_spec.rb
136
137
  - spec/samples/sample_html_links.html
@@ -139,7 +140,9 @@ files:
139
140
  - lib/cobweb.rb
140
141
  - lib/cobweb_crawler.rb
141
142
  - lib/cobweb_finished_job.rb
143
+ - lib/cobweb_links.rb
142
144
  - lib/cobweb_process_job.rb
145
+ - lib/cobweb_version.rb
143
146
  - lib/content_link_parser.rb
144
147
  - lib/crawl_job.rb
145
148
  - lib/encoding_safe_process_job.rb