cobweb 0.0.44 → 0.0.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
 
- h1. Cobweb v0.0.44
+ h1. Cobweb v0.0.45
 
  h2. Intro
 
@@ -101,6 +101,20 @@ h3. Contributing/Testing
 
  Continuous integration testing is performed by the excellent Travis: http://travis-ci.org/#!/stewartmckee/cobweb
 
+ h2. Todo
+
+ * Tidy up classes with link parsing
+ * Refactoring of code to simplify design
+ * Remove requirement of redis from standalone crawler
+ * Add redis settings to standalone crawler (ie to connect to remote redis)
+ * Add ability to start and stop crawls from web interface
+ * Allow crawler to start as web interface only (ie not run crawls at start)
+ * Fix content encoding issue requiring separate process job
+
+ h3. Big changes
+
+ * Refactor into a module and refactor class names to remove cobweb and increase simplicity
+
  h2. License
 
  h3. The MIT License
@@ -20,7 +20,7 @@ class Cobweb
  # investigate using event machine for single threaded crawling
 
  def self.version
- "0.0.44"
+ CobwebVersion.version
  end
 
  def method_missing(method_sym, *arguments, &block)
@@ -56,9 +56,9 @@ class Cobweb
  :url => base_url
  }
 
- if @options[:internal_urls].empty?
+ if @options[:internal_urls].nil? || @options[:internal_urls].empty?
  uri = Addressable::URI.parse(base_url)
- @options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
+ @options[:internal_urls] = [[uri.scheme, "://", uri.host, "/*"].join]
  end
 
  request.merge!(@options)
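
For illustration, the change above means a crawl started without :internal_urls now defaults to a single wildcard pattern derived from the base URL. A minimal standalone sketch of that derivation, assuming only the addressable gem (the sample URL is made up):

require 'addressable/uri'

# Derive the default internal_urls pattern the same way Cobweb#new does above.
base_url = "http://www.example.com/some/page.html"   # hypothetical input
uri = Addressable::URI.parse(base_url)
internal_urls = [[uri.scheme, "://", uri.host, "/*"].join]
# => ["http://www.example.com/*"]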
@@ -79,6 +79,7 @@ class Cobweb
  def get(url, options = @options)
  raise "url cannot be nil" if url.nil?
  uri = Addressable::URI.parse(url)
+ uri.normalize!
  uri.fragment=nil
  url = uri.to_s
 
@@ -104,9 +105,6 @@ class Cobweb
  puts "Cache hit for #{url}" unless @options[:quiet]
  content = deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
  else
- # this url is valid for processing so lets get on with it
- #TODO the @http here is different from in head. Should it be? - in head we are using a method-scoped variable.
-
  # retrieve data
  unless @http && @http.address == uri.host && @http.port == uri.inferred_port
  puts "Creating connection to #{uri.host}..." unless @options[:quiet]
@@ -122,7 +120,11 @@ class Cobweb
  @http.open_timeout = @options[:timeout].to_i
  begin
  print "Retrieving #{url }... " unless @options[:quiet]
- request = Net::HTTP::Get.new uri.request_uri
+ request_options={}
+ if options[:cookies]
+ request_options[ 'Cookie']= options[:cookies]
+ end
+ request = Net::HTTP::Get.new uri.request_uri, request_options
 
  response = @http.request request
 
@@ -135,14 +137,11 @@ class Cobweb
  # decrement redirect limit
  redirect_limit = redirect_limit - 1
 
- # raise exception if we're being redirected to somewhere we've been redirected to in this content request
- #raise RedirectError("Loop detected in redirect for - #{url}") if content[:redirect_through].include? url
-
- # raise exception if redirect limit has reached 0
  raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
+ cookies = get_cookies(response)
 
  # get the content from redirect location
- content = get(url, options.merge(:redirect_limit => redirect_limit))
+ content = get(url, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
  content[:url] = uri.to_s
  content[:redirect_through] = [] if content[:redirect_through].nil?
  content[:redirect_through].insert(0, url)
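
To make the new cookie handling concrete: the cookies passed along here come from the get_cookies helper added later in this file, which keeps only the name=value part of each Set-Cookie header and joins them into a single Cookie header for the follow-up request. A rough sketch of that condensation (the header values are the ones used in the specs, not real cookies):

# What Net::HTTP's response.get_fields('set-cookie') returns: one string per Set-Cookie header.
set_cookie_fields = [
  "session=al98axx; expires=Fri, 31-Dec-1999 23:58:23",
  "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"
]

# Drop the attributes after the first '; ' and join into a single header value.
cookie_header = set_cookie_fields.map { |cookie| cookie.split('; ')[0] }.join('; ')
# => "session=al98axx; query=rubyscript"

# The redirected request is then built with it, as in the diff:
# request = Net::HTTP::Get.new(uri.request_uri, 'Cookie' => cookie_header)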
@@ -186,7 +185,7 @@ class Cobweb
  redis.expire unique_id, @options[:cache].to_i
  end
  rescue RedirectError => e
- puts "ERROR: #{e.message}"
+ puts "ERROR RedirectError: #{e.message}"
 
  ## generate a blank content
  content = {}
@@ -201,7 +200,7 @@ class Cobweb
  content[:links] = {}
 
  rescue SocketError => e
- puts "ERROR: SocketError#{e.message}"
+ puts "ERROR SocketError: #{e.message}"
 
  ## generate a blank content
  content = {}
@@ -233,10 +232,20 @@ class Cobweb
  end
  content
  end
-
+
+ def get_cookies(response)
+ all_cookies = response.get_fields('set-cookie')
+ cookies_array = Array.new
+ all_cookies.each { |cookie|
+ cookies_array.push(cookie.split('; ')[0])
+ }
+ cookies = cookies_array.join('; ')
+ end
+
  def head(url, options = @options)
  raise "url cannot be nil" if url.nil?
  uri = Addressable::URI.parse(url)
+ uri.normalize!
  uri.fragment=nil
  url = uri.to_s
 
@@ -255,37 +264,47 @@ class Cobweb
  redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
  end
 
- content = {}
+ content = {:base_url => url}
 
  # check if it has already been cached
  if redis.get("head-#{unique_id}") and @options[:cache]
  puts "Cache hit for #{url}" unless @options[:quiet]
  content = deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
  else
- print "Retrieving #{url }... " unless @options[:quiet]
-
  # retrieve data
- http = Net::HTTP.new(uri.host, uri.inferred_port)
+ unless @http && @http.address == uri.host && @http.port == uri.inferred_port
+ puts "Creating connection to #{uri.host}..." unless @options[:quiet]
+ @http = Net::HTTP.new(uri.host, uri.inferred_port)
+ end
  if uri.scheme == "https"
- http.use_ssl = true
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
- end
+ @http.use_ssl = true
+ @http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ end
 
  request_time = Time.now.to_f
- http.read_timeout = @options[:timeout].to_i
- http.open_timeout = @options[:timeout].to_i
-
- begin
- request = Net::HTTP::Head.new uri.request_uri
- response = http.request request
+ @http.read_timeout = @options[:timeout].to_i
+ @http.open_timeout = @options[:timeout].to_i
+ begin
+ print "Retrieving #{url }... " unless @options[:quiet]
+ request_options={}
+ if options[:cookies]
+ request_options[ 'Cookie']= options[:cookies]
+ end
+ request = Net::HTTP::Head.new uri.request_uri, request_options
+
+ response = @http.request request
 
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
  puts "redirected... " unless @options[:quiet]
+
  url = UriHelper.join_no_fragment(uri, response['location'])
+
  redirect_limit = redirect_limit - 1
- options = options.clone
- options[:redirect_limit]=redirect_limit
- content = head(url, options)
+
+ raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
+ cookies = get_cookies(response)
+
+ content = head(url, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
  content[:url] = uri.to_s
  content[:redirect_through] = [] if content[:redirect_through].nil?
  content[:redirect_through].insert(0, url)
@@ -293,7 +312,7 @@ class Cobweb
  content[:url] = uri.to_s
  content[:status_code] = response.code.to_i
  unless response.content_type.nil?
- content[:mime_type] = response.content_type.split(";")[0].strip
+ content[:mime_type] = response.content_type.split(";")[0].strip
  if response["Content-Type"].include? ";"
  charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
  charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
@@ -310,8 +329,23 @@ class Cobweb
  puts "Not storing in cache as cache disabled" if @options[:debug]
  end
  end
+ rescue RedirectError => e
+ puts "ERROR RedirectError: #{e.message}"
+
+ ## generate a blank content
+ content = {}
+ content[:url] = uri.to_s
+ content[:response_time] = Time.now.to_f - request_time
+ content[:status_code] = 0
+ content[:length] = 0
+ content[:body] = ""
+ content[:error] = e.message
+ content[:mime_type] = "error/dnslookup"
+ content[:headers] = {}
+ content[:links] = {}
+
  rescue SocketError => e
- puts "ERROR: #{e.message}"
+ puts "ERROR SocketError: #{e.message}"
 
  ## generate a blank content
  content = {}
@@ -326,7 +360,7 @@ class Cobweb
  content[:links] = {}
 
  rescue Timeout::Error => e
- puts "ERROR: #{e.message}"
+ puts "ERROR Timeout::Error: #{e.message}"
 
  ## generate a blank content
  content = {}
@@ -62,17 +62,17 @@ class CobwebCrawler
 
  @redis.sadd "crawled", url.to_s
  @redis.incr "crawl-counter"
-
- internal_links = all_links_from_content(content).map{|link| link.to_s}
+
+ internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
 
+ # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
+ cobweb_links = CobwebLinks.new(@options)
+ internal_links = internal_links.select{|link| cobweb_links.internal?(link)}
+
  # reject the link if we've crawled it or queued it
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
  internal_links.reject!{|link| @redis.sismember("queued", link)}
-
-
- # select the link if its internal
- internal_links.select!{|link| internal_link?(link)}
-
+
  internal_links.each do |link|
  puts "Added #{link.to_s} to queue" if @debug
  @redis.sadd "queued", link
@@ -85,10 +85,11 @@ class CobwebCrawler
  @stats.update_statistics(content, crawl_counter, queue_counter)
  @stats.update_status("Completed #{url}.")
  puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @debug
-
- yield content, @statistic if block_given?
+
+ yield content, @stats.get_statistics if block_given?
 
  rescue => e
+ raise e if ENVIRONMENT == "test"
  puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
  ap e
  ap e.backtrace
@@ -105,33 +106,6 @@ class CobwebCrawler
  @stats.get_statistics
  end
 
-
- def internal_link?(link)
- puts "Checking internal link for: #{link}" if @debug
- valid_link = true
- internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
- puts "Matching against #{pattern.source}" if @debug
- if link.match(pattern)
- puts "Matched as internal" if @debug
- return true
- end
- end
- puts "Didn't match any pattern so marked as not internal" if @debug
- false
- end
-
- def internal_patterns
- @internal_patterns ||= @redis.smembers("internal_urls")
- end
-
- def all_links_from_content(content)
- links = content[:links].keys.map{|key| content[:links][key]}.flatten
- links.reject!{|link| link.cobweb_starts_with?("javascript:")}
- links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
- links.select!{|link| link.scheme == "http" || link.scheme == "https"}
- links.uniq
- links
- end
  end
 
  class String
@@ -0,0 +1,48 @@
+ class CobwebLinks
+
+ # processes links supplied to it
+ def initialize(options={})
+ @options = options
+
+ raise InternalUrlsMissingError, ":internal_urls is required" unless @options.has_key? :internal_urls
+ raise InvalidUrlsError, ":internal_urls must be an array" unless @options[:internal_urls].kind_of? Array
+ raise InvalidUrlsError, ":external_urls must be an array" unless !@options.has_key?(:external_urls) || @options[:external_urls].kind_of?(Array)
+ @options[:external_urls] = [] unless @options.has_key? :external_urls
+ @options[:debug] = false unless @options.has_key? :debug
+
+ @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{pattern.gsub(".", "\\.").gsub("*", ".*?")}")}
+ @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{pattern.gsub(".", "\\.").gsub("*", ".*?")}")}
+
+ end
+
+ def internal?(link)
+ if @options[:debug]
+ puts "--------------------------------"
+ puts "Link: #{link}"
+ puts "Internal matches"
+ ap @internal_patterns.select{|pattern| link.match(pattern)}
+ puts "External matches"
+ ap @external_patterns.select{|pattern| link.match(pattern)}
+ end
+ !@internal_patterns.select{|pattern| link.match(pattern)}.empty? && @external_patterns.select{|pattern| link.match(pattern)}.empty?
+ end
+
+ def external?(link)
+ if @options[:debug]
+ puts "--------------------------------"
+ puts "Link: #{link}"
+ puts "Internal matches"
+ ap @internal_patterns.select{|pattern| link.match(pattern)}
+ puts "External matches"
+ ap @external_patterns.select{|pattern| link.match(pattern)}
+ end
+ @internal_patterns.select{|pattern| link.match(pattern)}.empty? || !@external_patterns.select{|pattern| link.match(pattern)}.empty?
+ end
+
+ end
+
+ class InternalUrlsMissingError < Exception
+ end
+ class InvalidUrlsError < Exception
+ end
+
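
To show how the wildcard patterns above behave: each entry in :internal_urls and :external_urls is escaped and compiled into an anchored regular expression, with * standing in for any run of characters. A small sketch using the same conversion (the URLs are illustrative):

pattern = "http://*.domain_one.com/"

# '.' is escaped, '*' becomes a non-greedy '.*?', and the match is anchored to the start of the link.
regex = Regexp.new("^#{pattern.gsub(".", "\\.").gsub("*", ".*?")}")

!!regex.match("http://blog.domain_one.com/page.html")   # => true
!!regex.match("http://domain_two.com/page.html")        # => false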
@@ -0,0 +1,6 @@
+ class CobwebVersion
+ def self.version
+ "0.0.45"
+ end
+
+ end
@@ -37,9 +37,13 @@ class ContentLinkParser
  data
  end
 
- def all_links
+ def all_links(options = {})
+ options[:valid_schemes] = [:http, :https] unless options.has_key? :valid_schemes
  data = link_data
- data.keys.map{|key| data[key]}.flatten.uniq
+ data = data.keys.map{|key| data[key]}.flatten.uniq
+ links = data.select{|link| options[:valid_schemes].include? link.split(':')[0].to_sym}
+ links = links.map{|link| UriHelper.join_no_fragment(@url, link).to_s }
+ links
  end
 
  def method_missing(m)
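
As a usage note, all_links now resolves each link against the page URL and filters by scheme before returning it. A hedged example of the call (the constructor arguments mirror the crawler code elsewhere in this diff; html_body is a placeholder for the page markup):

parser = ContentLinkParser.new("http://www.example.com/", html_body)

parser.all_links                               # absolute http and https links (the default)
parser.all_links(:valid_schemes => [:https])   # restrict to https links only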
@@ -38,16 +38,17 @@ class CrawlJob
  # set the base url if this is the first page
  set_base_url @redis, content, content_request
 
+ @cobweb_links = CobwebLinks.new(content_request)
  if within_queue_limits?(content_request[:crawl_limit])
- internal_links = all_links_from_content(content).map{|link| link.to_s}
+ internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
+
+ # select the link if its internal
+ internal_links.select!{|link| @cobweb_links.internal?(link)}
 
  # reject the link if we've crawled it or queued it
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
  internal_links.reject!{|link| @redis.sismember("queued", link)}
-
-
- # select the link if its internal
- internal_links.select!{|link| internal_link?(link)}
-
+
  internal_links.each do |link|
  enqueue_content(content_request, link) if within_queue_limits?(content_request[:crawl_limit])
  end
@@ -83,11 +84,11 @@ class CrawlJob
  def self.finished(content_request)
  # finished
  @stats.end_crawl(content_request)
- Resque.enqueue(const_get(content_request[:crawl_finished_queue]), Stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
  end
 
  def self.send_to_processing_queue(content, content_request)
- content_to_send = content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
+ content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
  if content_request[:use_encoding_safe_process_job]
  content_to_send[:body] = Base64.encode64(content[:body])
  content_to_send[:processing_queue] = content_request[:processing_queue]
@@ -119,33 +120,6 @@ class CrawlJob
  end
  end
 
- def self.internal_link?(link)
- puts "Checking internal link for: #{link}" if @debug
- valid_link = true
- internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
- puts "Matching against #{pattern.source}" if @debug
- if link.match(pattern)
- puts "Matched as internal" if @debug
- return true
- end
- end
- puts "Didn't match any pattern so marked as not internal" if @debug
- false
- end
-
- def self.internal_patterns
- @internal_patterns ||= @redis.smembers("internal_urls")
- end
-
- def self.all_links_from_content(content)
- links = content[:links].keys.map{|key| content[:links][key]}.flatten
- links.reject!{|link| link.starts_with?("javascript:")}
- links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
- links.select!{|link| link.scheme == "http" || link.scheme == "https"}
- links.uniq
- links
- end
-
  def self.enqueue_content(content_request, link)
  new_request = content_request.clone
  new_request[:url] = link
@@ -0,0 +1,103 @@
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+ require File.expand_path(File.dirname(__FILE__) + '/../../lib/cobweb_links')
+
+ describe CobwebLinks do
+
+ before(:each) do
+
+ @base_url = "http://www.baseurl.com/"
+
+ @default_headers = {"Cache-Control" => "private, max-age=0",
+ "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
+ "Expires" => "-1",
+ "Content-Type" => "text/html; charset=UTF-8",
+ "Content-Encoding" => "gzip",
+ "Transfer-Encoding" => "chunked",
+ "Server" => "gws",
+ "X-XSS-Protection" => "1; mode=block"}
+
+ end
+
+
+ it "should generate a cobweb_links object" do
+ CobwebLinks.new(:internal_urls => [""]).should be_an_instance_of CobwebLinks
+ end
+
+ it "should raise error with no internal links" do
+ expect {CobwebLinks.new()}.to raise_error(InternalUrlsMissingError)
+ end
+ it "should not raise error with missing external links" do
+ expect {CobwebLinks.new(:internal_urls => ["http://domain_one.com/"])}.to_not raise_error(InternalUrlsMissingError)
+ end
+ it "should raise error with invalid internal links" do
+ expect {CobwebLinks.new(:internal_urls => "")}.to raise_error(InvalidUrlsError)
+ end
+ it "should raise error with invalid external links" do
+ expect {CobwebLinks.new(:internal_urls => [], :external_urls => "")}.to raise_error(InvalidUrlsError)
+ end
+
+
+ describe "internal and external links" do
+ it "should only return internal links" do
+ cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
+ cobweb_links.internal?("http://domain_one.com/pageone.html").should be_true
+ cobweb_links.internal?("http://domain_one.com/pagetwo.html").should be_true
+ end
+ it "should not return external links" do
+ cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_two.com/"])
+ cobweb_links.external?("http://domain_one.com/pageone.html").should be_false
+ cobweb_links.external?("http://domain_two.com/pageone.html").should be_true
+ cobweb_links.external?("http://external.com/pageone.html").should be_true
+ end
+ it "should override internal links with external links" do
+ cobweb_links = CobwebLinks.new(:internal_urls => ["http://domain_one.com/"], :external_urls => ["http://domain_one.com/blog"])
+ cobweb_links.internal?("http://domain_one.com/pageone.html").should be_true
+ cobweb_links.external?("http://domain_one.com/pageone.html").should be_false
+ cobweb_links.internal?("http://domain_one.com/blog/pageone.html").should be_false
+ cobweb_links.external?("http://domain_one.com/blog/pageone.html").should be_true
+ cobweb_links.internal?("http://domain_two.com/blog/pageone.html").should be_false
+ cobweb_links.external?("http://domain_two.com/blog/pageone.html").should be_true
+ end
+ end
+ it "should only match from beginning of url" do
+ cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://www.domain_two.com/"])
+ cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
+ cobweb_links.internal?("http://www.domain_two.com/pageone.html").should be_false
+ cobweb_links.internal?("http://www.domain_one.com/pageone.html?url=http://www.domain_two.com/pageone.html").should be_true
+ cobweb_links.internal?("http://www.domain_two.com/pageone.html?url=http://www.domain_one.com/pageone.html").should be_false
+ end
+
+ describe "using wildcards" do
+ it "should match internal links with wildcards" do
+ cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.com/"], :external_urls => ["http://blog.domain_one.com/"])
+ cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
+ cobweb_links.internal?("http://images.domain_one.com/logo.png").should be_true
+ cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
+ end
+ it "should match external links with wildcards" do
+ cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.domain_one.com/"], :external_urls => ["http://*.domain_one.com/"])
+ cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_false
+ cobweb_links.internal?("http://images.domain_one.com/logo.png").should be_false
+ cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
+ end
+ it "should allow multiple wildcards" do
+ cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.*.domain_one.com/"])
+ cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_false
+ cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_false
+ cobweb_links.internal?("http://www.marketing.domain_one.com/pageone.html").should be_true
+ cobweb_links.internal?("http://blog.designers.domain_one.com/pagetwo.html").should be_true
+ end
+ it "should allow multiple country tlds with wildcards" do
+ cobweb_links = CobwebLinks.new(:internal_urls => ["http://*.domain_one.*/", "http://*.domain_one.*.*/"])
+ cobweb_links.internal?("http://www.domain_one.com/pageone.html").should be_true
+ cobweb_links.internal?("http://blog.domain_one.com/pageone.html").should be_true
+ cobweb_links.internal?("http://www.domain_one.co.uk/pageone.html").should be_true
+ cobweb_links.internal?("http://blog.domain_one.co.uk/pageone.html").should be_true
+ cobweb_links.internal?("http://www.domain_one.com.au/pageone.html").should be_true
+ cobweb_links.internal?("http://blog.domain_one.com.au/pageone.html").should be_true
+ cobweb_links.internal?("http://www.domain_one.ie/pageone.html").should be_true
+ cobweb_links.internal?("http://blog.domain_one.ie/pageone.html").should be_true
+ end
+ end
+
+ end
@@ -186,22 +186,22 @@ describe Cobweb do
  describe "location setting" do
  it "Get should strip fragments" do
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
- Net::HTTP::Get.should_receive(:new).with("/")
+ Net::HTTP::Get.should_receive(:new).with("/", {})
  @cobweb.get("http://www.google.com/#ignore")
  end
  it "head should strip fragments" do
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
- Net::HTTP::Head.should_receive(:new).with("/").and_return(@mock_http_request)
+ Net::HTTP::Head.should_receive(:new).with("/", {}).and_return(@mock_http_request)
  @cobweb.head("http://www.google.com/#ignore")
  end
  it "get should not strip path" do
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff")
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {})
  @cobweb.get("http://www.google.com/path/to/stuff#ignore")
  end
  it "get should not strip query string" do
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string")
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {})
  @cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
  end
  end
@@ -29,7 +29,7 @@ describe ContentLinkParser do
  end
  it "should return the correct links" do
  links = @content_parser.links
- links.length.should == 4
+ links.length.should == 7
  end
  end
  describe "returning image links" do
@@ -92,7 +92,17 @@ describe ContentLinkParser do
  link_data.should be_an_instance_of Hash
 
  link_data.keys.length.should == 5
- link_data[:links].length.should == 4
+ link_data[:links].length.should == 7
+ end
+
+ it "should return all http and https links by default" do
+ links = @content_parser.all_links
+ links.count.should == 9
+ end
+
+ it "should return only valid_schemes supplied" do
+ links = @content_parser.all_links(:valid_schemes => [:https])
+ links.count.should == 1
  end
  end
 
@@ -23,7 +23,10 @@
 
  <body bgcolor="#FFFFFF"><!-- #BeginLibraryItem "/Library/navtop.lbi" --></p>
 
- <a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
+ <a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
+ <a href="mailto:stewart@theizone.co.uk">Click Here to email </a>
+ <a href="javascript:alert('javascript clicked');">click here for javscript</a>
+ <a href="https://sampleurl-a.com/">Click Here for SSL link to URL 1</a>
  <frameset><frame src="http://sampleurl-frame.com/"></frame></frameset>
 
  <map id="testmap"><area href="http://sampleurl-area"></area>></map>
@@ -1,14 +1,16 @@
  require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
  require 'mock_redis'
 
+ ENVIRONMENT = "test"
+
  RSpec.configure do |config|
  config.before(:each) {
+
  redis_mock = double("redis")
  redis_mock.stub(:new).and_return(@redis_mock_object)
 
  #redis_mock.flushdb
 
-
  @default_headers = {"Cache-Control" => "private, max-age=0",
  "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
  "Expires" => "-1",
@@ -39,8 +41,8 @@ RSpec.configure do |config|
 
  Net::HTTP.stub!(:new).and_return(@mock_http_client)
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
- Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
- Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)
+ Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
+ Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
 
  Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
 
@@ -60,6 +62,7 @@ RSpec.configure do |config|
  @mock_http_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
  @mock_http_response.stub!(:content_length).and_return(1024)
  @mock_http_response.stub!(:body).and_return("asdf")
+ @mock_http_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
  @mock_http_response.stub!(:to_hash).and_return(@default_headers)
 
  @mock_http_redirect_response.stub!(:code).and_return(301)
@@ -69,6 +72,7 @@ RSpec.configure do |config|
  @mock_http_redirect_response.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
  @mock_http_redirect_response.stub!(:content_length).and_return(2048)
  @mock_http_redirect_response.stub!(:body).and_return("redirected body")
+ @mock_http_redirect_response.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
  @mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)
 
  @mock_http_redirect_response2.stub!(:code).and_return(301)
@@ -78,6 +82,7 @@ RSpec.configure do |config|
  @mock_http_redirect_response2.stub!(:[]).with("Content-Encoding").and_return(@default_headers["Content-Encoding"])
  @mock_http_redirect_response2.stub!(:content_length).and_return(2048)
  @mock_http_redirect_response2.stub!(:body).and_return("redirected body")
+ @mock_http_redirect_response2.stub!(:get_fields).with('set-cookie').and_return(["session=al98axx; expires=Fri, 31-Dec-1999 23:58:23", "query=rubyscript; expires=Fri, 31-Dec-1999 23:58:23"])
  @mock_http_redirect_response2.stub!(:to_hash).and_return(@default_headers)
  }
 
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: cobweb
  version: !ruby/object:Gem::Version
- version: 0.0.44
+ version: 0.0.45
  prerelease:
  platform: ruby
  authors:
@@ -9,11 +9,11 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-05-03 00:00:00.000000000 Z
+ date: 2012-05-07 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: resque
- requirement: &70139600146360 !ruby/object:Gem::Requirement
+ requirement: &70107297058680 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70139600146360
+ version_requirements: *70107297058680
  - !ruby/object:Gem::Dependency
  name: redis
- requirement: &70139600145940 !ruby/object:Gem::Requirement
+ requirement: &70107297056880 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70139600145940
+ version_requirements: *70107297056880
  - !ruby/object:Gem::Dependency
  name: nokogiri
- requirement: &70139600145520 !ruby/object:Gem::Requirement
+ requirement: &70107297056360 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70139600145520
+ version_requirements: *70107297056360
  - !ruby/object:Gem::Dependency
  name: addressable
- requirement: &70139600145100 !ruby/object:Gem::Requirement
+ requirement: &70107297055840 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70139600145100
+ version_requirements: *70107297055840
  - !ruby/object:Gem::Dependency
  name: rspec
- requirement: &70139600144680 !ruby/object:Gem::Requirement
+ requirement: &70107297055120 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70139600144680
+ version_requirements: *70107297055120
  - !ruby/object:Gem::Dependency
  name: awesome_print
- requirement: &70139600144260 !ruby/object:Gem::Requirement
+ requirement: &70107297054400 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70139600144260
+ version_requirements: *70107297054400
  - !ruby/object:Gem::Dependency
  name: sinatra
- requirement: &70139600143840 !ruby/object:Gem::Requirement
+ requirement: &70107297053900 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70139600143840
+ version_requirements: *70107297053900
  - !ruby/object:Gem::Dependency
  name: thin
- requirement: &70139600143420 !ruby/object:Gem::Requirement
+ requirement: &70107297053360 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70139600143420
+ version_requirements: *70107297053360
  - !ruby/object:Gem::Dependency
  name: haml
- requirement: &70139600143000 !ruby/object:Gem::Requirement
+ requirement: &70107297052800 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -109,18 +109,18 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70139600143000
+ version_requirements: *70107297052800
  - !ruby/object:Gem::Dependency
  name: namespaced_redis
- requirement: &70139600142580 !ruby/object:Gem::Requirement
+ requirement: &70107297052240 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
- version: '0'
+ version: 1.0.2
  type: :runtime
  prerelease: false
- version_requirements: *70139600142580
+ version_requirements: *70107297052240
  description: Web Crawler that uses resque background job engine to allow you to cluster
  your crawl.
  email: stewart@rockwellcottage.com
@@ -131,6 +131,7 @@ extra_rdoc_files:
  files:
  - spec/cobweb/cobweb_crawler_spec.rb
  - spec/cobweb/cobweb_job_spec.rb
+ - spec/cobweb/cobweb_links_spec.rb
  - spec/cobweb/cobweb_spec.rb
  - spec/cobweb/content_link_parser_spec.rb
  - spec/samples/sample_html_links.html
@@ -139,7 +140,9 @@ files:
  - lib/cobweb.rb
  - lib/cobweb_crawler.rb
  - lib/cobweb_finished_job.rb
+ - lib/cobweb_links.rb
  - lib/cobweb_process_job.rb
+ - lib/cobweb_version.rb
  - lib/content_link_parser.rb
  - lib/crawl_job.rb
  - lib/encoding_safe_process_job.rb