magellan 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +11 -0
- data/VERSION.yml +4 -0
- data/lib/magellan.rb +13 -0
- data/lib/magellan/broken_link_tracker.rb +30 -0
- data/lib/magellan/cartographer.rb +70 -0
- data/lib/magellan/expected_links_tracker.rb +55 -0
- data/lib/magellan/explorer.rb +45 -0
- data/lib/magellan/extensions/array.rb +10 -0
- data/lib/magellan/extensions/mechanize_page.rb +10 -0
- data/lib/magellan/extensions/string.rb +21 -0
- data/lib/magellan/logger.rb +8 -0
- data/lib/magellan/rake/base_magellan_task.rb +40 -0
- data/lib/magellan/rake/broken_link_task.rb +33 -0
- data/lib/magellan/rake/expected_links_task.rb +35 -0
- data/lib/magellan/result.rb +20 -0
- data/spec/array_spec.rb +15 -0
- data/spec/broken_link_task_spec.rb +64 -0
- data/spec/broken_link_tracker_spec.rb +67 -0
- data/spec/cartographer_spec.rb +176 -0
- data/spec/expected_links_task_spec.rb +68 -0
- data/spec/expected_links_tracker_spec.rb +87 -0
- data/spec/explorer_spec.rb +72 -0
- data/spec/logger_spec.rb +15 -0
- data/spec/mechanize_page_spec.rb +44 -0
- data/spec/result_spec.rb +17 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/string_extensions_spec.rb +67 -0
- metadata +102 -0
data/README
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
Magellan: (alpha)
|
2
|
+
|
3
|
+
Currently the supported functionality is a rake task that crawls your website and finds any broken a[@href], img[@src], or script[@src] links.
|
4
|
+
|
5
|
+
Magellan::Rake::BrokenLinkTask.new do |t|
|
6
|
+
t.origin_url = "http://localhost:3000/"
|
7
|
+
t.explore_depth = 100
|
8
|
+
end
|
9
|
+
|
10
|
+
Assumptions:
|
11
|
+
This tool works best if you follow the rules of unobtrusive javascript and properly set the http status code header.
|
data/VERSION.yml
ADDED
data/lib/magellan.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'magellan/cartographer'
|
2
|
+
require 'magellan/explorer'
|
3
|
+
require 'magellan/result'
|
4
|
+
require 'magellan/broken_link_tracker'
|
5
|
+
require 'magellan/expected_links_tracker'
|
6
|
+
require 'magellan/logger'
|
7
|
+
require 'magellan/extensions/string'
|
8
|
+
require 'magellan/extensions/array'
|
9
|
+
require 'magellan/extensions/mechanize_page'
|
10
|
+
|
11
|
+
# Top-level namespace for the Magellan link crawler.
module Magellan
  # Library version string; frozen so callers cannot mutate the shared constant.
  # NOTE(review): the package header says 0.1.0 while this constant says 0.0.1 --
  # confirm which one is authoritative before changing the value.
  VERSION = '0.0.1'.freeze
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Observable is mixed in below; require it here so this file does not depend
# on another file having loaded it first.
require 'observer'

module Magellan
  # Observer that collects every crawl result whose status code starts with
  # "4" or "5" and remembers which page first linked to each resource.
  #
  # The tracker is observable itself: after every result it notifies its own
  # observers with (Time, passed), where passed is true for a non-failing result.
  class BrokenLinkTracker
    include Observable

    # Results recorded as broken, in the order they were received.
    attr_reader :broken_links

    def initialize
      @broken_links = []
      @first_linked_from = {}
    end

    # Observer callback.
    #
    # time   - timestamp passed by the notifier (not used here).
    # result - object responding to status_code, url and
    #          absolute_linked_resources.
    def update(time, result)
      # to_s lets numeric status codes work; core start_with? replaces the
      # ActiveSupport-only starts_with?.
      failed = result.status_code.to_s.start_with?("4", "5")
      @broken_links << result if failed
      changed
      notify_observers(Time.now, !failed)
      result.absolute_linked_resources.each do |linked_resource|
        # Only the first page seen linking to a resource is remembered.
        @first_linked_from[linked_resource] = result.url unless @first_linked_from.has_key?(linked_resource)
      end
    end

    # True once at least one broken link has been recorded.
    def failed?
      !@broken_links.empty?
    end

    # One line per broken link: its url, the page that first linked to it
    # (blank when unknown) and the status code that was returned.
    def failure_message
      @broken_links.map do |broken_link|
        "#{broken_link.url} first linked from: #{@first_linked_from[broken_link.url]} returned: #{broken_link.status_code}"
      end.join("\n")
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'activesupport'
|
2
|
+
require 'observer'
|
3
|
+
|
4
|
+
module Magellan
  # Drives a crawl: starting from the origin url it asks an Explorer to fetch
  # batches of urls, notifies observers with (Time, result) for every result,
  # and recurses into newly discovered links until the depth limit is reached.
  class Cartographer
    include Observable

    # settings - Hash with the keys:
    #   :origin_url       - url the crawl starts from
    #   :ignored_urls     - urls treated as already visited
    #   :domains          - url strings whose hosts may be crawled
    #   :depth_to_explore - maximum recursion depth (the origin is depth 1)
    #   :links_to_explore - [tag, attribute] pairs handed to the Explorer
    #   :trace            - when truthy, explored urls are printed to $stdout
    def initialize(settings)
      @origin_url = settings[:origin_url]
      # Copy the list: visited urls are appended to it during the crawl and
      # that must not leak into the caller's ignore list.
      @known_urls = (settings[:ignored_urls] || []).dup
      @domains = settings[:domains].map { |domain| URI.parse(domain) }
      @depth_to_explore = settings[:depth_to_explore]
      @links_we_want_to_explore = settings[:links_to_explore]
      @trace = settings[:trace]
    end

    # Starts the crawl at the origin url.
    def crawl
      recursive_explore([@origin_url], 1)
    end

    # Explores the given urls, then recurses (depth + 1) into the links found
    # on them, in batches of at most 40 urls.
    def recursive_explore(urls, depth)
      return unless i_am_not_too_deep?(depth)

      $stdout.puts "exploring:\n#{urls.join("\n")}" if @trace
      results = Explorer.new(urls, @links_we_want_to_explore).explore
      results.each do |result|
        changed
        notify_observers(Time.now, result)
        @known_urls << result.url.remove_fragment
        @known_urls << result.destination_url.remove_fragment
        remove_javascript_and_print_warning result
      end

      candidates = results.map { |result| result.absolute_linked_resources }.flatten.uniq
      #TODO: handle any other url parsing error
      candidates = candidates.select { |url| a_domain_we_care_about?(url) }
      candidates = candidates.reject { |url| i_have_seen_this_url_before?(url) }
      # each_slice is the core equivalent of the project's Array#chunk(max_size).
      candidates.each_slice(40) do |batch|
        # Recursing into an earlier batch may already have visited urls that
        # sit in this one, so filter again right before exploring.
        unseen = batch.reject { |url| i_have_seen_this_url_before?(url) }
        recursive_explore(unseen, depth + 1) unless unseen.empty?
      end
    end

    # True when the url (ignoring its fragment) was visited or ignored.
    def i_have_seen_this_url_before?(url)
      @known_urls.include?(url.remove_fragment)
    end

    def i_am_not_too_deep?(depth)
      depth <= @depth_to_explore
    end

    # True when the url's host equals the host of one of the configured
    # domains. Urls that URI cannot parse fall back to a prefix comparison of
    # the url, minus its leading http(s)://, against each domain host.
    def a_domain_we_care_about?(url)
      host = URI.parse(url).host
      @domains.any? { |domain| domain.host == host }
    rescue StandardError
      without_scheme = url.sub(%r{\Ahttps?://}, '')
      @domains.any? { |domain| without_scheme.start_with?(domain.host) }
    end

    # Drops "javascript:" pseudo-links from the result's linked resources so
    # they are never explored.
    def remove_javascript_and_print_warning(result)
      result.linked_resources.delete_if do |linked_resource|
        #TODO: put this in the logger
        #$stderr.puts "Found obtrusive javascript: #{linked_resource} on page #{result.url}" if starts_with_javascript
        linked_resource.downcase.start_with?("javascript:")
      end
    end

  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Magellan
  # Observer that checks, for every HTML result whose url or destination url
  # matches a configured pattern, that the page links to the expected resource.
  # It also tracks which [pattern, expected link] pairs were ever evaluated.
  #
  # Its own observers are notified with (Time, passed) once per evaluated pair.
  class ExpectedLinksTracker
    include Observable
    attr_reader :errors

    # expected_patterns - list of [pattern, expected_link] pairs.
    def initialize(expected_patterns)
      @expected_patterns = expected_patterns
      @evaluated_expectations = {}
      @errors = []
    end

    # Observer callback; results that are not HTML are ignored.
    def update(time, result)
      return unless result.html_content?

      patterns_that_apply(result).each do |_pattern, expected_link|
        link_present = result.linked_resources.include?(expected_link)
        changed
        notify_observers(Time.now, link_present)
        next if link_present
        @errors << "#{result.url} did not contain a link to #{expected_link}"
      end
    end

    # Returns the pairs whose pattern matches the result's url or destination
    # url, and marks each of them as evaluated.
    def patterns_that_apply(result)
      applicable = @expected_patterns.select do |pattern, _expected_link|
        result.url =~ pattern || result.destination_url =~ pattern
      end
      applicable.each { |pair| @evaluated_expectations[pair] = nil }
      applicable
    end

    def has_errors?
      @errors.size > 0
    end

    def unmet_expecations?
      unmet_expecations.size > 0
    end

    def failed?
      unmet_expecations? || has_errors?
    end

    # Never-evaluated patterns first, then the recorded errors.
    def failure_message
      "#{unmet_expecations_messages}#{errors.join("\n")}"
    end

    def unmet_expecations_messages
      lines = unmet_expecations.map do |pattern, _expected_link|
        "#{pattern} was never evaluted during the crawl\n"
      end
      "\n\n" + lines.join
    end

    # Pairs that never matched any result seen so far.
    def unmet_expecations
      @expected_patterns - @evaluated_expectations.keys
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'hpricot'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'ostruct'
|
4
|
+
|
5
|
+
module Magellan
  # Fetches a batch of urls concurrently (one thread per url) and turns every
  # response into a Result.
  class Explorer
    # Content type recorded when the response does not provide one.
    UNKNOWN_CONTENT = "unknown"

    # urls  - urls to fetch.
    # links - [tag, attribute] pairs describing which links to extract from
    #         HTML pages.
    def initialize(urls,links)
      @links = links
      @urls = urls
    end

    # Starts all fetches first, then collects the results in url order.
    def explore
      threads = @urls.map { |url| Thread.new { explore_a(url) } }
      threads.map { |thread| thread.value }
    end

    # Fetches a single url and returns a Result.
    #
    # HTTP error responses are reported with their response code; a timeout is
    # reported with status "505". In both cases the result has no links and
    # UNKNOWN_CONTENT as its content type.
    def explore_a(url)
      agent = WWW::Mechanize.new
      agent.user_agent = "Ruby/#{RUBY_VERSION}"
      doc = agent.get(url)
      #TODO: clean this up, this is very hacky, I would rather pass in a hpricot doc to create a result
      content_type = doc.respond_to?(:content_type) ? doc.content_type : nil
      # A response without a Content-Type header must not crash the crawl.
      content_type ||= UNKNOWN_CONTENT
      links = content_type.start_with?("text/html") ? doc.links_to_other_documents(@links) : []
      Explorer.create_result(url, doc.uri.to_s, doc.code, links, content_type)
    rescue WWW::Mechanize::ResponseCodeError => the_error
      Explorer.create_result(url, url, the_error.response_code, [], UNKNOWN_CONTENT)
    rescue Timeout::Error
      Explorer.create_result(url, url, "505", [], UNKNOWN_CONTENT)
    end

    # Builds a Result; every link is converted to a String.
    def self.create_result(url,destination_url,status_code,links,content_type)
      Result.new(status_code, url, destination_url, links.map { |link| link.to_s }, content_type)
    end
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
# Link-extraction helpers added to Mechanize pages.
class WWW::Mechanize::Page
  # links_to_other_resources - list whose entries hold a tag selector as first
  #                            element and an attribute name as last element.
  #
  # Returns one flat list with the attribute values found for every entry.
  def links_to_other_documents(links_to_other_resources)
    values_per_selector = links_to_other_resources.map do |selector|
      get_attributes(selector.first, selector.last)
    end
    values_per_selector.flatten
  end

  # Returns the non-nil values of +attribute+ on every element matching +tag+.
  def get_attributes(tag,attribute)
    values = (self/tag).map { |element| element.attributes[attribute] }
    values.compact
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'activesupport'
|
2
|
+
require 'open-uri'
|
3
|
+
# Url helpers added to String.
class String
  # Resolves this (possibly relative) link against origin_url.
  #
  # Surrounding whitespace is ignored. A link that consists only of a query
  # ("?a=b") replaces the query of origin_url. When the link cannot be
  # resolved the receiver is returned unchanged.
  def to_absolute_url(origin_url)
    #BUG in URI.join? URI.join('http://www.google.com/index.html?foo=b','?foo=a') # => http://www.google.com/?foo=a
    stripped = strip
    if stripped.start_with?('?')
      # Drop the origin's query AND fragment; keeping the fragment would put
      # the new query behind the '#'.
      origin_url.gsub(/[?#].*/, '') + stripped
    else
      URI.join(origin_url, stripped).to_s
    end
  rescue StandardError
    # Best effort: unparsable links are handed back as they are.
    self
  end

  # Returns the string without its "#fragment" part.
  def remove_fragment
    gsub(/#.*/, '')
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
|
2
|
+
#TODO: this is not a good place to use a template method - violates Liskov substitution principle
|
3
|
+
module Magellan
  module Rake
    # Base class for the Magellan rake tasks. Subclasses supply description,
    # links_to_explore, create_observer and success_message; this class
    # defines the rake task that wires them to a Cartographer crawl.
    class BaseMagellanTask < ::Rake::TaskLib
      attr_accessor :origin_url, :explore_depth, :ignored_urls

      # name - name of the rake task to define. The new task object is
      #        yielded for configuration before the task is defined.
      def initialize(name)
        @ignored_urls = []
        @name = name
        yield self if block_given?
        define
      end

      # Defines the rake task. When run it crawls from origin_url, prints the
      # observer's failure message to STDERR and exits with status 1 on
      # failure, or prints the success message to $stdout otherwise.
      def define
        desc description
        task @name do
          cartographer = Magellan::Cartographer.new(
            :origin_url => origin_url,
            :depth_to_explore => explore_depth,
            :domains => [origin_url],
            :ignored_urls => ignored_urls,
            :links_to_explore => links_to_explore,
            :trace => ENV['TRACE']
          )
          tracker = create_observer
          tracker.add_observer(Magellan::Logger.new)
          cartographer.add_observer(tracker)
          cartographer.crawl

          unless tracker.failed?
            $stdout.puts "\n" + success_message
            next
          end
          STDERR.puts tracker.failure_message
          exit 1
        end
      end

    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/tasklib'
|
3
|
+
require 'magellan'
|
4
|
+
require 'magellan/rake/base_magellan_task'
|
5
|
+
|
6
|
+
module Magellan
  module Rake

    # Rake task that crawls a site and fails when broken links are found.
    class BrokenLinkTask < BaseMagellanTask
      # name - rake task name.
      def initialize(name = "magellan:explore")
        super
      end

      def description
        "explore #{origin_url} for broken links"
      end

      # Tag/attribute pairs whose targets are crawled.
      def links_to_explore
        [%w[a href], %w[script src], %w[img src]]
      end

      def create_observer
        Magellan::BrokenLinkTracker.new
      end

      def success_message
        "No broken links were found!"
      end

    end

  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/tasklib'
|
3
|
+
require 'magellan'
|
4
|
+
require 'magellan/rake/base_magellan_task'
|
5
|
+
|
6
|
+
module Magellan
  module Rake

    # Rake task that crawls a site and checks that pages matching the given
    # patterns contain the expected links.
    class ExpectedLinksTask < BaseMagellanTask
      # Value handed to ExpectedLinksTracker.new when the observer is created.
      attr_accessor :patterns_and_expected_links

      # name - rake task name.
      def initialize(name = "magellan:check_links")
        super
      end

      def description
        "Explore #{origin_url} and find check if all given patterns are matched"
      end

      # Only anchor hrefs are followed by this task.
      def links_to_explore
        [%w[a href]]
      end

      def create_observer
        Magellan::ExpectedLinksTracker.new(patterns_and_expected_links)
      end

      def success_message
        "All expected links found!"
      end

    end

  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Magellan
  # Value object describing the outcome of fetching one url.
  class Result
    attr_reader :status_code,:url,:destination_url,:linked_resources

    # status_code      - status code of the response
    # url              - url that was requested
    # destination_url  - url the response finally came from
    # linked_resources - links found in the response
    # content_type     - content type of the response
    def initialize(status_code,url,destination_url,linked_resources,content_type)
      @status_code = status_code
      @url = url
      @destination_url = destination_url
      @linked_resources = linked_resources
      @content_type = content_type
    end

    # The linked resources resolved against the destination url.
    def absolute_linked_resources
      linked_resources.map { |linked_resource| linked_resource.to_s.to_absolute_url(destination_url) }.compact
    end

    # True when the content type starts with "text/html"; a missing content
    # type counts as not HTML.
    def html_content?
      @content_type.to_s.start_with?("text/html")
    end
  end
end
|
data/spec/array_spec.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Specs for the Array#chunk extension.
describe "Array Extensions" do
  it "should be able to break up a array into chunks with a max size" do
    chunks = [1,2,3,4,5].chunk(3)
    chunks.size.should eql(2)
    chunks.first.should eql([1,2,3])
    chunks.last.should eql([4,5])
  end

  # Previously this example reused the description of the one above.
  it "should put every element in its own chunk when the max size is one" do
    chunks = [1,2,3,4,5].chunk(1)
    chunks.size.should eql(5)
    chunks.first.should eql([1])
  end

end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# Specs for the broken-link rake task; page fetching is always mocked.
describe "Magellan BrokenLinkTask" do

  before :all do
    @rake = Rake::Application.new
    Rake.application = @rake
    @file_name = File.dirname(__FILE__) + "/../lib/magellan/rake/broken_link_task.rb"
  end

  before :each do
    load @file_name
    $stdout.stubs(:putc)
  end

  after :all do
    Rake.application = nil
  end

  it "should create a rake task" do
    Magellan::Rake::BrokenLinkTask.new
    tasks.include?("magellan:explore").should be_true
  end

  it "should explore when task is invoked" do
    define_task("invoke_task", "http://localhost:8080")
    Magellan::Explorer.any_instance.expects(:explore_a).once.with("http://localhost:8080").returns(create_result("http://localhost:8080","200"))
    $stdout.expects(:puts) #passed message
    @rake.invoke_task("invoke_task")
  end

  it "should raise exception when broken links are found" do
    define_task("exception_task", "http://canrailsscale.com")
    $stderr.expects(:puts)
    Magellan::Explorer.any_instance.stubs(:explore_a).once.with("http://canrailsscale.com").returns(create_result("http://canrailsscale.com","500"))
    lambda {@rake.invoke_task("exception_task")}.should raise_error
  end

  it "should attach logger" do
    define_task("logger_test", "http://canrailsscale.com")
    $stderr.stubs(:puts)
    Magellan::Logger.any_instance.expects(:update)
    Magellan::Explorer.any_instance.stubs(:explore_a).once.with("http://canrailsscale.com").returns(create_result("http://canrailsscale.com","500"))
    lambda {@rake.invoke_task("logger_test")}.should raise_error
  end

  # Defines a BrokenLinkTask that crawls one level deep from the given url.
  def define_task(name, url)
    Magellan::Rake::BrokenLinkTask.new(name) do |t|
      t.explore_depth = 1
      t.origin_url = url
    end
  end

  def create_result(url,status_code)
    Magellan::Explorer.create_result(url,url,status_code, [],"foo")
  end

  # Names of all tasks known to the rake application under test.
  def tasks
    @rake.tasks.map { |task| task.name }
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Specs for Magellan::BrokenLinkTracker.
describe Magellan::BrokenLinkTracker do

  it "should not report broken links if there are none" do
    broken_link_tracker = Magellan::BrokenLinkTracker.new
    broken_link_tracker.update(Time.now,create_success_result('http://www.foo.com',['jalskdjflakjsf']))
    broken_link_tracker.failed?.should be_false
  end

  it "should record links by absolute_url" do
    broken_link_tracker = Magellan::BrokenLinkTracker.new
    broken_link_tracker.update(Time.now,create_success_result('http://www.bozo.com/foople.html',['/apples.html']))
    broken_link_tracker.update(Time.now,create_result('http://www.bozo.com/apples.html',"404",[]))
    broken_link_tracker.failure_message.should include("http://www.bozo.com/foople.html")
  end

  it "should only record broken links errors" do
    broken_link_tracker = Magellan::BrokenLinkTracker.new
    broken_link_tracker.update(Time.now,create_success_result('http://www.foo.com',['http://www.google.com']))
    broken_link_tracker.update(Time.now,create_result('http://www.foo.com/404',"404",[]))
    broken_link_tracker.failed?.should be_true
    broken_link_tracker.broken_links.size.should eql(1)
  end

  it "should record 4** errors" do
    broken_link_tracker = Magellan::BrokenLinkTracker.new
    broken_link_tracker.update(Time.now,create_result('http://www.foo.com/404',"404",[]))
    broken_link_tracker.broken_links.first.status_code.should eql('404')
  end

  it "have url and status code in the error message" do
    broken_link_tracker = Magellan::BrokenLinkTracker.new
    broken_link_tracker.update(Time.now,create_result('broke url',"404",[]))
    broken_link_tracker.failure_message.should include('broke url')
    broken_link_tracker.failure_message.should include("404")
  end

  it "should record 5** errors" do
    broken_link_tracker = Magellan::BrokenLinkTracker.new
    broken_link_tracker.update(Time.now,create_result('fooz',"500",[]))
    broken_link_tracker.broken_links.first.status_code.should eql('500')
  end

  it "should update the observer with a pass" do
    tracker = Magellan::BrokenLinkTracker.new
    tracker.add_observer(Magellan::Logger.new)
    $stdout.expects(:putc).with('.')
    tracker.update(Time.now,Magellan::Result.new('200','/zoro','/zoro',['/about_us.html'],"text/html"))
  end

  # This example checks the failing case ('F'); it used to carry the same
  # "with a pass" description as the example above.
  it "should update the observer with a failure" do
    tracker = Magellan::BrokenLinkTracker.new
    tracker.add_observer(Magellan::Logger.new)
    $stdout.expects(:putc).with('F')
    tracker.update(Time.now,Magellan::Result.new('404','/zoro','/zoro',['/fail_about_us.html'],"text/html"))
  end

  # Builds a 200 result for url with the given linked resources.
  def create_success_result(url,linked_resources)
    create_result(url,"200",linked_resources)
  end

  # Builds a result whose url and destination url are the same.
  def create_result(url,status_code, linked_resources)
    Magellan::Result.new(status_code,url,url,linked_resources,"foo")
  end
end
|
@@ -0,0 +1,176 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Specs for Magellan::Cartographer. Every page fetch is mocked through
# Explorer#explore_a, so no example touches the network.
describe Magellan::Cartographer do

  it "should not visit the same url more then once" do
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ['http://www.google.com'])
    crawl(origin_url)
  end

  it "should not visit the origin url more then once if it finds a link with a finishing /" do
    pending
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ['http://www.google.com/'])
    crawl(origin_url)
  end

  it "should try to explore urls in the domain we care about that have non ascii characters in them" do
    origin_url = "http://www.reddit.com"
    expect_exploration_of(origin_url, ["http://www.reddit.com/r/science/comments/87dk7/cold_fusion_is_a_pipe_dream_but_μcatalyzed_cool/","http://www.domainwedontcareabout.com/μ"])
    expect_exploration_of("http://www.reddit.com/r/science/comments/87dk7/cold_fusion_is_a_pipe_dream_but_μcatalyzed_cool/")
    crawl(origin_url)
  end

  it "should not visit the same url more then once if they differ by fragment id" do
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ['http://www.google.com#foo'])
    crawl(origin_url)
  end

  it "should notify observers when a result comes in" do
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ['http://www.google.com'])
    cartographer = Magellan::Cartographer.new(settings(origin_url))
    foo = Object.new
    foo.expects(:update)
    cartographer.add_observer(foo)
    cartographer.crawl
  end

  it "should notify observers everytime a result comes in" do
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ['http://www.google.com/foo.html','http://www.google.com/bar.html'])
    expect_exploration_of('http://www.google.com/foo.html')
    expect_exploration_of('http://www.google.com/bar.html')
    cartographer = Magellan::Cartographer.new(settings(origin_url))
    foo = Object.new
    foo.expects(:update).times(3)
    cartographer.add_observer(foo)
    cartographer.crawl
  end

  it "should explore other linked resources" do
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ['http://www.google.com/foo.html'])
    expect_exploration_of('http://www.google.com/foo.html')
    crawl(origin_url)
  end

  it "should not explore ignored urls" do
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ['http://www.google.com/foo.html','http://www.google.com/ignoreme.html'])
    expect_exploration_of('http://www.google.com/foo.html')
    crawl(origin_url,3,[origin_url],['http://www.google.com/ignoreme.html'])
  end

  it "should not explore the same url more then once" do
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ['http://www.google.com/foo.html','http://www.google.com/foo.html'])
    expect_exploration_of('http://www.google.com/foo.html')
    crawl(origin_url)
  end

  it "should be able to specify crawlable domains" do
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ['http://www.foo.com'])
    expect_exploration_of('http://www.foo.com', ['http://www.bar.com'])
    crawl(origin_url, 5,['http://www.google.com','http://www.foo.com'])
  end

  it "should explore relative links" do
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ['http://www.google.com/foo.html'])
    expect_exploration_of('http://www.google.com/foo.html', ['/foo2.html'])
    expect_exploration_of('http://www.google.com/foo2.html')
    crawl(origin_url)
  end

  it "should go n layers deep into a site" do
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ['http://www.google.com/foo.html'])
    expect_exploration_of('http://www.google.com/foo.html', ['http://www.google.com/foo2.html'])
    expect_exploration_of('http://www.google.com/foo2.html', ['http://www.google.com/foo3.html'])
    crawl(origin_url,3)
  end

  it "should use host to determine if we are in a allowed domain" do
    origin_url = "http://www.google.com/jskfjlsajfd"
    cartographer = Magellan::Cartographer.new(settings(origin_url))
    cartographer.a_domain_we_care_about?("http://www.google.com/index.html").should be_true
  end

  it "should not explore js urls and print warnings if they are found, obtrusive javascript is bad mmkay" do
    origin_url = "http://www.google.com"
    expect_exploration_of(origin_url, ["javascript:bookmarksite('ThoughtWorks Studios', 'http://studios.thoughtworks.com')",'http://www.google.com/foo'])
    expect_exploration_of('http://www.google.com/foo')
    crawl(origin_url)
  end

  #<a alex.hal9000@gmail.com="" href="mailto:PWang@thoughtworks.com,">PWang@thoughtworks.com, alex.hal9000@gmail.com</a>

  it "should not die on checking the domain on invalid urls" do
    origin_url = "http://www.google.com/adsfaf"
    cartographer = Magellan::Cartographer.new(settings(origin_url))
    cartographer.a_domain_we_care_about?("mailto:PWang@thoughtworks.com,").should be_false
  end

  it "should not explore mailto urls" do
    origin_url = "http://www.google.com/adsfaf"
    expect_exploration_of(origin_url, ["mailto:foo"])
    crawl(origin_url)
  end

  it "should puts out urls if the trace is enabled" do
    origin_url = "http://www.google.com/adsfaf"
    Magellan::Explorer.any_instance.stubs(:explore_a).once.with(origin_url).returns(create_success_result([]))
    cartographer = Magellan::Cartographer.new(settings(origin_url).merge( {:trace=> true}))
    $stdout.expects(:puts).with {|value| value.include?(origin_url)}
    cartographer.crawl
  end

  it "should not puts if the trace is disabled" do
    origin_url = "http://www.google.com/adsfaf"
    Magellan::Explorer.any_instance.stubs(:explore_a).once.with(origin_url).returns(create_success_result([]))
    cartographer = Magellan::Cartographer.new(settings(origin_url).merge( {:trace=> false}))
    $stdout.expects(:puts).never
    cartographer.crawl
  end

  it "should record the source and the destination url in known urls" do
    origin_url = "http://studios.thoughtworks.com/cruise"
    destination_url = "http://studios.thoughtworks.com/cruise-continuous-integration"
    # Mock the fetch (a result whose destination differs from the requested
    # url) instead of requesting the real site, which made this example
    # depend on the network.
    redirected = Magellan::Result.new("200",origin_url,destination_url,[],"text/html")
    Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(redirected)
    cartographer = Magellan::Cartographer.new(settings(origin_url, 1))
    cartographer.crawl
    cartographer.i_have_seen_this_url_before?(origin_url).should be_true
    cartographer.i_have_seen_this_url_before?(destination_url).should be_true
  end

  it "should go through a entire site if layers to explore is set to -1"
  it "should explore n layers into external domains"

  # Expects exactly one fetch of url, answering with a 200 HTML result that
  # links to linked_resources.
  def expect_exploration_of(url, linked_resources = [])
    Magellan::Explorer.any_instance.expects(:explore_a).once.with(url).returns(create_success_result(linked_resources))
  end

  # Builds a cartographer from settings(*settings_arguments) and crawls.
  def crawl(*settings_arguments)
    Magellan::Cartographer.new(settings(*settings_arguments)).crawl
  end

  def create_success_result(linked_resources)
    create_result("200",linked_resources)
  end

  def settings(origin_url,depth=5,domains = [origin_url], ignored_urls=[])
    {:origin_url => origin_url, :depth_to_explore => depth, :domains => domains, :ignored_urls =>ignored_urls, :links_to_explore => [["a","href"]] }
  end

  def create_result(status_code, linked_resources)
    Magellan::Result.new(status_code,"http://www.google.com","http://www.google.com",linked_resources,"text/html")
  end

end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# Specs for the expected-links rake task; page fetching is always mocked.
describe "Magellan ExpectedLinksTask" do

  before :all do
    @rake = Rake::Application.new
    Rake.application = @rake
    @file_name = File.dirname(__FILE__) + "/../lib/magellan/rake/expected_links_task.rb"
  end

  before :each do
    $stdout.stubs(:putc)
    load @file_name
  end

  after :all do
    Rake.application = nil
  end

  it "should create a rake task" do
    Magellan::Rake::ExpectedLinksTask.new
    tasks.include?("magellan:check_links").should be_true
  end

  it "should explore when task is invoked" do
    define_task("some_task", [], "http://localhost:8080")
    $stdout.expects(:puts)
    Magellan::Explorer.any_instance.expects(:explore_a).once.with("http://localhost:8080").returns(create_result("http://localhost:8080","200"))
    @rake.invoke_task("some_task")
  end


  it "should notify a expected link tracker when a task is invoked" do
    define_task("invoke_expected_link_tracker", [], "http://localhost:8080")
    $stdout.expects(:puts)
    Magellan::Explorer.any_instance.stubs(:explore_a).once.with("http://localhost:8080").returns(create_result("http://localhost:8080","200"))
    Magellan::ExpectedLinksTracker.any_instance.expects(:update).once
    @rake.invoke_task("invoke_expected_link_tracker")
  end

  it "should fail the rake task if expected links did not exist or rules did not evaluate to be true" do
    define_task("exception_raising_task", [[/.*/,'/about_us.html']], "http://canrailsscale.com")
    $stderr.expects(:puts)
    Magellan::Explorer.any_instance.stubs(:explore_a).once.with("http://canrailsscale.com").returns(create_result("http://canrailsscale.com","200"))
    lambda {@rake.invoke_task("exception_raising_task")}.should raise_error
  end

  # Defines an ExpectedLinksTask that crawls one level deep from url and
  # checks the given [pattern, expected link] pairs.
  def define_task(name, patterns, url)
    Magellan::Rake::ExpectedLinksTask.new(name) do |t|
      t.explore_depth = 1
      t.patterns_and_expected_links = patterns
      t.origin_url = url
    end
  end

  def create_result(url,status_code)
    Magellan::Explorer.create_result(url,url,status_code, [],"text/html")
  end

  # Names of all tasks known to the rake application under test.
  def tasks
    @rake.tasks.map { |task| task.name }
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Examples for Magellan::ExpectedLinksTracker. Every tracker here is built
# from a single rule: a url pattern paired with the link '/about_us.html'.
describe Magellan::ExpectedLinksTracker do

  it "should create an error message containing the offending url and the expected link" do
    tracker = tracker_for(/.*/)
    tracker.update(Time.now, html_result('/fozo', "/bar", []))
    tracker.errors.first.should include('/fozo')
    tracker.errors.first.should include('/about_us.html')
  end

  it "should be able specify all resource should link to something" do
    tracker = tracker_for(/.*/)
    tracker.update(Time.now, html_result('/zoro', '/zoro', ['/about_us.html']))
    tracker.has_errors?.should be_false
    tracker.update(Time.now, html_result('/zoro', '/zoro', ['/about_fail_us.html']))
    tracker.has_errors?.should be_true
  end

  it "should only apply rules if they apply to source url" do
    tracker = tracker_for(/foo\.html/)
    tracker.update(Time.now, html_result('/zoro', '/zoro', ['/about_fail_us.html']))
    tracker.has_errors?.should be_false
    tracker.update(Time.now, html_result('/foo.html', '/zoro', ['/about_fail_us.html']))
    tracker.has_errors?.should be_true
  end

  it "should only apply rules if they apply to destination url" do
    tracker = tracker_for(/foo\.html/)
    tracker.update(Time.now, html_result('/zooo', '/zoro', ['/about_fail_us.html']))
    tracker.has_errors?.should be_false
    tracker.update(Time.now, html_result('/zooo', '/foo.html', ['/about_fail_us.html']))
    tracker.has_errors?.should be_true
  end

  it "should know if a expectation was never met" do
    tracker = tracker_for(/foo\.html/)
    tracker.update(Time.now, html_result('/zooo', '/zoro', ['/about_fail_us.html']))
    tracker.unmet_expecations?.should be_true
    tracker.update(Time.now, html_result('/foo.html', '/foo.html', ['/about_fail_us.html']))
    tracker.unmet_expecations?.should be_false
  end

  it "should provide a meaningful error message around unmet expectations" do
    tracker = tracker_for(/foo\.html/)
    tracker.update(Time.now, html_result('/zooo', '/zoro', ['/about_fail_us.html']))
    tracker.unmet_expecations_messages.should include(/foo\.html/.to_s)
  end

  it "should return failed if there are unmet expectations" do
    tracker = tracker_for(/foo\.html/)
    tracker.update(Time.now, html_result('/zooo', '/zoro', ['/about_fail_us.html']))
    tracker.failed?.should be_true
    tracker.update(Time.now, html_result('/foo.html', '/zoro', ['/about_us.html']))
    tracker.failed?.should be_false
  end

  it "should return failed if there are failed expectations" do
    tracker = tracker_for(/.*/)
    tracker.update(Time.now, html_result('/zoro', '/zoro', ['/about_us.html']))
    tracker.failed?.should be_false
    tracker.update(Time.now, html_result('/fozo', "/bar", []))
    tracker.failed?.should be_true
  end

  it "should ignore the result if it is not a html content type" do
    tracker = tracker_for(/.*/)
    tracker.update(Time.now, html_result('/zoro', '/zoro', ['/about_us.html']))
    tracker.update(Time.now, Magellan::Result.new('200', '/fozo', "/bar", [], "application/javascript"))
    tracker.failed?.should be_false
  end

  it "should update the observer with a pass" do
    tracker = tracker_for(/.*/)
    tracker.add_observer(Magellan::Logger.new)
    $stdout.expects(:putc).with('.')
    tracker.update(Time.now, html_result('/zoro', '/zoro', ['/about_us.html']))
  end

  # Previously this example reused the "with a pass" description even though
  # it expects the observer to print 'F'.
  it "should update the observer with a fail" do
    tracker = tracker_for(/.*/)
    tracker.add_observer(Magellan::Logger.new)
    $stdout.expects(:putc).with('F')
    tracker.update(Time.now, html_result('/zoro', '/zoro', ['/fail_about_us.html']))
  end

  # Builds a tracker with one rule pairing +pattern+ with '/about_us.html'.
  def tracker_for(pattern)
    Magellan::ExpectedLinksTracker.new([[pattern, '/about_us.html']])
  end

  # Builds a '200' result with a "text/html" content type from the given
  # source url, destination url and linked resources.
  def html_result(url, destination_url, linked_resources)
    Magellan::Result.new('200', url, destination_url, linked_resources, "text/html")
  end

end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Examples for Magellan::Explorer.
# NOTE(review): only the timeout example stubs WWW::Mechanize#get; the other
# examples pass real urls with nothing stubbed, so they presumably need
# network access and depend on the current content of those sites - confirm.
describe Magellan::Explorer do

  it "should find other js resources" do
    result = Magellan::Explorer.new(['http://canrailsscale.com/'],links_to_explore).explore
    result.first.absolute_linked_resources.should include('http://pagead2.googlesyndication.com/pagead/show_ads.js')
  end

  # Was "should foo": the example checks the result produced when the
  # underlying get raises Timeout::Error.
  it "should report a 505 status code when the request times out" do
    WWW::Mechanize.any_instance.expects(:get).raises(Timeout::Error)
    result = Magellan::Explorer.new(['http://canrailsscale.com/'],links_to_explore).explore
    result.first.status_code.should eql('505')
    result.first.url.should eql('http://canrailsscale.com/')
  end

  it "should have one result for one url" do
    result = Magellan::Explorer.new(['http://www.google.com/'],links_to_explore).explore
    result.size.should eql(1)
  end

  it "should have two results for two urls" do
    result = Magellan::Explorer.new(['http://www.google.com/','http://www.apple.com/'],links_to_explore).explore
    result.size.should eql(2)
  end

  it "should find other pages to explore via a href" do
    result = Magellan::Explorer.new('http://www.google.com/',links_to_explore).explore
    result.first.absolute_linked_resources.should include('http://video.google.com/?hl=en&tab=wv')
  end

  it "should translate relative urls to absolute ones" do
    result = Magellan::Explorer.new('http://www.google.com/',links_to_explore).explore
    result.first.absolute_linked_resources.should include('http://www.google.com/intl/en/about.html')
  end

  it "should report non successful status codes" do
    result = Magellan::Explorer.new('http://www.google.com/dfkjaslfkjaslfkj.html',links_to_explore).explore
    result.first.status_code.should eql("404")
  end

  it "should not get any links if it not a text/xhtml file" do
    result = Magellan::Explorer.new("http://jqueryjs.googlecode.com/files/jquery-1.3.2.min.js",links_to_explore).explore
    result.first.absolute_linked_resources.should be_empty
  end

  it "should update url if redirected" do
    result = Magellan::Explorer.new("http://www.thoughtworks.com/mingle",links_to_explore).explore
    result.first.destination_url.should eql("http://studios.thoughtworks.com/mingle-agile-project-management")
  end

  it "should return source url as destination url if a error occurs" do
    result = Magellan::Explorer.new("http://www.google.com/dfkjaslfkjaslfkj.html",links_to_explore).explore
    result.first.destination_url.should eql("http://www.google.com/dfkjaslfkjaslfkj.html")
  end

  # No expectation is set here; the example only fails if explore_a raises.
  it "should be able to explore a url" do
    Magellan::Explorer.new('',links_to_explore).explore_a("http://www.yahoo.com")
  end

  it "should be able to go from http to https" do
    result = Magellan::Explorer.new("http://mail.yahoo.com",links_to_explore).explore
    result.first.destination_url.starts_with?("https://").should be_true
  end

  it "should be able to crawl ftp based links"

  # Tag/attribute pairs handed to every Explorer built in these examples.
  def links_to_explore
    [["a","href"],["script","src"],["img","src"]]
  end
end
|
data/spec/logger_spec.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Examples for Magellan::Logger: the second argument to update decides which
# character is written to $stdout with putc.
describe Magellan::Logger do
  before :each do
    @logger = Magellan::Logger.new
  end

  it "should put a . for a pass" do
    $stdout.expects(:putc).with('.')
    passed = true
    @logger.update(Time.now, passed)
  end

  it "should put a F for a fail" do
    $stdout.expects(:putc).with('F')
    passed = false
    @logger.update(Time.now, passed)
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Examples for the links_to_other_documents extension on
# WWW::Mechanize::Page, exercised with small inline html fragments.
describe "WWW::Mechanize::Page Extensions" do
  # Tag/attribute pairs passed to links_to_other_documents in every example.
  LINKS = [["a","href"],["script","src"],["img","src"]]

  it "should not return nil for script tags without src attributes" do
    doc = page_for("<script class=foo>something</script>")
    doc.links_to_other_documents(LINKS).should be_empty
  end

  it "should find links based on script tags with src attributes" do
    doc = page_for("<script class=foo src='foozor'>something</script>")
    links_to_other_documents = doc.links_to_other_documents(LINKS)
    links_to_other_documents.size.should eql(1)
    links_to_other_documents.first.to_s.should eql("foozor")
  end

  it "should be able to get two script sources" do
    doc = page_for("<body><script class=foo src='foozor'>something</script><script class=foo src='fdsajfkajf'>something</script></body>")
    links_to_other_documents = doc.links_to_other_documents(LINKS)
    links_to_other_documents.size.should eql(2)
  end

  it "should find links based on a tags with href attributes" do
    doc = page_for("<a class=foo href='bozo'>something</a>")
    links_to_other_documents = doc.links_to_other_documents(LINKS)
    links_to_other_documents.size.should eql(1)
    links_to_other_documents.first.to_s.should eql("bozo")
  end

  it "should find links based on img tags with src attributes" do
    doc = page_for("<img class=foo src='ohno' alt='whatever' />")
    links_to_other_documents = doc.links_to_other_documents(LINKS)
    links_to_other_documents.size.should eql(1)
    links_to_other_documents.first.to_s.should eql("ohno")
  end

  # The old description claimed links were found; the example actually
  # checks that an anchor without an href produces no links.
  it "should not return nil for a tags without href attributes" do
    doc = page_for("<a class=foo>something</a>")
    doc.links_to_other_documents(LINKS).should be_empty
  end

  # Builds a page from +html+ with a nil uri and a "text/html" content-type
  # header, exactly as each example previously did inline.
  def page_for(html)
    WWW::Mechanize::Page.new(nil,{'content-type' => "text/html"},html)
  end

end
|
data/spec/result_spec.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Examples for how Magellan::Result#absolute_linked_resources turns linked
# resources into absolute urls.
describe Magellan::Result do

  it "should not remove fragments when converting to absolute urls" do
    result = Magellan::Result.new(
      "200",
      "http://www.google.com/index.html",
      "http://www.google.com/index.html",
      ["/index.html#foo"],
      "foo"
    )
    absolute_urls = result.absolute_linked_resources
    absolute_urls.should include("http://www.google.com/index.html#foo")
  end

  it "should use destination_url to build new absolute urls" do
    # The second and third arguments differ here; the expected url is built
    # from the third one.
    result = Magellan::Result.new(
      "200",
      "http://www.google.com/bob.html",
      "http://www.foo.com/bob.html",
      ["/index.html"],
      "foo"
    )
    absolute_urls = result.absolute_linked_resources
    absolute_urls.should include("http://www.foo.com/index.html")
  end

end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'spec'
|
3
|
+
require 'mocha'
|
4
|
+
require File.dirname(__FILE__) + '/../config/vendorized_gems'
|
5
|
+
|
6
|
+
# Put the project's lib directory (a sibling of this spec directory) at the
# front of the load path, unless it is already listed there.
spec_dir = File.dirname(__FILE__)
lib_path = File.expand_path("#{spec_dir}/../lib")
unless $LOAD_PATH.include?(lib_path)
  $LOAD_PATH.unshift(lib_path)
end

# Use mocha as the mocking framework for every example.
Spec::Runner.configure { |config| config.mock_with(:mocha) }
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Examples for the String extensions to_absolute_url and remove_fragment.
describe "String Extensions" do

  it "should convert relative urls to absolute" do
    input = '/Test_Automation_Framework/chrome/common/js/trac.js'
    input.to_absolute_url('http://www.google.com').should eql('http://www.google.com/Test_Automation_Framework/chrome/common/js/trac.js')
  end

  it "should remove any relative path from original url" do
    input = '/foo/trac.js'
    input.to_absolute_url('http://www.google.com/something/index.html').should eql('http://www.google.com/foo/trac.js')
  end

  it "should merge urls correctly with dots" do
    input = '../foo/trac.js'
    input.to_absolute_url('http://www.google.com/something/index.html').should eql('http://www.google.com/foo/trac.js')
  end

  it "should do nothing to absolute http urls" do
    input = 'http://www.apple.com'
    input.to_absolute_url('http://www.google.com').should eql('http://www.apple.com')
  end

  # The description used to read "absolute to relative", which is the
  # reverse of the conversion exercised here.
  it "should not put double slashes when converting relative to absolute" do
    input = "/intl/en/about.html"
    input.to_absolute_url('http://www.google.com/').should eql('http://www.google.com/intl/en/about.html')
  end

  it "should do nothing to absolute https urls" do
    input = 'https://www.apple.com'
    input.to_absolute_url('http://www.google.com').should eql('https://www.apple.com')
  end

  it "should translate relative https urls to absolute" do
    input = "/intl/en/about.html"
    input.to_absolute_url('https://www.google.com/').should eql('https://www.google.com/intl/en/about.html')
  end

  it "should translate relative urls to absolute ones" do
    "/intl/en/about.html".to_absolute_url("http://www.google.com").should eql('http://www.google.com/intl/en/about.html')
  end

  it "should not translate absolute urls" do
    "http://video.google.com/foo/about.html".to_absolute_url("http://www.google.com").should eql("http://video.google.com/foo/about.html")
  end

  it "should return string itself if uri parse fails" do
    "something not a url".to_absolute_url("http://www.google.com").should eql("something not a url")
  end

  it "should chomp the fragment portion off the url" do
    "http://video.google.com/foo/about.html#sdkfjskajflsajf".remove_fragment.should eql("http://video.google.com/foo/about.html")
  end

  it "should strip spaces off of the input url" do
    input = ' http://www.apple.com'
    input.to_absolute_url('http://www.google.com').should eql('http://www.apple.com')
  end

  it "should correctly join urls that are strictly query params" do
    input = '?foo=bar'
    input.to_absolute_url('http://www.google.com/index.html?foo=zoro').should eql('http://www.google.com/index.html?foo=bar')
  end

end
|
metadata
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: magellan
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nolan Evans
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-04-06 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: mechanize
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: activesupport
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
description: TODO
|
36
|
+
email: nolane@gmail.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- README
|
43
|
+
files:
|
44
|
+
- VERSION.yml
|
45
|
+
- lib/magellan
|
46
|
+
- lib/magellan/broken_link_tracker.rb
|
47
|
+
- lib/magellan/cartographer.rb
|
48
|
+
- lib/magellan/expected_links_tracker.rb
|
49
|
+
- lib/magellan/explorer.rb
|
50
|
+
- lib/magellan/extensions
|
51
|
+
- lib/magellan/extensions/array.rb
|
52
|
+
- lib/magellan/extensions/mechanize_page.rb
|
53
|
+
- lib/magellan/extensions/string.rb
|
54
|
+
- lib/magellan/logger.rb
|
55
|
+
- lib/magellan/rake
|
56
|
+
- lib/magellan/rake/base_magellan_task.rb
|
57
|
+
- lib/magellan/rake/broken_link_task.rb
|
58
|
+
- lib/magellan/rake/expected_links_task.rb
|
59
|
+
- lib/magellan/result.rb
|
60
|
+
- lib/magellan.rb
|
61
|
+
- spec/array_spec.rb
|
62
|
+
- spec/broken_link_task_spec.rb
|
63
|
+
- spec/broken_link_tracker_spec.rb
|
64
|
+
- spec/cartographer_spec.rb
|
65
|
+
- spec/expected_links_task_spec.rb
|
66
|
+
- spec/expected_links_tracker_spec.rb
|
67
|
+
- spec/explorer_spec.rb
|
68
|
+
- spec/logger_spec.rb
|
69
|
+
- spec/mechanize_page_spec.rb
|
70
|
+
- spec/result_spec.rb
|
71
|
+
- spec/spec_helper.rb
|
72
|
+
- spec/string_extensions_spec.rb
|
73
|
+
- README
|
74
|
+
has_rdoc: true
|
75
|
+
homepage: http://github.com/nolman/magellan
|
76
|
+
post_install_message:
|
77
|
+
rdoc_options:
|
78
|
+
- --inline-source
|
79
|
+
- --charset=UTF-8
|
80
|
+
require_paths:
|
81
|
+
- lib
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: "0"
|
87
|
+
version:
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: "0"
|
93
|
+
version:
|
94
|
+
requirements: []
|
95
|
+
|
96
|
+
rubyforge_project: magellan
|
97
|
+
rubygems_version: 1.3.1
|
98
|
+
signing_key:
|
99
|
+
specification_version: 2
|
100
|
+
summary: A web testing framework that embraces the discoverable nature of the web
|
101
|
+
test_files: []
|
102
|
+
|