magellan 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +11 -0
- data/VERSION.yml +4 -0
- data/lib/magellan.rb +13 -0
- data/lib/magellan/broken_link_tracker.rb +30 -0
- data/lib/magellan/cartographer.rb +70 -0
- data/lib/magellan/expected_links_tracker.rb +55 -0
- data/lib/magellan/explorer.rb +45 -0
- data/lib/magellan/extensions/array.rb +10 -0
- data/lib/magellan/extensions/mechanize_page.rb +10 -0
- data/lib/magellan/extensions/string.rb +21 -0
- data/lib/magellan/logger.rb +8 -0
- data/lib/magellan/rake/base_magellan_task.rb +40 -0
- data/lib/magellan/rake/broken_link_task.rb +33 -0
- data/lib/magellan/rake/expected_links_task.rb +35 -0
- data/lib/magellan/result.rb +20 -0
- data/spec/array_spec.rb +15 -0
- data/spec/broken_link_task_spec.rb +64 -0
- data/spec/broken_link_tracker_spec.rb +67 -0
- data/spec/cartographer_spec.rb +176 -0
- data/spec/expected_links_task_spec.rb +68 -0
- data/spec/expected_links_tracker_spec.rb +87 -0
- data/spec/explorer_spec.rb +72 -0
- data/spec/logger_spec.rb +15 -0
- data/spec/mechanize_page_spec.rb +44 -0
- data/spec/result_spec.rb +17 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/string_extensions_spec.rb +67 -0
- metadata +102 -0
data/README
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
Magellan: (alpha)
|
2
|
+
|
3
|
+
Currently the supported functionality is a rake task that crawls your website and finds any broken a[@href], img[@src], or script[@src] links.
|
4
|
+
|
5
|
+
Magellan::Rake::BrokenLinkTask.new do |t|
|
6
|
+
t.origin_url = "http://localhost:3000/"
|
7
|
+
t.explore_depth = 100
|
8
|
+
end
|
9
|
+
|
10
|
+
Assumptions:
|
11
|
+
This tool works best if you follow the rules of unobtrusive JavaScript and properly set the HTTP status code header.
|
data/VERSION.yml
ADDED
data/lib/magellan.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'magellan/cartographer'
|
2
|
+
require 'magellan/explorer'
|
3
|
+
require 'magellan/result'
|
4
|
+
require 'magellan/broken_link_tracker'
|
5
|
+
require 'magellan/expected_links_tracker'
|
6
|
+
require 'magellan/logger'
|
7
|
+
require 'magellan/extensions/string'
|
8
|
+
require 'magellan/extensions/array'
|
9
|
+
require 'magellan/extensions/mechanize_page'
|
10
|
+
|
11
|
+
module Magellan
|
12
|
+
VERSION = '0.0.1'
|
13
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Magellan
|
2
|
+
class BrokenLinkTracker
|
3
|
+
include Observable
|
4
|
+
|
5
|
+
attr_reader :broken_links
|
6
|
+
|
7
|
+
def initialize
|
8
|
+
@broken_links = []
|
9
|
+
@first_linked_from = {}
|
10
|
+
end
|
11
|
+
|
12
|
+
def update(time,result)
|
13
|
+
failed = result.status_code.starts_with?("5") || result.status_code.starts_with?("4")
|
14
|
+
@broken_links << result if failed
|
15
|
+
changed
|
16
|
+
notify_observers(Time.now, !failed)
|
17
|
+
result.absolute_linked_resources.each do |linked_resource|
|
18
|
+
@first_linked_from[linked_resource] = result.url if !@first_linked_from.has_key?(linked_resource)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def failed?
|
23
|
+
!@broken_links.empty?
|
24
|
+
end
|
25
|
+
|
26
|
+
def failure_message
|
27
|
+
@broken_links.map{|broken_link| "#{broken_link.url} first linked from: #{@first_linked_from[broken_link.url]} returned: #{broken_link.status_code}"}.join("\n")
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'activesupport'
|
2
|
+
require 'observer'
|
3
|
+
|
4
|
+
module Magellan
|
5
|
+
class Cartographer
|
6
|
+
include Observable
|
7
|
+
|
8
|
+
def initialize(settings)
|
9
|
+
@origin_url = settings[:origin_url]
|
10
|
+
@known_urls = settings[:ignored_urls]
|
11
|
+
@domains = settings[:domains].map {|domain| URI.parse(domain)}
|
12
|
+
@depth_to_explore = settings[:depth_to_explore]
|
13
|
+
@links_we_want_to_explore = settings[:links_to_explore]
|
14
|
+
@trace = settings[:trace]
|
15
|
+
end
|
16
|
+
|
17
|
+
def crawl
|
18
|
+
recursive_explore([@origin_url],1)
|
19
|
+
end
|
20
|
+
|
21
|
+
def recursive_explore(urls,depth)
|
22
|
+
if i_am_not_too_deep?(depth)
|
23
|
+
$stdout.puts "exploring:\n#{urls.join("\n")}" if @trace
|
24
|
+
results = Explorer.new(urls,@links_we_want_to_explore).explore
|
25
|
+
results.each do |result|
|
26
|
+
changed
|
27
|
+
notify_observers(Time.now, result)
|
28
|
+
@known_urls << result.url.remove_fragment
|
29
|
+
@known_urls << result.destination_url.remove_fragment
|
30
|
+
remove_javascript_and_print_warning result
|
31
|
+
end
|
32
|
+
|
33
|
+
all_urls = results.map {|result| result.absolute_linked_resources }.flatten
|
34
|
+
all_urls.uniq!
|
35
|
+
#TODO: handle any other url parsing error
|
36
|
+
all_urls.delete_if { |url| !a_domain_we_care_about?(url)}
|
37
|
+
all_urls.delete_if { |url| i_have_seen_this_url_before?(url)}
|
38
|
+
all_urls.chunk(40).each do |result_chunk|
|
39
|
+
recursive_explore(result_chunk,depth+1)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
def i_have_seen_this_url_before?(url)
|
45
|
+
@known_urls.include?(url.remove_fragment)
|
46
|
+
end
|
47
|
+
|
48
|
+
def i_am_not_too_deep?(depth)
|
49
|
+
depth <= @depth_to_explore
|
50
|
+
end
|
51
|
+
|
52
|
+
def a_domain_we_care_about?(url)
|
53
|
+
begin
|
54
|
+
!@domains.select { |domain| URI.parse(url).host == domain.host }.empty?
|
55
|
+
rescue
|
56
|
+
!@domains.select { |domain| url.gsub(/https*:\/\//,'').starts_with?(domain.host) }.empty?
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def remove_javascript_and_print_warning(result)
|
61
|
+
result.linked_resources.delete_if do |linked_resource|
|
62
|
+
starts_with_javascript = linked_resource.downcase.starts_with?("javascript:")
|
63
|
+
#TODO: put this in the logger
|
64
|
+
#$stderr.puts "Found obtrusive javascript: #{linked_resource} on page #{result.url}" if starts_with_javascript
|
65
|
+
starts_with_javascript
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
70
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Magellan
|
2
|
+
class ExpectedLinksTracker
|
3
|
+
include Observable
|
4
|
+
attr_reader :errors
|
5
|
+
|
6
|
+
def initialize(expected_patterns)
|
7
|
+
@errors = []
|
8
|
+
@expected_patterns = expected_patterns
|
9
|
+
@evaluated_expectations = {}
|
10
|
+
end
|
11
|
+
|
12
|
+
def update(time,result)
|
13
|
+
if result.html_content?
|
14
|
+
patterns_that_apply(result).each do |pattern,expectation|
|
15
|
+
passed = result.linked_resources.include?(expectation)
|
16
|
+
changed
|
17
|
+
notify_observers(Time.now, passed)
|
18
|
+
@errors << "#{result.url} did not contain a link to #{expectation}" unless passed
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def patterns_that_apply(result)
|
24
|
+
res = @expected_patterns.select{|pattern,expecation| result.url =~ pattern || result.destination_url =~ pattern}
|
25
|
+
res.each { |expected_pattern| @evaluated_expectations[expected_pattern] = nil }
|
26
|
+
res
|
27
|
+
end
|
28
|
+
|
29
|
+
def has_errors?
|
30
|
+
!@errors.empty?
|
31
|
+
end
|
32
|
+
|
33
|
+
def unmet_expecations?
|
34
|
+
!unmet_expecations.empty?
|
35
|
+
end
|
36
|
+
|
37
|
+
def failed?
|
38
|
+
unmet_expecations? || has_errors?
|
39
|
+
end
|
40
|
+
|
41
|
+
def failure_message
|
42
|
+
unmet_expecations_messages << errors.join("\n")
|
43
|
+
end
|
44
|
+
|
45
|
+
def unmet_expecations_messages
|
46
|
+
message = "\n\n"
|
47
|
+
unmet_expecations.each {|pattern,unmet_expecation| message << "#{pattern} was never evaluted during the crawl\n"}
|
48
|
+
message
|
49
|
+
end
|
50
|
+
|
51
|
+
def unmet_expecations
|
52
|
+
@expected_patterns - @evaluated_expectations.keys
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'hpricot'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'ostruct'
|
4
|
+
|
5
|
+
module Magellan
|
6
|
+
class Explorer
|
7
|
+
UNKNOWN_CONTENT = "unknown"
|
8
|
+
def initialize(urls,links)
|
9
|
+
@links = links
|
10
|
+
@urls = urls
|
11
|
+
end
|
12
|
+
|
13
|
+
def explore
|
14
|
+
reqs = []
|
15
|
+
@urls.each do |url|
|
16
|
+
reqs.push Thread.new { explore_a(url) }
|
17
|
+
end
|
18
|
+
reqs.collect { |req| req.value }
|
19
|
+
end
|
20
|
+
|
21
|
+
def explore_a(url)
|
22
|
+
begin
|
23
|
+
agent = WWW::Mechanize.new
|
24
|
+
agent.user_agent = "Ruby/#{RUBY_VERSION}"
|
25
|
+
doc = agent.get(url)
|
26
|
+
destination_url = doc.uri.to_s
|
27
|
+
status_code = doc.code
|
28
|
+
#TODO: clean this up, this is very hacky, I would rather pass in a hpricot doc to create a result
|
29
|
+
if doc.respond_to?(:content_type) && doc.content_type.starts_with?("text/html")
|
30
|
+
Explorer.create_result(url, destination_url, status_code, doc.links_to_other_documents(@links),doc.content_type)
|
31
|
+
else
|
32
|
+
Explorer.create_result(url, destination_url, status_code, [], doc.respond_to?(:content_type) ? doc.content_type : UNKNOWN_CONTENT)
|
33
|
+
end
|
34
|
+
rescue WWW::Mechanize::ResponseCodeError => the_error
|
35
|
+
Explorer.create_result(url, url, the_error.response_code, [],UNKNOWN_CONTENT)
|
36
|
+
rescue Timeout::Error
|
37
|
+
Explorer.create_result(url, url, "505", [],UNKNOWN_CONTENT)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.create_result(url,destination_url,status_code,links,content_type)
|
42
|
+
Result.new(status_code,url,destination_url,links.map{|link| link.to_s},content_type)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
class WWW::Mechanize::Page
|
3
|
+
def links_to_other_documents(links_to_other_resources)
|
4
|
+
links_to_other_resources.map {|links_to_other_resource| get_attributes(links_to_other_resource.first,links_to_other_resource.last)}.flatten
|
5
|
+
end
|
6
|
+
|
7
|
+
def get_attributes(tag,attribute)
|
8
|
+
(self/tag).map{|alink| alink.attributes[attribute]}.compact
|
9
|
+
end
|
10
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'activesupport'
|
2
|
+
require 'open-uri'
|
3
|
+
class String
|
4
|
+
def to_absolute_url(origin_url)
|
5
|
+
begin
|
6
|
+
#BUG in URI.join? URI.join('http://www.google.com/index.html?foo=b','?foo=a') # => http://www.google.com/?foo=a
|
7
|
+
stripped = self.strip
|
8
|
+
if stripped.starts_with?('?')
|
9
|
+
origin_url.gsub(/\?.*/,'') + stripped
|
10
|
+
else
|
11
|
+
URI.join(origin_url,stripped).to_s
|
12
|
+
end
|
13
|
+
rescue
|
14
|
+
self
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def remove_fragment
|
19
|
+
self.gsub(/#.*/,'')
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
|
2
|
+
#TODO: this is not a good place to use a template method - violates Liskov substitution principle
|
3
|
+
module Magellan
|
4
|
+
module Rake
|
5
|
+
class BaseMagellanTask < ::Rake::TaskLib
|
6
|
+
attr_accessor :origin_url
|
7
|
+
attr_accessor :explore_depth
|
8
|
+
attr_accessor :ignored_urls
|
9
|
+
|
10
|
+
def initialize(name)
|
11
|
+
@ignored_urls = []
|
12
|
+
@name=name
|
13
|
+
yield self if block_given?
|
14
|
+
define
|
15
|
+
end
|
16
|
+
|
17
|
+
def define
|
18
|
+
desc description
|
19
|
+
task @name do
|
20
|
+
settings = {:origin_url => origin_url, :depth_to_explore => explore_depth, :domains => [origin_url],
|
21
|
+
:ignored_urls =>ignored_urls, :links_to_explore => links_to_explore, :trace => ENV['TRACE']}
|
22
|
+
cartographer = Magellan::Cartographer.new(settings)
|
23
|
+
observer = create_observer
|
24
|
+
observer.add_observer(Magellan::Logger.new)
|
25
|
+
cartographer.add_observer(observer)
|
26
|
+
cartographer.crawl
|
27
|
+
if observer.failed?
|
28
|
+
STDERR.puts observer.failure_message
|
29
|
+
exit 1
|
30
|
+
else
|
31
|
+
$stdout.puts "\n" + success_message
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/tasklib'
|
3
|
+
require 'magellan'
|
4
|
+
require 'magellan/rake/base_magellan_task'
|
5
|
+
|
6
|
+
module Magellan
|
7
|
+
module Rake
|
8
|
+
|
9
|
+
class BrokenLinkTask < BaseMagellanTask
|
10
|
+
def initialize(name="magellan:explore")
|
11
|
+
super(name)
|
12
|
+
end
|
13
|
+
|
14
|
+
def create_observer
|
15
|
+
Magellan::BrokenLinkTracker.new
|
16
|
+
end
|
17
|
+
|
18
|
+
def links_to_explore
|
19
|
+
[["a","href"],["script","src"],["img","src"]]
|
20
|
+
end
|
21
|
+
|
22
|
+
def description
|
23
|
+
"explore #{@origin_url} for broken links"
|
24
|
+
end
|
25
|
+
|
26
|
+
def success_message
|
27
|
+
"No broken links were found!"
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/tasklib'
|
3
|
+
require 'magellan'
|
4
|
+
require 'magellan/rake/base_magellan_task'
|
5
|
+
|
6
|
+
module Magellan
|
7
|
+
module Rake
|
8
|
+
|
9
|
+
class ExpectedLinksTask < BaseMagellanTask
|
10
|
+
attr_accessor :patterns_and_expected_links
|
11
|
+
|
12
|
+
def initialize(name="magellan:check_links")
|
13
|
+
super(name)
|
14
|
+
end
|
15
|
+
|
16
|
+
def description
|
17
|
+
"Explore #{@origin_url} and find check if all given patterns are matched"
|
18
|
+
end
|
19
|
+
|
20
|
+
def links_to_explore
|
21
|
+
[["a","href"]]
|
22
|
+
end
|
23
|
+
|
24
|
+
def create_observer
|
25
|
+
Magellan::ExpectedLinksTracker.new(@patterns_and_expected_links)
|
26
|
+
end
|
27
|
+
|
28
|
+
def success_message
|
29
|
+
"All expected links found!"
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Magellan
|
2
|
+
class Result
|
3
|
+
attr_reader :status_code,:url,:destination_url,:linked_resources
|
4
|
+
def initialize(status_code,url,destination_url,linked_resources,content_type)
|
5
|
+
@status_code = status_code
|
6
|
+
@url = url
|
7
|
+
@destination_url = destination_url
|
8
|
+
@linked_resources = linked_resources
|
9
|
+
@content_type = content_type
|
10
|
+
end
|
11
|
+
|
12
|
+
def absolute_linked_resources
|
13
|
+
absolute_links = linked_resources.map { |linked_resource| linked_resource.to_s.to_absolute_url(destination_url) }.compact
|
14
|
+
end
|
15
|
+
|
16
|
+
def html_content?
|
17
|
+
@content_type.starts_with?("text/html")
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
data/spec/array_spec.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
describe "Array Extensions" do
|
5
|
+
it "should be able to break up a array into chunks with a max size" do
|
6
|
+
[1,2,3,4,5].chunk(3).size.should eql(2)
|
7
|
+
[1,2,3,4,5].chunk(3).first.should eql([1,2,3])
|
8
|
+
[1,2,3,4,5].chunk(3).last.should eql([4,5])
|
9
|
+
end
|
10
|
+
it "should be able to break up a array into chunks with a max size" do
|
11
|
+
[1,2,3,4,5].chunk(1).size.should eql(5)
|
12
|
+
[1,2,3,4,5].chunk(1).first.should eql([1])
|
13
|
+
end
|
14
|
+
|
15
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
describe "Magellan BrokenLinkTask" do
|
5
|
+
|
6
|
+
before :all do
|
7
|
+
@file_name = File.dirname(__FILE__) + "/../lib/magellan/rake/broken_link_task.rb"
|
8
|
+
@rake = Rake::Application.new
|
9
|
+
Rake.application = @rake
|
10
|
+
end
|
11
|
+
|
12
|
+
before :each do
|
13
|
+
load @file_name
|
14
|
+
$stdout.stubs(:putc)
|
15
|
+
end
|
16
|
+
|
17
|
+
after :all do
|
18
|
+
Rake.application = nil
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should create a rake task" do
|
22
|
+
Magellan::Rake::BrokenLinkTask.new
|
23
|
+
tasks.include?("magellan:explore").should be_true
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should explore when task is invoked" do
|
27
|
+
Magellan::Rake::BrokenLinkTask.new("invoke_task") do |t|
|
28
|
+
t.explore_depth = 1
|
29
|
+
t.origin_url = "http://localhost:8080"
|
30
|
+
end
|
31
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with("http://localhost:8080").returns(create_result("http://localhost:8080","200"))
|
32
|
+
$stdout.expects(:puts) #passed message
|
33
|
+
@rake.invoke_task("invoke_task")
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should raise exception when broken links are found" do
|
37
|
+
Magellan::Rake::BrokenLinkTask.new("exception_task") do |t|
|
38
|
+
t.explore_depth = 1
|
39
|
+
t.origin_url = "http://canrailsscale.com"
|
40
|
+
end
|
41
|
+
$stderr.expects(:puts)
|
42
|
+
Magellan::Explorer.any_instance.stubs(:explore_a).once.with("http://canrailsscale.com").returns(create_result("http://canrailsscale.com","500"))
|
43
|
+
lambda {@rake.invoke_task("exception_task")}.should raise_error
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should attach logger" do
|
47
|
+
Magellan::Rake::BrokenLinkTask.new("logger_test") do |t|
|
48
|
+
t.explore_depth = 1
|
49
|
+
t.origin_url = "http://canrailsscale.com"
|
50
|
+
end
|
51
|
+
$stderr.stubs(:puts)
|
52
|
+
Magellan::Logger.any_instance.expects(:update)
|
53
|
+
Magellan::Explorer.any_instance.stubs(:explore_a).once.with("http://canrailsscale.com").returns(create_result("http://canrailsscale.com","500"))
|
54
|
+
lambda {@rake.invoke_task("logger_test")}.should raise_error
|
55
|
+
end
|
56
|
+
|
57
|
+
def create_result(url,status_code)
|
58
|
+
Magellan::Explorer.create_result(url,url,status_code, [],"foo")
|
59
|
+
end
|
60
|
+
|
61
|
+
def tasks
|
62
|
+
@rake.tasks.collect{|task| task.name }
|
63
|
+
end
|
64
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
describe Magellan::BrokenLinkTracker do
|
5
|
+
|
6
|
+
it "should not report broken links if there are none" do
|
7
|
+
broken_link_tracker = Magellan::BrokenLinkTracker.new
|
8
|
+
broken_link_tracker.update(Time.now,create_success_result('http://www.foo.com',['jalskdjflakjsf']))
|
9
|
+
broken_link_tracker.failed?.should be_false
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should record links by absolute_url" do
|
13
|
+
broken_link_tracker = Magellan::BrokenLinkTracker.new
|
14
|
+
broken_link_tracker.update(Time.now,create_success_result('http://www.bozo.com/foople.html',['/apples.html']))
|
15
|
+
broken_link_tracker.update(Time.now,create_result('http://www.bozo.com/apples.html',"404",[]))
|
16
|
+
broken_link_tracker.failure_message.should include("http://www.bozo.com/foople.html")
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should only record broken links errors" do
|
20
|
+
broken_link_tracker = Magellan::BrokenLinkTracker.new
|
21
|
+
broken_link_tracker.update(Time.now,create_success_result('http://www.foo.com',['http://www.google.com']))
|
22
|
+
broken_link_tracker.update(Time.now,create_result('http://www.foo.com/404',"404",[]))
|
23
|
+
broken_link_tracker.failed?.should be_true
|
24
|
+
broken_link_tracker.broken_links.size.should eql(1)
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should record 4** errors" do
|
28
|
+
broken_link_tracker = Magellan::BrokenLinkTracker.new
|
29
|
+
broken_link_tracker.update(Time.now,create_result('http://www.foo.com/404',"404",[]))
|
30
|
+
broken_link_tracker.broken_links.first.status_code.should eql('404')
|
31
|
+
end
|
32
|
+
|
33
|
+
it "have url and status code in the error message" do
|
34
|
+
broken_link_tracker = Magellan::BrokenLinkTracker.new
|
35
|
+
broken_link_tracker.update(Time.now,create_result('broke url',"404",[]))
|
36
|
+
broken_link_tracker.failure_message.should include('broke url')
|
37
|
+
broken_link_tracker.failure_message.should include("404")
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should record 5** errors" do
|
41
|
+
broken_link_tracker = Magellan::BrokenLinkTracker.new
|
42
|
+
broken_link_tracker.update(Time.now,create_result('fooz',"500",[]))
|
43
|
+
broken_link_tracker.broken_links.first.status_code.should eql('500')
|
44
|
+
end
|
45
|
+
|
46
|
+
def create_success_result(url,linked_resources)
|
47
|
+
create_result(url,"200",linked_resources)
|
48
|
+
end
|
49
|
+
|
50
|
+
def create_result(url,status_code, linked_resources)
|
51
|
+
Magellan::Result.new(status_code,url,url,linked_resources,"foo")
|
52
|
+
end
|
53
|
+
|
54
|
+
it "should update the observer with a pass" do
|
55
|
+
tracker = Magellan::BrokenLinkTracker.new
|
56
|
+
tracker.add_observer(Magellan::Logger.new)
|
57
|
+
$stdout.expects(:putc).with('.')
|
58
|
+
tracker.update(Time.now,Magellan::Result.new('200','/zoro','/zoro',['/about_us.html'],"text/html"))
|
59
|
+
end
|
60
|
+
|
61
|
+
it "should update the observer with a pass" do
|
62
|
+
tracker = Magellan::BrokenLinkTracker.new
|
63
|
+
tracker.add_observer(Magellan::Logger.new)
|
64
|
+
$stdout.expects(:putc).with('F')
|
65
|
+
tracker.update(Time.now,Magellan::Result.new('404','/zoro','/zoro',['/fail_about_us.html'],"text/html"))
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,176 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
describe Magellan::Cartographer do
|
5
|
+
|
6
|
+
it "should not visit the same url more then once" do
|
7
|
+
origin_url = "http://www.google.com"
|
8
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(create_success_result(['http://www.google.com']))
|
9
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
10
|
+
cartographer.crawl
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should not visit the origin url more then once if it finds a link with a finishing /" do
|
14
|
+
pending
|
15
|
+
origin_url = "http://www.google.com"
|
16
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(create_success_result(['http://www.google.com/']))
|
17
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
18
|
+
cartographer.crawl
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should try to explore urls in the domain we care about that have non ascii characters in them" do
|
22
|
+
origin_url = "http://www.reddit.com"
|
23
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(create_success_result(["http://www.reddit.com/r/science/comments/87dk7/cold_fusion_is_a_pipe_dream_but_μcatalyzed_cool/","http://www.domainwedontcareabout.com/μ"]))
|
24
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with("http://www.reddit.com/r/science/comments/87dk7/cold_fusion_is_a_pipe_dream_but_μcatalyzed_cool/").returns(create_success_result([]))
|
25
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
26
|
+
cartographer.crawl
|
27
|
+
end
|
28
|
+
|
29
|
+
it "should not visit the same url more then once if they differ by fragment id" do
|
30
|
+
origin_url = "http://www.google.com"
|
31
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(create_success_result(['http://www.google.com#foo']))
|
32
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
33
|
+
cartographer.crawl
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should notify observers when a result comes in" do
|
37
|
+
origin_url = "http://www.google.com"
|
38
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(create_success_result(['http://www.google.com']))
|
39
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
40
|
+
foo = Object.new
|
41
|
+
foo.expects(:update)
|
42
|
+
cartographer.add_observer(foo)
|
43
|
+
cartographer.crawl
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should notify observers everytime a result comes in" do
|
47
|
+
origin_url = "http://www.google.com"
|
48
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(create_success_result(['http://www.google.com/foo.html','http://www.google.com/bar.html']))
|
49
|
+
Magellan::Explorer.any_instance.expects(:explore_a).with('http://www.google.com/foo.html').returns(create_success_result([]))
|
50
|
+
Magellan::Explorer.any_instance.expects(:explore_a).with('http://www.google.com/bar.html').returns(create_success_result([]))
|
51
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
52
|
+
foo = Object.new
|
53
|
+
foo.expects(:update).times(3)
|
54
|
+
cartographer.add_observer(foo)
|
55
|
+
cartographer.crawl
|
56
|
+
end
|
57
|
+
|
58
|
+
it "should explore other linked resources" do
|
59
|
+
origin_url = "http://www.google.com"
|
60
|
+
Magellan::Explorer.any_instance.expects(:explore_a).with(origin_url).returns(create_success_result(['http://www.google.com/foo.html']))
|
61
|
+
Magellan::Explorer.any_instance.expects(:explore_a).with('http://www.google.com/foo.html').returns(create_success_result([]))
|
62
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
63
|
+
cartographer.crawl
|
64
|
+
end
|
65
|
+
|
66
|
+
it "should not explore ignored urls" do
|
67
|
+
origin_url = "http://www.google.com"
|
68
|
+
Magellan::Explorer.any_instance.expects(:explore_a).with(origin_url).returns(create_success_result(['http://www.google.com/foo.html','http://www.google.com/ignoreme.html']))
|
69
|
+
Magellan::Explorer.any_instance.expects(:explore_a).with('http://www.google.com/foo.html').returns(create_success_result([]))
|
70
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url,3,[origin_url],['http://www.google.com/ignoreme.html']))
|
71
|
+
cartographer.crawl
|
72
|
+
end
|
73
|
+
|
74
|
+
it "should not explore the same url more then once" do
|
75
|
+
origin_url = "http://www.google.com"
|
76
|
+
Magellan::Explorer.any_instance.expects(:explore_a).with(origin_url).returns(create_success_result(['http://www.google.com/foo.html','http://www.google.com/foo.html']))
|
77
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with('http://www.google.com/foo.html').returns(create_success_result([]))
|
78
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
79
|
+
cartographer.crawl
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should be able to specify crawlable domains" do
|
83
|
+
origin_url = "http://www.google.com"
|
84
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(create_success_result(['http://www.foo.com']))
|
85
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with('http://www.foo.com').returns(create_success_result(['http://www.bar.com']))
|
86
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url, 5,['http://www.google.com','http://www.foo.com']))
|
87
|
+
cartographer.crawl
|
88
|
+
end
|
89
|
+
|
90
|
+
it "should explore relative links" do
|
91
|
+
origin_url = "http://www.google.com"
|
92
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(create_success_result(['http://www.google.com/foo.html']))
|
93
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with('http://www.google.com/foo.html').returns(create_success_result(['/foo2.html']))
|
94
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with('http://www.google.com/foo2.html').returns(create_success_result([]))
|
95
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
96
|
+
cartographer.crawl
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should go n layers deep into a site" do
|
100
|
+
origin_url = "http://www.google.com"
|
101
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(create_success_result(['http://www.google.com/foo.html']))
|
102
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with('http://www.google.com/foo.html').returns(create_success_result(['http://www.google.com/foo2.html']))
|
103
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with('http://www.google.com/foo2.html').returns(create_success_result(['http://www.google.com/foo3.html']))
|
104
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url,3))
|
105
|
+
cartographer.crawl
|
106
|
+
end
|
107
|
+
|
108
|
+
it "should use host to determine if we are in a allowed domain" do
|
109
|
+
origin_url = "http://www.google.com/jskfjlsajfd"
|
110
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
111
|
+
cartographer.a_domain_we_care_about?("http://www.google.com/index.html").should be_true
|
112
|
+
end
|
113
|
+
|
114
|
+
it "should not explore js urls and print warnings if they are found, obtrusive javascript is bad mmkay" do
|
115
|
+
origin_url = "http://www.google.com"
|
116
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(create_success_result(["javascript:bookmarksite('ThoughtWorks Studios', 'http://studios.thoughtworks.com')",'http://www.google.com/foo']))
|
117
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with('http://www.google.com/foo').returns(create_success_result([]))
|
118
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
119
|
+
cartographer.crawl
|
120
|
+
end
|
121
|
+
|
122
|
+
#<a alex.hal9000@gmail.com="" href="mailto:PWang@thoughtworks.com,">PWang@thoughtworks.com, alex.hal9000@gmail.com</a>
|
123
|
+
|
124
|
+
it "should not die on checking the domain on invalid urls" do
|
125
|
+
origin_url = "http://www.google.com/adsfaf"
|
126
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
127
|
+
cartographer.a_domain_we_care_about?("mailto:PWang@thoughtworks.com,").should be_false
|
128
|
+
end
|
129
|
+
|
130
|
+
it "should not explore mailto urls" do
|
131
|
+
origin_url = "http://www.google.com/adsfaf"
|
132
|
+
Magellan::Explorer.any_instance.expects(:explore_a).once.with(origin_url).returns(create_success_result(["mailto:foo"]))
|
133
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url))
|
134
|
+
cartographer.crawl
|
135
|
+
end
|
136
|
+
|
137
|
+
it "should puts out urls if the trace is enabled" do
|
138
|
+
origin_url = "http://www.google.com/adsfaf"
|
139
|
+
Magellan::Explorer.any_instance.stubs(:explore_a).once.with(origin_url).returns(create_success_result([]))
|
140
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url).merge( {:trace=> true}))
|
141
|
+
$stdout.expects(:puts).with {|value| value.include?(origin_url)}
|
142
|
+
cartographer.crawl
|
143
|
+
end
|
144
|
+
|
145
|
+
it "should not puts if the trace is disabled" do
|
146
|
+
origin_url = "http://www.google.com/adsfaf"
|
147
|
+
Magellan::Explorer.any_instance.stubs(:explore_a).once.with(origin_url).returns(create_success_result([]))
|
148
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url).merge( {:trace=> false}))
|
149
|
+
$stdout.expects(:puts).never
|
150
|
+
cartographer.crawl
|
151
|
+
end
|
152
|
+
|
153
|
+
it "should record the source and the destination url in known urls" do
|
154
|
+
origin_url = "http://studios.thoughtworks.com/cruise"
|
155
|
+
cartographer = Magellan::Cartographer.new(settings(origin_url, 1))
|
156
|
+
cartographer.crawl
|
157
|
+
cartographer.i_have_seen_this_url_before?(origin_url).should be_true
|
158
|
+
cartographer.i_have_seen_this_url_before?("http://studios.thoughtworks.com/cruise-continuous-integration").should be_true
|
159
|
+
end
|
160
|
+
|
161
|
+
it "should go through a entire site if layers to explore is set to -1"
|
162
|
+
it "should explore n layers into external domains"
|
163
|
+
|
164
|
+
def create_success_result(linked_resources)
|
165
|
+
create_result("200",linked_resources)
|
166
|
+
end
|
167
|
+
|
168
|
+
def settings(origin_url,depth=5,domains = [origin_url], ignored_urls=[])
|
169
|
+
{:origin_url => origin_url, :depth_to_explore => depth, :domains => domains, :ignored_urls =>ignored_urls, :links_to_explore => [["a","href"]] }
|
170
|
+
end
|
171
|
+
|
172
|
+
def create_result(status_code, linked_resources)
|
173
|
+
Magellan::Result.new(status_code,"http://www.google.com","http://www.google.com",linked_resources,"text/html")
|
174
|
+
end
|
175
|
+
|
176
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# Exercises Magellan::Rake::ExpectedLinksTask inside a dedicated Rake application.
# Every example that invokes a task replaces Magellan::Explorer#explore_a with a
# canned result built by create_result.
describe "Magellan ExpectedLinksTask" do

  before(:all) do
    @file_name = "#{File.dirname(__FILE__)}/../lib/magellan/rake/expected_links_task.rb"
    @rake = Rake::Application.new
    Rake.application = @rake
  end

  before(:each) do
    $stdout.stubs(:putc)
    load @file_name
  end

  after(:all) do
    Rake.application = nil
  end

  it "should create a rake task" do
    Magellan::Rake::ExpectedLinksTask.new
    tasks.include?("magellan:check_links").should be_true
  end

  it "should explore when task is invoked" do
    define_links_task("some_task", [], "http://localhost:8080")
    $stdout.expects(:puts)
    canned_result = create_result("http://localhost:8080", "200")
    Magellan::Explorer.any_instance.expects(:explore_a).once.with("http://localhost:8080").returns(canned_result)
    @rake.invoke_task("some_task")
  end


  it "should notify a expected link tracker when a task is invoked" do
    define_links_task("invoke_expected_link_tracker", [], "http://localhost:8080")
    $stdout.expects(:puts)
    canned_result = create_result("http://localhost:8080", "200")
    Magellan::Explorer.any_instance.stubs(:explore_a).once.with("http://localhost:8080").returns(canned_result)
    Magellan::ExpectedLinksTracker.any_instance.expects(:update).once
    @rake.invoke_task("invoke_expected_link_tracker")
  end

  it "should fail the rake task if expected links did not exist or rules did not evaluate to be true" do
    define_links_task("exception_raising_task", [[/.*/, '/about_us.html']], "http://canrailsscale.com")
    $stderr.expects(:puts)
    canned_result = create_result("http://canrailsscale.com", "200")
    Magellan::Explorer.any_instance.stubs(:explore_a).once.with("http://canrailsscale.com").returns(canned_result)
    invocation = lambda { @rake.invoke_task("exception_raising_task") }
    invocation.should raise_error
  end

  # Defines an ExpectedLinksTask called +name+ with an explore depth of 1.
  # The attributes are assigned in the order depth, patterns, origin url.
  def define_links_task(name, patterns_and_expected_links, origin_url)
    Magellan::Rake::ExpectedLinksTask.new(name) do |task|
      task.explore_depth = 1
      task.patterns_and_expected_links = patterns_and_expected_links
      task.origin_url = origin_url
    end
  end

  # Canned text/html explorer result whose source and destination are both +url+
  # and which has no linked resources.
  def create_result(url, status_code)
    no_links = []
    Magellan::Explorer.create_result(url, url, status_code, no_links, "text/html")
  end

  # Names of every task registered with the spec's Rake application.
  def tasks
    @rake.tasks.map { |task| task.name }
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Specs for Magellan::ExpectedLinksTracker. Each rule is a [pattern, expected_link]
# pair; the examples feed results to the tracker through #update and inspect
# its error and failure state afterwards.
describe Magellan::ExpectedLinksTracker do

  it "should create an error message containing the offending url and the missing link" do
    tracker = tracker_for([/.*/, '/about_us.html'])
    visit(tracker, '/fozo', "/bar", [])
    tracker.errors.first.should include('/fozo')
    tracker.errors.first.should include('/about_us.html')
  end

  it "should be able to specify that all resources should link to something" do
    tracker = tracker_for([/.*/, '/about_us.html'])
    visit(tracker, '/zoro', '/zoro', ['/about_us.html'])
    tracker.has_errors?.should be_false
    visit(tracker, '/zoro', '/zoro', ['/about_fail_us.html'])
    tracker.has_errors?.should be_true
  end

  it "should only apply rules if they apply to source url" do
    tracker = tracker_for([/foo\.html/, '/about_us.html'])
    visit(tracker, '/zoro', '/zoro', ['/about_fail_us.html'])
    tracker.has_errors?.should be_false
    visit(tracker, '/foo.html', '/zoro', ['/about_fail_us.html'])
    tracker.has_errors?.should be_true
  end

  it "should only apply rules if they apply to destination url" do
    tracker = tracker_for([/foo\.html/, '/about_us.html'])
    visit(tracker, '/zooo', '/zoro', ['/about_fail_us.html'])
    tracker.has_errors?.should be_false
    visit(tracker, '/zooo', '/foo.html', ['/about_fail_us.html'])
    tracker.has_errors?.should be_true
  end

  # unmet_expecations? is spelled this way on purpose: it is the name these
  # specs call on the tracker.
  it "should know if an expectation was never met" do
    tracker = tracker_for([/foo\.html/, '/about_us.html'])
    visit(tracker, '/zooo', '/zoro', ['/about_fail_us.html'])
    tracker.unmet_expecations?.should be_true
    visit(tracker, '/foo.html', '/foo.html', ['/about_fail_us.html'])
    tracker.unmet_expecations?.should be_false
  end

  it "should provide a meaningful error message around unmet expectations" do
    tracker = tracker_for([/foo\.html/, '/about_us.html'])
    visit(tracker, '/zooo', '/zoro', ['/about_fail_us.html'])
    tracker.unmet_expecations_messages.should include(/foo\.html/.to_s)
  end

  it "should return failed if there are unmet expectations" do
    tracker = tracker_for([/foo\.html/, '/about_us.html'])
    visit(tracker, '/zooo', '/zoro', ['/about_fail_us.html'])
    tracker.failed?.should be_true
    visit(tracker, '/foo.html', '/zoro', ['/about_us.html'])
    tracker.failed?.should be_false
  end

  it "should return failed if there are failed expectations" do
    tracker = tracker_for([/.*/, '/about_us.html'])
    visit(tracker, '/zoro', '/zoro', ['/about_us.html'])
    tracker.failed?.should be_false
    visit(tracker, '/fozo', "/bar", [])
    tracker.failed?.should be_true
  end

  it "should ignore the result if it is not a html content type" do
    tracker = tracker_for([/.*/, '/about_us.html'])
    visit(tracker, '/zoro', '/zoro', ['/about_us.html'])
    visit(tracker, '/fozo', "/bar", [], "application/javascript")
    tracker.failed?.should be_false
  end

  it "should update the observer with a pass" do
    tracker = tracker_for([/.*/, '/about_us.html'])
    tracker.add_observer(Magellan::Logger.new)
    $stdout.expects(:putc).with('.')
    visit(tracker, '/zoro', '/zoro', ['/about_us.html'])
  end

  # Previously carried the same description as the passing example above.
  it "should update the observer with a fail" do
    tracker = tracker_for([/.*/, '/about_us.html'])
    tracker.add_observer(Magellan::Logger.new)
    $stdout.expects(:putc).with('F')
    visit(tracker, '/zoro', '/zoro', ['/fail_about_us.html'])
  end

  # Builds a tracker from one or more [pattern, expected_link] rules.
  def tracker_for(*rules)
    Magellan::ExpectedLinksTracker.new(rules)
  end

  # Feeds the tracker a status-200 result for +url+ / +destination_url+ with the
  # given linked resources; the content type defaults to text/html.
  def visit(tracker, url, destination_url, linked_resources, content_type = "text/html")
    result = Magellan::Result.new('200', url, destination_url, linked_resources, content_type)
    tracker.update(Time.now, result)
  end

end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Specs for Magellan::Explorer.
# NOTE(review): apart from the timeout example, nothing here stubs the HTTP
# layer, so these examples presumably need live access to the referenced
# sites — confirm.
describe Magellan::Explorer do

  it "should find other js resources" do
    result = explore(['http://canrailsscale.com/'])
    result.first.absolute_linked_resources.should include('http://pagead2.googlesyndication.com/pagead/show_ads.js')
  end

  # Was "should foo"; the example forces a Timeout::Error from Mechanize#get
  # and checks the resulting status code and url.
  it "should report a 505 status code when the request times out" do
    WWW::Mechanize.any_instance.expects(:get).raises(Timeout::Error)
    result = explore(['http://canrailsscale.com/'])
    result.first.status_code.should eql('505')
    result.first.url.should eql('http://canrailsscale.com/')
  end

  it "should have one result for one url" do
    result = explore(['http://www.google.com/'])
    result.size.should eql(1)
  end

  it "should have two results for two urls" do
    result = explore(['http://www.google.com/', 'http://www.apple.com/'])
    result.size.should eql(2)
  end

  it "should find other pages to explore via a href" do
    result = explore('http://www.google.com/')
    result.first.absolute_linked_resources.should include('http://video.google.com/?hl=en&tab=wv')
  end

  it "should translate relative urls to absolute ones" do
    result = explore('http://www.google.com/')
    result.first.absolute_linked_resources.should include('http://www.google.com/intl/en/about.html')
  end

  it "should report non successful status codes" do
    result = explore('http://www.google.com/dfkjaslfkjaslfkj.html')
    result.first.status_code.should eql("404")
  end

  it "should not get any links if it is not a text/xhtml file" do
    result = explore("http://jqueryjs.googlecode.com/files/jquery-1.3.2.min.js")
    result.first.absolute_linked_resources.should be_empty
  end

  it "should update url if redirected" do
    result = explore("http://www.thoughtworks.com/mingle")
    result.first.destination_url.should eql("http://studios.thoughtworks.com/mingle-agile-project-management")
  end

  it "should return source url as destination url if a error occurs" do
    result = explore("http://www.google.com/dfkjaslfkjaslfkj.html")
    result.first.destination_url.should eql("http://www.google.com/dfkjaslfkjaslfkj.html")
  end

  it "should be able to explore a url" do
    Magellan::Explorer.new('', links_to_explore).explore_a("http://www.yahoo.com")
  end

  it "should be able to go from http to https" do
    result = explore("http://mail.yahoo.com")
    result.first.destination_url.starts_with?("https://").should be_true
  end

  it "should be able to crawl ftp based links"

  # Runs an explorer over +urls+ (passed through unchanged, so either a single
  # url string or an array of urls) and returns whatever #explore returns.
  def explore(urls)
    Magellan::Explorer.new(urls, links_to_explore).explore
  end

  # Tag/attribute pairs handed to every explorer in this spec.
  def links_to_explore
    [["a","href"],["script","src"],["img","src"]]
  end
end
|
data/spec/logger_spec.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Magellan::Logger#update is expected to write one character to $stdout:
# '.' when passed true and 'F' when passed false.
describe Magellan::Logger do
  [
    ["should put a . for a pass", '.', true],
    ["should put a F for a fail", 'F', false]
  ].each do |description, expected_char, passed|
    it description do
      logger = Magellan::Logger.new
      $stdout.expects(:putc).with(expected_char)
      logger.update(Time.now, passed)
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Specs for the links_to_other_documents extension on WWW::Mechanize::Page.
describe "WWW::Mechanize::Page Extensions" do
  # Tag/attribute pairs passed to links_to_other_documents in every example.
  LINKS = [["a","href"],["script","src"],["img","src"]]

  it "should not return nil for script tags without src attributes" do
    doc = html_page("<script class=foo>something</script>")
    doc.links_to_other_documents(LINKS).should be_empty
  end

  it "should find links based on script tags with src attributes" do
    doc = html_page("<script class=foo src='foozor'>something</script>")
    links_to_other_documents = doc.links_to_other_documents(LINKS)
    links_to_other_documents.size.should eql(1)
    links_to_other_documents.first.to_s.should eql("foozor")
  end

  it "should be able to get two script sources" do
    doc = html_page("<body><script class=foo src='foozor'>something</script><script class=foo src='fdsajfkajf'>something</script></body>")
    links_to_other_documents = doc.links_to_other_documents(LINKS)
    links_to_other_documents.size.should eql(2)
  end

  it "should find links based on a tags with href attributes" do
    doc = html_page("<a class=foo href='bozo'>something</a>")
    links_to_other_documents = doc.links_to_other_documents(LINKS)
    links_to_other_documents.size.should eql(1)
    links_to_other_documents.first.to_s.should eql("bozo")
  end

  it "should find links based on img tags with src attributes" do
    doc = html_page("<img class=foo src='ohno' alt='whatever' />")
    links_to_other_documents = doc.links_to_other_documents(LINKS)
    links_to_other_documents.size.should eql(1)
    links_to_other_documents.first.to_s.should eql("ohno")
  end

  # The old description claimed links were found; the anchor has no href and
  # the assertion expects nothing back.
  it "should not return links for a tags without href attributes" do
    doc = html_page("<a class=foo>something</a>")
    doc.links_to_other_documents(LINKS).should be_empty
  end

  # Wraps +html+ in a text/html Mechanize page with no URI.
  def html_page(html)
    WWW::Mechanize::Page.new(nil, {'content-type' => "text/html"}, html)
  end

end
|
data/spec/result_spec.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Specs for Magellan::Result#absolute_linked_resources.
describe Magellan::Result do

  it "should not remove fragments when converting to absolute urls" do
    linked_resources = ["/index.html#foo"]
    results = Magellan::Result.new("200", "http://www.google.com/index.html", "http://www.google.com/index.html", linked_resources, "foo")
    absolute_urls = results.absolute_linked_resources
    absolute_urls.should include("http://www.google.com/index.html#foo")
  end

  # Source and destination hosts differ here; the expected url uses the
  # destination host.
  it "should use destination_url to build new absolute urls" do
    linked_resources = ["/index.html"]
    results = Magellan::Result.new("200", "http://www.google.com/bob.html", "http://www.foo.com/bob.html", linked_resources, "foo")
    absolute_urls = results.absolute_linked_resources
    absolute_urls.should include("http://www.foo.com/index.html")
  end


end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'spec'
|
3
|
+
require 'mocha'
|
4
|
+
require File.dirname(__FILE__) + '/../config/vendorized_gems'
|
5
|
+
|
6
|
+
# Put the gem's lib directory at the front of the load path unless it is
# already listed there.
lib_path = File.expand_path(File.join(File.dirname(__FILE__), "..", "lib"))
$LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path)

# Use Mocha as the mocking framework for the specs.
Spec::Runner.configure { |config| config.mock_with :mocha }
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
require 'magellan'
|
3
|
+
|
4
|
+
# Specs for the String extensions used on urls: to_absolute_url(base_url)
# and remove_fragment.
describe "String Extensions" do

  it "should convert relative urls to absolute" do
    input = '/Test_Automation_Framework/chrome/common/js/trac.js'
    input.to_absolute_url('http://www.google.com').should eql('http://www.google.com/Test_Automation_Framework/chrome/common/js/trac.js')
  end

  it "should remove any relative path from original url" do
    input = '/foo/trac.js'
    input.to_absolute_url('http://www.google.com/something/index.html').should eql('http://www.google.com/foo/trac.js')
  end

  it "should merge urls correctly with dots" do
    input = '../foo/trac.js'
    input.to_absolute_url('http://www.google.com/something/index.html').should eql('http://www.google.com/foo/trac.js')
  end

  it "should do nothing to absolute http urls" do
    input = 'http://www.apple.com'
    input.to_absolute_url('http://www.google.com').should eql('http://www.apple.com')
  end

  # The description used to read "absolute to relative"; the example converts
  # a relative path against a base url that ends in a slash.
  it "should not put double slashes when converting relative to absolute" do
    input = "/intl/en/about.html"
    input.to_absolute_url('http://www.google.com/').should eql('http://www.google.com/intl/en/about.html')
  end

  it "should do nothing to absolute https urls" do
    input = 'https://www.apple.com'
    input.to_absolute_url('http://www.google.com').should eql('https://www.apple.com')
  end

  it "should translate relative https urls to absolute" do
    input = "/intl/en/about.html"
    input.to_absolute_url('https://www.google.com/').should eql('https://www.google.com/intl/en/about.html')
  end

  it "should translate relative urls to absolute ones" do
    "/intl/en/about.html".to_absolute_url("http://www.google.com").should eql('http://www.google.com/intl/en/about.html')
  end

  it "should not translate absolute urls" do
    "http://video.google.com/foo/about.html".to_absolute_url("http://www.google.com").should eql("http://video.google.com/foo/about.html")
  end

  it "should return string itself if uri parse fails" do
    "something not a url".to_absolute_url("http://www.google.com").should eql("something not a url")
  end

  it "should chomp the fragment portion off the url" do
    "http://video.google.com/foo/about.html#sdkfjskajflsajf".remove_fragment.should eql("http://video.google.com/foo/about.html")
  end

  it "should strip spaces off of the input url" do
    input = ' http://www.apple.com'
    input.to_absolute_url('http://www.google.com').should eql('http://www.apple.com')
  end

  it "should correctly join urls that are strictly query params" do
    input = '?foo=bar'
    input.to_absolute_url('http://www.google.com/index.html?foo=zoro').should eql('http://www.google.com/index.html?foo=bar')
  end

end
|
metadata
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: magellan
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nolan Evans
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-04-06 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: mechanize
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: activesupport
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
description: TODO
|
36
|
+
email: nolane@gmail.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- README
|
43
|
+
files:
|
44
|
+
- VERSION.yml
|
45
|
+
- lib/magellan
|
46
|
+
- lib/magellan/broken_link_tracker.rb
|
47
|
+
- lib/magellan/cartographer.rb
|
48
|
+
- lib/magellan/expected_links_tracker.rb
|
49
|
+
- lib/magellan/explorer.rb
|
50
|
+
- lib/magellan/extensions
|
51
|
+
- lib/magellan/extensions/array.rb
|
52
|
+
- lib/magellan/extensions/mechanize_page.rb
|
53
|
+
- lib/magellan/extensions/string.rb
|
54
|
+
- lib/magellan/logger.rb
|
55
|
+
- lib/magellan/rake
|
56
|
+
- lib/magellan/rake/base_magellan_task.rb
|
57
|
+
- lib/magellan/rake/broken_link_task.rb
|
58
|
+
- lib/magellan/rake/expected_links_task.rb
|
59
|
+
- lib/magellan/result.rb
|
60
|
+
- lib/magellan.rb
|
61
|
+
- spec/array_spec.rb
|
62
|
+
- spec/broken_link_task_spec.rb
|
63
|
+
- spec/broken_link_tracker_spec.rb
|
64
|
+
- spec/cartographer_spec.rb
|
65
|
+
- spec/expected_links_task_spec.rb
|
66
|
+
- spec/expected_links_tracker_spec.rb
|
67
|
+
- spec/explorer_spec.rb
|
68
|
+
- spec/logger_spec.rb
|
69
|
+
- spec/mechanize_page_spec.rb
|
70
|
+
- spec/result_spec.rb
|
71
|
+
- spec/spec_helper.rb
|
72
|
+
- spec/string_extensions_spec.rb
|
73
|
+
- README
|
74
|
+
has_rdoc: true
|
75
|
+
homepage: http://github.com/nolman/magellan
|
76
|
+
post_install_message:
|
77
|
+
rdoc_options:
|
78
|
+
- --inline-source
|
79
|
+
- --charset=UTF-8
|
80
|
+
require_paths:
|
81
|
+
- lib
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: "0"
|
87
|
+
version:
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: "0"
|
93
|
+
version:
|
94
|
+
requirements: []
|
95
|
+
|
96
|
+
rubyforge_project: magellan
|
97
|
+
rubygems_version: 1.3.1
|
98
|
+
signing_key:
|
99
|
+
specification_version: 2
|
100
|
+
summary: A web testing framework that embraces the discoverable nature of the web
|
101
|
+
test_files: []
|
102
|
+
|