magellan 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,69 @@
1
+ = magellan
2
+
3
+ http://rubyforge.org/projects/magellan/
4
+ http://github.com/nolman/magellan/tree/master
5
+
6
+ == DESCRIPTION:
7
+
8
+ Magellan is a web testing tool that embraces the discoverable nature of the web.
9
+
10
+ == INSTALL:
11
+
12
+ $ [sudo] gem install magellan
13
+
14
+ == GETTING STARTED:
15
+
16
+ There are two supported rake tasks. The first is the Broken Link Task, which explores a site for any //script[@src], //img[@src] and //a[@href] that return http status codes of 4** or 5**.
17
+
18
+ In your Rakefile add:
19
+
20
+ require 'magellan/rake/broken_link_task'
21
+ Magellan::Rake::BrokenLinkTask.new("digg") do |t|
22
+ t.origin_url = "http://digg.com/"
23
+ t.explore_depth = 3
24
+ end
25
+
26
+ This will crawl any links within the same domain as the origin_url to a depth of 3. Treating the origin_url as depth 1, this means we will crawl all links that are within 2 pages of http://digg.com.
27
+
28
+ The second rake task is one that will explore your site and ensure that given links exist.
29
+
30
+ require 'magellan/rake/expected_links_task'
31
+ Magellan::Rake::ExpectedLinksTask.new("gap") do |t|
32
+ t.origin_url = "http://www.gap.com/"
33
+ t.explore_depth = 2
34
+ t.patterns_and_expected_links = [[/.*/,'http://www.oldnavy.com'],[/http:\/\/[^\/]*\/\z/,'/browse/division.do?cid=5643']]
35
+ end
36
+
37
+ The patterns and expected links setting is an array of tuples of regex, string. If the current url matches the regex, the task will look for the associated url string in the document. By default this task only crawls //a[@href]'s.
38
+
39
+ == ASSUMPTIONS:
40
+
41
+ This tool works best if you follow the practices of unobtrusive javascript, and properly make use of http status codes.
42
+
43
+ == DEPENDENCIES:
44
+
45
+ * ruby 1.8.6
46
+ * mechanize[http://mechanize.rubyforge.org/]
47
+ * activesupport[http://as.rubyonrails.org/]
48
+
49
+ == SUPPORT:
50
+
51
+ General help forum is located at:
52
+
53
+ * http://rubyforge.org/forum/forum.php?forum_id=31224
54
+
55
+ Mailing list:
56
+
57
+ * http://rubyforge.org/mailman/listinfo/magellan-users
58
+
59
+ Bug tracker:
60
+
61
+ * http://rubyforge.org/tracker/?atid=31199&group_id=8055
62
+
63
+ == AUTHOR:
64
+
65
+ Nolan Evans
66
+
67
+ http://www.nolanevans.com
68
+
69
+ nolane at gmail dot com
@@ -1,4 +1,4 @@
1
1
  ---
2
- :minor: 1
3
- :patch: 2
2
+ :patch: 3
4
3
  :major: 0
4
+ :minor: 1
@@ -1,14 +1,18 @@
1
1
  module Magellan
2
+ # The class that will track all broken links, urls that return 4** or 5** http status codes.
2
3
  class BrokenLinkTracker
3
4
  include Observable
4
5
 
6
+ # All results containing 4** or 5** http status codes
5
7
  attr_reader :broken_links
6
8
 
9
+ # Create a new broken link tracker
7
10
  def initialize
8
11
  @broken_links = []
9
12
  @first_linked_from = {}
10
13
  end
11
14
 
15
+ # The updates that come in via an observable subject: the time the result came at and the Magellan::Result itself.
12
16
  def update(time,result)
13
17
  failed = result.status_code.starts_with?("5") || result.status_code.starts_with?("4")
14
18
  @broken_links << result if failed
@@ -19,16 +23,19 @@ module Magellan
19
23
  end
20
24
  end
21
25
 
22
- def failed?
26
+ # Are there any broken links?
27
+ def failed?
23
28
  !@broken_links.empty?
24
29
  end
25
30
 
31
+ # A text message of all failures
26
32
  def failure_message
27
33
  @broken_links.map{|broken_link| broken_link_message(broken_link)}.join("\n")
28
34
  end
29
35
 
30
- def broken_link_message(broken_link)
31
- "#{broken_link.url} first linked from: #{@first_linked_from[broken_link.url]} returned: #{broken_link.status_code}"
36
+ # Generate the failure message for a Magellan::Result
37
+ def broken_link_message(result)
38
+ "#{result.url} first linked from: #{@first_linked_from[result.url]} returned: #{result.status_code}"
32
39
  end
33
40
  end
34
41
  end
@@ -2,9 +2,23 @@ require 'activesupport'
2
2
  require 'observer'
3
3
 
4
4
  module Magellan
5
+ # An instance of the Cartographer class maps a set of domains from a given starting url
6
+ # every time a new response is received the cartographer updates any observers listening to it
7
+ # to subscribe to the updates:
8
+ # cartographer = Cartographer.new({})
9
+ # cartographer.add_observer(some_observer_instance)
10
+ #
11
+ # Your observer instance should implement an update(time,result) method that takes in the current time and a Magellan::Result from the crawl
5
12
  class Cartographer
6
13
  include Observable
7
14
 
15
+ # Create a new Cartographer with a hash of settings:
16
+ # [:origin_url] - where to start exploring
17
+ # [:ignored_urls] - an array of absolute urls to not explore
18
+ # [:domains] - domains we should crawl
19
+ # [:depth_to_explore] - how deep to explore
20
+ # [:links_we_want_to_explore] - the kind of resources we will follow ex: //a[@href]
21
+ # [:trace] - enable a step by step trace
8
22
  def initialize(settings)
9
23
  @origin_url = settings[:origin_url]
10
24
  @known_urls = settings[:ignored_urls]
@@ -14,13 +28,15 @@ module Magellan
14
28
  @trace = settings[:trace]
15
29
  end
16
30
 
31
+ # Start recursively exploring the site at the origin url you specify.
17
32
  def crawl
18
33
  recursive_explore([@origin_url],1)
19
34
  end
20
35
 
36
+ # Recursively explore a list of urls until you reach a given depth or run out of known urls
21
37
  def recursive_explore(urls,depth)
22
38
  if i_am_not_too_deep?(depth)
23
- $stdout.puts "exploring:\n#{urls.join("\n")}" if @trace
39
+ $stdout.puts "\nexploring:\n#{urls.join("\n")}" if @trace
24
40
  results = Explorer.new(urls,@links_we_want_to_explore).explore
25
41
  results.each do |result|
26
42
  changed
@@ -41,14 +57,17 @@ module Magellan
41
57
  end
42
58
  end
43
59
 
44
- def i_have_seen_this_url_before?(url)
60
+ # Has the cartographer seen this url before?
61
+ def i_have_seen_this_url_before?(url)
45
62
  @known_urls.include?(url.remove_fragment)
46
63
  end
47
-
64
+
65
+ # Should we keep exploring this depth?
48
66
  def i_am_not_too_deep?(depth)
49
67
  depth <= @depth_to_explore
50
68
  end
51
69
 
70
+ # Is a given url in a domain that we care about?
52
71
  def a_domain_we_care_about?(url)
53
72
  begin
54
73
  !@domains.select { |domain| URI.parse(url).host == domain.host }.empty?
@@ -57,13 +76,10 @@ module Magellan
57
76
  end
58
77
  end
59
78
 
79
+ # Remove the javascript links from the set of links on the page.
60
80
  def remove_javascript_and_print_warning(result)
61
- result.linked_resources.delete_if do |linked_resource|
62
- starts_with_javascript = linked_resource.downcase.starts_with?("javascript:")
63
- #TODO: put this in the logger
64
- #$stderr.puts "Found obtrusive javascript: #{linked_resource} on page #{result.url}" if starts_with_javascript
65
- starts_with_javascript
66
- end
81
+ #TODO: put this in the logger
82
+ result.linked_resources.delete_if { |linked_resource| linked_resource.downcase.starts_with?("javascript:") }
67
83
  end
68
84
 
69
85
  end
@@ -1,14 +1,21 @@
1
1
  module Magellan
2
+ # The observer that will listen to all results and compare them to a list of rules about expected urls.
2
3
  class ExpectedLinksTracker
3
4
  include Observable
5
+ # An array of failed expectations
4
6
  attr_reader :errors
5
7
 
8
+ # Create a new expected links tracker.
9
+ # An array of tuples of the url pattern and expected link is a required argument.
10
+ # Example:
11
+ # Magellan::ExpectedLinksTracker.new([[/.*/,'/about_us.html']])
6
12
  def initialize(expected_patterns)
7
13
  @errors = []
8
14
  @expected_patterns = expected_patterns
9
15
  @evaluated_expectations = {}
10
16
  end
11
17
 
18
+ # The updates that come in via an observable subject: the time the result came at and the Magellan::Result itself.
12
19
  def update(time,result)
13
20
  if result.html_content?
14
21
  patterns_that_apply(result).each do |pattern,expectation|
@@ -21,34 +28,37 @@ module Magellan
21
28
  end
22
29
  end
23
30
 
24
- def patterns_that_apply(result)
31
+ def patterns_that_apply(result) # :nodoc:
25
32
  res = @expected_patterns.select{|pattern,expecation| result.url =~ pattern || result.destination_url =~ pattern}
26
33
  res.each { |expected_pattern| @evaluated_expectations[expected_pattern] = nil }
27
34
  res
28
35
  end
29
36
 
30
- def has_errors?
37
+ def has_errors? # :nodoc:
31
38
  !@errors.empty?
32
39
  end
33
40
 
34
- def unmet_expecations?
41
+ def unmet_expecations? # :nodoc:
35
42
  !unmet_expecations.empty?
36
43
  end
37
44
 
45
+ # Are there expected urls that have not been found yet, or pages that have been found with missing links?
38
46
  def failed?
39
47
  unmet_expecations? || has_errors?
40
48
  end
41
-
49
+
50
+ # A string summary of all failure messages
42
51
  def failure_message
43
52
  unmet_expecations_messages << errors.join("\n")
44
53
  end
45
54
 
46
- def unmet_expecations_messages
55
+ def unmet_expecations_messages # :nodoc:
47
56
  message = ""
48
57
  unmet_expecations.each {|pattern,unmet_expecation| message << "#{pattern} was never evaluted during the crawl\n"}
49
58
  message
50
59
  end
51
60
 
61
+ # Expectations that have never been evaluated
52
62
  def unmet_expecations
53
63
  @expected_patterns - @evaluated_expectations.keys
54
64
  end
@@ -3,14 +3,14 @@ require 'open-uri'
3
3
  require 'ostruct'
4
4
 
5
5
  module Magellan
6
- class Explorer
6
+ class Explorer # :nodoc:
7
7
  UNKNOWN_CONTENT = "unknown"
8
- def initialize(urls,links)
8
+ def initialize(urls,links) # :nodoc:
9
9
  @links = links
10
10
  @urls = urls
11
11
  end
12
12
 
13
- def explore
13
+ def explore # :nodoc:
14
14
  reqs = []
15
15
  @urls.each do |url|
16
16
  reqs.push Thread.new { explore_a(url) }
@@ -18,7 +18,7 @@ module Magellan
18
18
  reqs.collect { |req| req.value }
19
19
  end
20
20
 
21
- def explore_a(url)
21
+ def explore_a(url) # :nodoc:
22
22
  begin
23
23
  agent = WWW::Mechanize.new
24
24
  agent.user_agent = "Ruby/#{RUBY_VERSION}"
@@ -38,7 +38,7 @@ module Magellan
38
38
  end
39
39
  end
40
40
 
41
- def self.create_result(url,destination_url,status_code,links,content_type)
41
+ def self.create_result(url,destination_url,status_code,links,content_type) # :nodoc:
42
42
  Result.new(status_code,url,destination_url,links.map{|link| link.to_s},content_type)
43
43
  end
44
44
  end
@@ -1,4 +1,8 @@
1
1
  class Array
2
+ # Break down an array into chunks of a given max size.
3
+ # Example:
4
+ # [1,2,3,4].chunk(3) # => [[1,2,3],[4]]
5
+ # [1,2,3,4].chunk(2) # => [[1,2],[3,4]]
2
6
  def chunk(max_size)
3
7
  result = []
4
8
  number_of_chunks = (self.size.to_f / max_size).ceil
@@ -1,10 +1,10 @@
1
1
  require 'mechanize'
2
2
  class WWW::Mechanize::Page
3
- def links_to_other_documents(links_to_other_resources)
3
+ def links_to_other_documents(links_to_other_resources) # :nodoc:
4
4
  links_to_other_resources.map {|links_to_other_resource| get_attributes(links_to_other_resource.first,links_to_other_resource.last)}.flatten
5
5
  end
6
6
 
7
- def get_attributes(tag,attribute)
7
+ def get_attributes(tag,attribute) # :nodoc:
8
8
  (self/tag).map{|alink| alink.attributes[attribute]}.compact
9
9
  end
10
10
  end
@@ -1,7 +1,12 @@
1
1
  require 'activesupport'
2
2
  require 'open-uri'
3
3
  class String
4
- def to_absolute_url(origin_url)
4
+
5
+ # Converts a relative url to an absolute url
6
+ # Example:
7
+ # '/foo.html'.to_absolute_url('http://www.google.com/index.html?foo=b') # => http://www.google.com/foo.html
8
+ # '?foo=a'.to_absolute_url('http://www.google.com/index.html?foo=b') # => http://www.google.com/index.html?foo=a
9
+ def to_absolute_url(origin_url) # :nodoc:
5
10
  begin
6
11
  #BUG in URI.join? URI.join('http://www.google.com/index.html?foo=b','?foo=a') # => http://www.google.com/?foo=a
7
12
  stripped = self.strip
@@ -15,6 +20,9 @@ class String
15
20
  end
16
21
  end
17
22
 
23
+ # Removes a fragment from a URL
24
+ # Example:
25
+ # '/foo.html#fsajfksafd'.remove_fragment # => /foo.html
18
26
  def remove_fragment
19
27
  self.gsub(/#.*/,'')
20
28
  end
@@ -1,14 +1,16 @@
1
- module Magellan
2
- class Logger
3
- def initialize(file_name=nil)
1
+ module Magellan
2
+ class Logger # :nodoc:
3
+
4
+ def initialize(file_name=nil) # :nodoc:
4
5
  @file_name = file_name
5
6
  File.open(@file_name, 'a') {} if @file_name
6
7
  end
7
8
 
8
- def update(time,passed,message)
9
+ def update(time,passed,message) # :nodoc:
9
10
  $stdout.putc(passed ? '.' : 'F')
10
11
  $stdout.flush
11
12
  File.open(@file_name, 'a') {|f| f.write(message + "\n") } if @file_name && !passed
12
13
  end
14
+
13
15
  end
14
16
  end
@@ -1,23 +1,30 @@
1
-
2
- #TODO: this is not a good place to use a template method - violates Liskov substitution principle
3
1
  module Magellan
4
2
  module Rake
3
+ # The base magellan rake task, defines most attributes associated with running a magellan task
4
+ #TODO: this is not a good place to use a template method - violates Liskov substitution principle
5
5
  class BaseMagellanTask < ::Rake::TaskLib
6
+ # The url to start the crawl at
6
7
  attr_accessor :origin_url
8
+ # How deep to explore
7
9
  attr_accessor :explore_depth
10
+ # An array of urls to not crawl
8
11
  attr_accessor :ignored_urls
12
+ # The kind of links you would like
9
13
  attr_accessor :links_to_explore
14
+ # The success message for the task, this is set by the broken link and expected links task.
10
15
  attr_accessor :success_message
16
+ # If this is set the logger will log out failures to a file that you specify here, you can tail this log
17
+ # while the crawl is running so you can see what is failing
11
18
  attr_accessor :failure_log
12
19
 
13
- def initialize(name)
20
+ def initialize(name) # :nodoc:
14
21
  @ignored_urls = []
15
22
  @name=name
16
23
  yield self if block_given?
17
24
  define
18
25
  end
19
26
 
20
- def define
27
+ def define # :nodoc:
21
28
  desc description
22
29
  task @name do
23
30
  settings = {:origin_url => origin_url, :depth_to_explore => explore_depth, :domains => [origin_url],
@@ -37,7 +44,6 @@ module Magellan
37
44
 
38
45
  end
39
46
 
40
-
41
47
  end
42
48
  end
43
49
  end
@@ -5,19 +5,26 @@ require 'magellan/rake/base_magellan_task'
5
5
 
6
6
  module Magellan
7
7
  module Rake
8
-
8
+ # Example:
9
+ # require 'magellan/rake/broken_link_task'
10
+ # Magellan::Rake::BrokenLinkTask.new("digg") do |t|
11
+ # t.origin_url = "http://digg.com/"
12
+ # t.explore_depth = 3
13
+ # end
9
14
  class BrokenLinkTask < BaseMagellanTask
15
+
16
+ # Defines a new task, using the name +name+.
10
17
  def initialize(name="magellan:explore")
11
18
  @links_to_explore = [["a","href"],["script","src"],["img","src"]]
12
19
  @success_message = "No broken links were found!"
13
20
  super(name)
14
21
  end
15
22
 
16
- def create_observer
23
+ def create_observer # :nodoc:
17
24
  Magellan::BrokenLinkTracker.new
18
25
  end
19
26
 
20
- def description
27
+ def description # :nodoc:
21
28
  "explore #{@origin_url} for broken links"
22
29
  end
23
30
  end
@@ -5,21 +5,30 @@ require 'magellan/rake/base_magellan_task'
5
5
 
6
6
  module Magellan
7
7
  module Rake
8
-
8
+ # Example:
9
+ # Magellan::Rake::ExpectedLinksTask.new("digg") do |t|
10
+ # t.origin_url = "http://digg.com/"
11
+ # t.explore_depth = 2
12
+ # t.patterns_and_expected_links = YAML.load_file("digg.yml")
13
+ # end
9
14
  class ExpectedLinksTask < BaseMagellanTask
15
+ # Tuple of patterns and expected links at a given pattern
16
+ # Example:
17
+ # patterns_and_expected_links = [[/.*/,'/about_us.html']] # => this says all pages should have a link to the about us page.
10
18
  attr_accessor :patterns_and_expected_links
11
19
 
20
+ # Defines a new task, using the name +name+.
12
21
  def initialize(name="magellan:check_links")
13
22
  @success_message = "All expected links found!"
14
23
  @links_to_explore = [["a","href"]]
15
24
  super(name)
16
25
  end
17
26
 
18
- def description
27
+ def description # :nodoc:
19
28
  "Explore #{@origin_url} and find check if all given patterns are matched"
20
29
  end
21
30
 
22
- def create_observer
31
+ def create_observer # :nodoc:
23
32
  Magellan::ExpectedLinksTracker.new(@patterns_and_expected_links)
24
33
  end
25
34
  end
@@ -1,7 +1,17 @@
1
1
  module Magellan
2
+ # The resulting data from crawling a url
2
3
  class Result
3
- attr_reader :status_code,:url,:destination_url,:linked_resources
4
- def initialize(status_code,url,destination_url,linked_resources,content_type)
4
+ # The http status code returned by the request for the specified url
5
+ attr_reader :status_code
6
+ # The original URL requested
7
+ attr_reader :url
8
+ # The destination URL after following redirects
9
+ attr_reader :destination_url
10
+ # Relative linked resources (based off of the kinds of links you are looking for)
11
+ attr_reader :linked_resources
12
+
13
+ # create a new result, with the status code, url, destination url, linked resources and content type, see attr_readers for more information about these fields
14
+ def initialize(status_code,url,destination_url,linked_resources,content_type) # :nodoc:
5
15
  @status_code = status_code
6
16
  @url = url
7
17
  @destination_url = destination_url
@@ -9,10 +19,12 @@ module Magellan
9
19
  @content_type = content_type
10
20
  end
11
21
 
12
- def absolute_linked_resources
22
+ # Absolute links to resources
23
+ def absolute_linked_resources # :nodoc:
13
24
  absolute_links = linked_resources.map { |linked_resource| linked_resource.to_s.to_absolute_url(destination_url) }.compact
14
25
  end
15
26
 
27
+ # Was the document text/html
16
28
  def html_content?
17
29
  @content_type.starts_with?("text/html")
18
30
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: magellan
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nolan Evans
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-04-07 00:00:00 -07:00
12
+ date: 2009-04-09 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -39,8 +39,9 @@ executables: []
39
39
  extensions: []
40
40
 
41
41
  extra_rdoc_files:
42
- - README
42
+ - README.rdoc
43
43
  files:
44
+ - README.rdoc
44
45
  - VERSION.yml
45
46
  - lib/magellan
46
47
  - lib/magellan/broken_link_tracker.rb
@@ -70,7 +71,6 @@ files:
70
71
  - spec/result_spec.rb
71
72
  - spec/spec_helper.rb
72
73
  - spec/string_extensions_spec.rb
73
- - README
74
74
  has_rdoc: true
75
75
  homepage: http://github.com/nolman/magellan
76
76
  post_install_message:
data/README DELETED
@@ -1,11 +0,0 @@
1
- Magellan: (alpha)
2
-
3
- Currently the supported functionality is a rake task that crawl your website and find any broken a[@href], img[@src], or script[@src] links.
4
-
5
- Magellan::Rake::Task.new do |t|
6
- t.origin_url = "http://localhost:3000/"
7
- t.explore_depth = 100
8
- end
9
-
10
- Assumptions:
11
- This tool works best if you follow the rules of unobtrusive javascript and property set the http status code header.