bad_link_finder 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/LICENCE.txt +22 -0
- data/README.md +13 -0
- data/bin/bad_link_finder +9 -0
- data/lib/bad_link_finder/csv_builder.rb +22 -0
- data/lib/bad_link_finder/link.rb +68 -0
- data/lib/bad_link_finder/page.rb +31 -0
- data/lib/bad_link_finder/page_checker.rb +22 -0
- data/lib/bad_link_finder/result_cache.rb +24 -0
- data/lib/bad_link_finder/site.rb +21 -0
- data/lib/bad_link_finder/site_checker.rb +26 -0
- data/lib/bad_link_finder/version.rb +3 -0
- data/lib/bad_link_finder.rb +24 -0
- data/test/fixtures/www.example.com/example/index.html +9 -0
- data/test/fixtures/www.example.com/example/relative-example.html +14 -0
- data/test/fixtures/www.example.com/index.html +7 -0
- data/test/integration/bad_link_finder_test.rb +44 -0
- data/test/support/matchers.rb +5 -0
- data/test/test_helper.rb +11 -0
- data/test/unit/csv_builder_test.rb +45 -0
- data/test/unit/link_test.rb +70 -0
- data/test/unit/page_checker_test.rb +22 -0
- data/test/unit/page_test.rb +50 -0
- data/test/unit/result_cache_test.rb +38 -0
- data/test/unit/site_test.rb +22 -0
- metadata +166 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
N2ZkYWM1MjViZmM0M2E3ZDUyZDQ1NmU2MjQ4NDU5Yzk4YjMwZTY3Yg==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MTljZDc2YzUxNmFkZmZjMWNhMzlkYWE1MDg1OWQ2YzU2OWFkYzUzNw==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NjM3MjRkOTVlMGRlYWI0NTliMDViMDI2ZjlkZjI1MGZjNDhjYTMwYWIyNjUy
|
10
|
+
OGJjODRiZmJlYzMzMzI2NzcyZGJhNjE4ZmY4ZjQzZjFlZTMyOWQ4ZDk5MzZm
|
11
|
+
MmRiZWYxYmViYmMwYzJjMjdmNTQyNWU2MTIzMDUzYWE0MmFiYmY=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MTBlYjc4OTU5OWFkOWM5YTE4MDVhNjgzNDI0ZmQxNmQwODcwZWU3NzhkMTlh
|
14
|
+
MzQ0YjhhMzExY2JmZGFmN2M2YmYxOTNhNmRmNDg3M2I3MTQ3NjljMmU2NTg4
|
15
|
+
NmNlZWNkYzIwOTc1ZWRlMjU4ODE2MDI3NDgxODc4NTZkODA4YTc=
|
data/LICENCE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Elliot Crosby-McCullough
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Bad link finder
|
2
|
+
|
3
|
+
Crawls a mirrored site and checks that all links either return a successful response or redirect to somewhere that does.
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
Set environment variables:
|
8
|
+
|
9
|
+
- `MIRROR_DIR`, to the location of your mirrored site.
|
10
|
+
- `REPORT_OUTPUT_FILE`, to the location you'd like the CSV saved to.
|
11
|
+
- `SITE_HOST`, to the full host of the live site you'd like to test against, including protocol. For example, `https://www.example.com`
|
12
|
+
|
13
|
+
Then execute `bad_link_finder`.
|
data/bin/bad_link_finder
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module BadLinkFinder
  # Serialises a map of page URLs to their bad links as a CSV report.
  #
  # Every bad link object must respond to +link+, +error_message+ and
  # +exception+; those are the only methods called on it here.
  class CSVBuilder
    # @param bad_link_map [Hash] page URL => collection of bad link objects
    def initialize(bad_link_map)
      @bad_link_map = bad_link_map
    end

    # Renders the report. The string is built on first call and memoised.
    #
    # @return [String] a header row followed by one row per bad link
    def to_s
      @to_s ||= CSV.generate(encoding: 'UTF-8') do |report|
        report << ['page_url', 'link', 'error_message', 'raw_error_message']

        @bad_link_map.each do |page_url, bad_links|
          bad_links.each { |bad_link| report << report_row(page_url, bad_link) }
        end
      end
    end

    private

    # One CSV row; the last column is left empty when no exception was recorded.
    def report_row(page_url, bad_link)
      raw_message = bad_link.exception ? bad_link.exception.message : nil
      [page_url, bad_link.link, bad_link.error_message, raw_message]
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
module BadLinkFinder
  # A single link found on a page, checked by requesting it over HTTP.
  #
  # The check happens in the constructor: the link is resolved against the
  # page URL and requested with HEAD (falling back to GET once if the server
  # answers 405). Any failure is recorded on the instance rather than raised.
  class Link
    attr_reader :link, :url, :error_message, :exception

    # @param page_url [String] absolute URL of the page the link appears on
    # @param link [String] the href exactly as found on the page
    def initialize(page_url, link)
      @page_url = page_url
      @link = link
      @url = get_url_from_link(link)

      validate_with_request

    rescue URI::InvalidURIError => exception
      record_error("This link is in a bad format", exception)
    # Must come before ResponseCodeError: UnauthorizedError is a subclass of
    # it, so the more general clause would otherwise swallow 401s.
    rescue Mechanize::UnauthorizedError => exception
      record_error("This link requires authorisation", exception)
    rescue Mechanize::ResponseCodeError => exception
      if exception.response_code.to_i == 405 && !@head_unsupported
        # HEAD is not allowed here; retry exactly once using GET.
        @head_unsupported = true
        retry
      else
        record_error("This request returned a #{exception.response_code}", exception)
      end
    rescue Mechanize::UnsupportedSchemeError => exception
      record_error("This link has a scheme we can't load (should be http or https)", exception)
    rescue Mechanize::RedirectLimitReachedError => exception
      record_error("This link might be in a redirect loop", exception)
    rescue Mechanize::RobotsDisallowedError => exception
      record_error("This link is blocked by robots.txt or nofollow attributes", exception)
    rescue Mechanize::Error, Net::HTTP::Persistent::Error, Timeout::Error, Errno::EINVAL,
      Errno::ECONNRESET, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
      Net::ProtocolError, OpenSSL::SSL::SSLError, SocketError => exception # Thanks Net::HTTP
      record_error("The server failed to serve this page properly", exception)
    end

    # @return [Boolean] true when no error was recorded while checking the link
    def valid?
      @error_message.nil?
    end

    protected

    # Requests the resolved URL: HEAD by default, GET once HEAD has been
    # rejected with a 405. Raises whatever Mechanize raises on failure.
    def validate_with_request
      puts "-- testing link #{@link} using #{@url}"
      sleep 0.1 # Recommended pause for gov.uk rate limiting

      browser = Mechanize.new
      browser.user_agent = 'GOV.UK link checker'

      if @head_unsupported
        browser.get(@url)
      else
        browser.head(@url)
      end
    end

    # Resolves the href against the page URL.
    #
    # @return [String] the URL to request
    def get_url_from_link(link)
      URI.join(@page_url, link).to_s
    end

    # Stores the human-readable message and the underlying exception, and logs both.
    def record_error(message, exception = nil)
      @error_message = message
      @exception = exception

      puts "---- found broken link #{@url}: #{message}: #{exception.message if exception}"
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module BadLinkFinder
|
4
|
+
class Page
|
5
|
+
def initialize(mirror_dir, path)
|
6
|
+
@path = strip_html_ending(path)
|
7
|
+
|
8
|
+
file = mirror_dir + path
|
9
|
+
doc = Nokogiri::HTML(file.read)
|
10
|
+
@links = doc.css('a').map do |a|
|
11
|
+
strip_html_ending(a['href']) unless ignore_link?(a['href'])
|
12
|
+
end.compact
|
13
|
+
end
|
14
|
+
|
15
|
+
attr_reader :path, :links
|
16
|
+
|
17
|
+
protected
|
18
|
+
|
19
|
+
def strip_html_ending(href)
|
20
|
+
if href.start_with?('http')
|
21
|
+
href
|
22
|
+
else
|
23
|
+
href.sub(%r{(?<!\?)(?:index\.html|\.html)(.*)}, '\1')
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def ignore_link?(href)
|
28
|
+
href.nil? || href.start_with?('#', 'mailto:')
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'bad_link_finder/link'
|
2
|
+
|
3
|
+
module BadLinkFinder
  # Checks every link on one page and reports the ones that are not valid.
  class PageChecker
    attr_reader :page_url

    # @param host [String] site host including protocol, with or without a trailing slash
    # @param page [#path, #links] the page whose links should be checked
    # @param result_cache [#fetch, #store] shared cache of already-checked links
    def initialize(host, page, result_cache)
      host = host.chomp('/') + '/'
      @page = page
      @page_url = URI.join(host, page.path).to_s
      @result_cache = result_cache
    end

    # @return [Array] the checked link objects for which +valid?+ is false (memoised)
    def bad_links
      @bad_links ||= @page.links.map { |raw_link| checked_link(raw_link) }.reject(&:valid?)
    end

    protected

    # Returns the cached result for the link's target, checking it first if
    # it has not been seen. A cache hit returns the object built for the
    # first href that resolved to this URL, so its +link+ text may differ
    # from +raw_link+.
    def checked_link(raw_link)
      key = cache_key(raw_link)
      @result_cache.fetch(key) || @result_cache.store(key, BadLinkFinder::Link.new(@page_url, raw_link))
    end

    # The cache is keyed by the absolute URL the link resolves to, not the
    # raw href: the same relative href on two pages points at two different
    # URLs and must not share a result. Hrefs that cannot be resolved are
    # keyed by their raw text.
    def cache_key(raw_link)
      URI.join(@page_url, raw_link).to_s
    rescue URI::Error
      raw_link
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# If/when the bad link finder is converted to a set of parallel processes
|
2
|
+
# this cache will need to be backed by something threadsafe.
|
3
|
+
|
4
|
+
module BadLinkFinder
  # In-memory store of link check results, backed by a plain Hash.
  #
  # Keys are compared with their "#fragment" removed, so keys differing only
  # by anchor share a single entry.
  class ResultCache
    def initialize
      @results = {}
    end

    # @param key [String] lookup key
    # @return [Object, nil] the stored value, or nil on a miss
    def fetch(key)
      @results[stripped_key(key)]
    end

    # @param key [String] key to file the result under
    # @param link [Object] value to remember
    # @return [Object] the value that was stored
    def store(key, link)
      @results.store(stripped_key(key), link)
    end

    protected

    # Drops everything from the first "#" to the end of that line.
    def stripped_key(key)
      key.sub(/#.*$/, '')
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
require 'bad_link_finder/page'
|
3
|
+
|
4
|
+
module BadLinkFinder
  # An enumerable view of a mirrored site: one Page per file under the mirror
  # directory.
  class Site
    include Enumerable

    # @param mirror_dir [String, Pathname] root directory of the mirrored site
    def initialize(mirror_dir)
      @mirror_dir = mirror_dir.is_a?(String) ? Pathname.new(mirror_dir) : mirror_dir
    end

    # Yields a BadLinkFinder::Page for every non-directory entry below the
    # mirror directory.
    #
    # @yieldparam page [BadLinkFinder::Page]
    # @return [Enumerator] when called without a block
    def each
      return enum_for(:each) unless block_given?

      # Pages are built after the working directory has been restored, so a
      # relative mirror_dir still resolves correctly when Page reads the file
      # and the caller's block never runs inside a changed directory.
      page_paths.each { |path| yield BadLinkFinder::Page.new(@mirror_dir, path) }
    end

    protected

    # Paths of all files under the mirror directory, relative to it.
    def page_paths
      Dir.chdir(@mirror_dir) do
        Dir.glob('**/*').reject { |path| File.directory?(path) }
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'bad_link_finder/site'
|
2
|
+
require 'bad_link_finder/result_cache'
|
3
|
+
require 'bad_link_finder/page_checker'
|
4
|
+
|
5
|
+
module BadLinkFinder
  # Runs a PageChecker over every page of a mirrored site, sharing one
  # result cache between them.
  class SiteChecker
    # @param mirror_dir [String] directory containing the mirrored site (expanded to an absolute path)
    # @param host [String] host passed on to each PageChecker
    def initialize(mirror_dir, host)
      @mirror_dir = File.expand_path(mirror_dir)
      @host = host
      @result_cache = BadLinkFinder::ResultCache.new
    end

    # Checks each page in turn, logging progress to stdout.
    #
    # @return [Hash] page URL => bad links, containing only pages that have any
    def run
      BadLinkFinder::Site.new(@mirror_dir).each_with_object({}) do |page, bad_link_map|
        checker = BadLinkFinder::PageChecker.new(@host, page, @result_cache)
        puts "Checking page #{page.path} as #{checker.page_url}"

        page_bad_links = checker.bad_links
        next unless page_bad_links.any?

        bad_link_map[checker.page_url] = page_bad_links
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'bad_link_finder/site_checker'
|
2
|
+
require 'bad_link_finder/csv_builder'
|
3
|
+
require 'pathname'
|
4
|
+
|
5
|
+
module BadLinkFinder
  # Raised when a required environment variable is missing or points at a
  # directory that does not exist.
  class EnvironmentVariableError < ArgumentError; end

  # Entry point: checks the mirror named by MIRROR_DIR against SITE_HOST and
  # writes the CSV report to REPORT_OUTPUT_FILE, creating the report's parent
  # directories as needed.
  #
  # @raise [EnvironmentVariableError] if any of the three variables is unset,
  #   or MIRROR_DIR is not an existing directory
  def self.run
    missing = ['MIRROR_DIR', 'REPORT_OUTPUT_FILE', 'SITE_HOST'].find { |var| !ENV.key?(var) }
    raise EnvironmentVariableError, "Missing environment variable #{missing}" if missing

    unless Dir.exist?(ENV['MIRROR_DIR'])
      raise EnvironmentVariableError, "MIRROR_DIR '#{ENV['MIRROR_DIR']}' does not exist"
    end

    site_checker = BadLinkFinder::SiteChecker.new(ENV['MIRROR_DIR'], ENV['SITE_HOST'])
    report = CSVBuilder.new(site_checker.run)

    report_path = Pathname.new(ENV['REPORT_OUTPUT_FILE'])
    report_path.parent.mkpath
    # IO#write stringifies its argument, which renders the CSV.
    report_path.open('w') { |file| file.write(report) }
  end
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head><title>Example site</title></head>
|
4
|
+
<body>
|
5
|
+
<a href='relative-example.html'>Relative example</a>
|
6
|
+
<a href='relative-example.html'>Relative example</a>
|
7
|
+
<a href='https://www.example.net/external-example.html'>External example</a>
|
8
|
+
</body>
|
9
|
+
</html>
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head><title>Example site</title></head>
|
4
|
+
<body>
|
5
|
+
<!-- Included -->
|
6
|
+
<a href='/example/index.html?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1'></a>
|
7
|
+
<a href=''></a>
|
8
|
+
|
9
|
+
<!-- Excluded -->
|
10
|
+
<a></a>
|
11
|
+
<a href='#section-2'></a>
|
12
|
+
<a href='mailto:test@example.com'></a>
|
13
|
+
</body>
|
14
|
+
</html>
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require 'webmock/minitest'
|
3
|
+
require 'bad_link_finder'
|
4
|
+
|
5
|
+
describe BadLinkFinder do
|
6
|
+
|
7
|
+
before do
|
8
|
+
stub_request(:any, 'http://www.example.com/example/').to_return(status: 200)
|
9
|
+
stub_request(:any, 'http://www.example.com/example/relative-example').to_return(status: 302, headers: {'Location' => 'http://www.example.com/example/'})
|
10
|
+
stub_request(:any, 'https://www.example.net/external-example.html').to_return(status: 500)
|
11
|
+
stub_request(:any, 'http://www.example.com/example/?test=true&redirect=http://www.example.com/in-param-url/index.html').to_return(status: 404)
|
12
|
+
|
13
|
+
ENV['MIRROR_DIR'] = (FIXTURES_ROOT+'www.example.com').to_s
|
14
|
+
ENV['REPORT_OUTPUT_FILE'] = (TMP_ROOT+'bad_links.csv').to_s
|
15
|
+
ENV['SITE_HOST'] = 'http://www.example.com/'
|
16
|
+
end
|
17
|
+
|
18
|
+
it "finds all broken links and exports to a CSV" do
|
19
|
+
BadLinkFinder.run
|
20
|
+
|
21
|
+
csv_string = File.read(ENV['REPORT_OUTPUT_FILE'])
|
22
|
+
|
23
|
+
assert_match 'http://www.example.com/example/', csv_string
|
24
|
+
end
|
25
|
+
|
26
|
+
it "complains if key variables are missing" do
|
27
|
+
['MIRROR_DIR', 'REPORT_OUTPUT_FILE', 'SITE_HOST'].each do |var|
|
28
|
+
ENV.delete(var)
|
29
|
+
|
30
|
+
assert_raises(BadLinkFinder::EnvironmentVariableError) do
|
31
|
+
BadLinkFinder.run
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
it "complains if the MIRROR_DIR does not exist" do
|
37
|
+
ENV['MIRROR_DIR'] = (FIXTURES_ROOT+'this_does_not_exist').to_s
|
38
|
+
|
39
|
+
assert_raises(BadLinkFinder::EnvironmentVariableError) do
|
40
|
+
BadLinkFinder.run
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
$LOAD_PATH.unshift File.expand_path("../../lib", __FILE__)
|
3
|
+
$LOAD_PATH.unshift File.expand_path("..", __FILE__)
|
4
|
+
|
5
|
+
require 'minitest/autorun'
|
6
|
+
require 'support/matchers'
|
7
|
+
|
8
|
+
require 'pathname'
|
9
|
+
APP_ROOT = Pathname.new(File.join(File.dirname(__FILE__), '..'))
|
10
|
+
FIXTURES_ROOT = APP_ROOT+'test/fixtures'
|
11
|
+
TMP_ROOT = APP_ROOT+'tmp'
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require 'bad_link_finder/csv_builder'
|
3
|
+
require 'ostruct'
|
4
|
+
require 'csv'
|
5
|
+
|
6
|
+
describe BadLinkFinder::CSVBuilder do
|
7
|
+
|
8
|
+
it "flattens out the bad links map into a CSV structure" do
|
9
|
+
bad_link_map = {
|
10
|
+
'http://www.example.com/example/' => [
|
11
|
+
mock_link(link: 'https://www.example.net/external-example.html', error_message: "This link returned a 404", exception: TestException.new('404 not found')),
|
12
|
+
mock_link(link: 'relative-example', error_message: "Nope")
|
13
|
+
],
|
14
|
+
'http://www.example.com/example/relative-example' => [
|
15
|
+
mock_link(
|
16
|
+
link: '/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1',
|
17
|
+
error_message: "What even is this?",
|
18
|
+
exception: TestException.new('Test exception')
|
19
|
+
)
|
20
|
+
]
|
21
|
+
}
|
22
|
+
|
23
|
+
csv_builder = BadLinkFinder::CSVBuilder.new(bad_link_map)
|
24
|
+
|
25
|
+
parsed_csv = CSV.parse(csv_builder.to_s)
|
26
|
+
|
27
|
+
headers = parsed_csv.shift
|
28
|
+
assert_equal ['page_url', 'link', 'error_message', 'raw_error_message'], headers
|
29
|
+
|
30
|
+
assert_equal bad_link_map.values.flatten.count, parsed_csv.count
|
31
|
+
|
32
|
+
bad_link_map.each do |page_url, links|
|
33
|
+
links.each do |link|
|
34
|
+
assert parsed_csv.include?([page_url, link.link, link.error_message, (link.exception.message if link.exception)])
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def mock_link(attrs)
|
40
|
+
OpenStruct.new(attrs)
|
41
|
+
end
|
42
|
+
|
43
|
+
# Stand-in error attached to the mocked links. It derives from StandardError
# rather than Exception so that an ordinary `rescue` can handle it.
class TestException < StandardError; end
|
44
|
+
|
45
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require 'webmock/minitest'
|
3
|
+
require 'bad_link_finder/link'
|
4
|
+
|
5
|
+
describe BadLinkFinder::Link do
|
6
|
+
|
7
|
+
describe '#valid?' do
|
8
|
+
it "approves fully qualified urls which get a good response" do
|
9
|
+
stub_url("http://www.example.com", 200)
|
10
|
+
link = build_link('http://www.example.com')
|
11
|
+
|
12
|
+
assert link.valid?
|
13
|
+
end
|
14
|
+
|
15
|
+
it "approves relative paths which get a good response" do
|
16
|
+
stub_url("http://www.example.com/somewhere/an-example-path", 200)
|
17
|
+
link = build_link('an-example-path', page_url: 'http://www.example.com/somewhere/')
|
18
|
+
|
19
|
+
assert link.valid?
|
20
|
+
end
|
21
|
+
|
22
|
+
it "approves absolute paths which get a good response" do
|
23
|
+
stub_url("http://www.example.com/an-example-path", 200)
|
24
|
+
link = build_link('/an-example-path', page_url: 'http://www.example.com/somewhere/')
|
25
|
+
|
26
|
+
assert link.valid?
|
27
|
+
end
|
28
|
+
|
29
|
+
it "reports malformed links without checking the internet" do
|
30
|
+
link = build_link('htt[]://{an-example-path}')
|
31
|
+
|
32
|
+
refute link.valid?
|
33
|
+
assert_equal "This link is in a bad format", link.error_message
|
34
|
+
end
|
35
|
+
|
36
|
+
it "reports links returning failure status codes" do
|
37
|
+
stub_url("http://www.example.com/an-example-path", 404)
|
38
|
+
link = build_link('/an-example-path')
|
39
|
+
|
40
|
+
refute link.valid?
|
41
|
+
assert_equal "This request returned a 404", link.error_message
|
42
|
+
end
|
43
|
+
|
44
|
+
it "reports URLs returning failure status codes" do
|
45
|
+
stub_url("https://www.example.net/an-external-failure", 500)
|
46
|
+
link = build_link('https://www.example.net/an-external-failure')
|
47
|
+
|
48
|
+
refute link.valid?
|
49
|
+
assert_equal "This request returned a 500", link.error_message
|
50
|
+
end
|
51
|
+
|
52
|
+
it "retries 405s as GET requests" do
|
53
|
+
stub_request(:head, "http://www.example.com/an-example-path").to_return(status: 405)
|
54
|
+
stub_request(:get, "http://www.example.com/an-example-path").to_return(status: 200)
|
55
|
+
link = build_link('/an-example-path')
|
56
|
+
|
57
|
+
assert link.valid?
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def stub_url(url, status)
|
62
|
+
stub_request(:any, url).to_return(status: status)
|
63
|
+
end
|
64
|
+
|
65
|
+
def build_link(link_path, opts = {})
|
66
|
+
page_url = opts[:page_url] || 'http://www.example.com'
|
67
|
+
BadLinkFinder::Link.new(page_url, link_path)
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require 'bad_link_finder/page_checker'
|
3
|
+
require 'bad_link_finder/page'
|
4
|
+
require 'bad_link_finder/result_cache'
|
5
|
+
|
6
|
+
describe BadLinkFinder::PageChecker do
|
7
|
+
|
8
|
+
describe "#page_url" do
|
9
|
+
it "correctly merges the host with the page path" do
|
10
|
+
assert_equal 'http://www.example.com/', build_page_checker('index.html').page_url.to_s
|
11
|
+
assert_equal 'http://www.example.com/example/', build_page_checker('example/index.html').page_url.to_s
|
12
|
+
assert_equal 'http://www.example.com/example/relative-example', build_page_checker('example/relative-example.html').page_url.to_s
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def build_page_checker(path)
|
17
|
+
site_mirror = FIXTURES_ROOT+'www.example.com'
|
18
|
+
page = BadLinkFinder::Page.new(site_mirror, path)
|
19
|
+
BadLinkFinder::PageChecker.new('http://www.example.com/', page, BadLinkFinder::ResultCache.new)
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require 'bad_link_finder/page'
|
3
|
+
|
4
|
+
describe BadLinkFinder::Page do
|
5
|
+
|
6
|
+
it "strips index.html and .html from the page path" do
|
7
|
+
assert_equal '', build_page('index.html').path.to_s
|
8
|
+
assert_equal 'example/', build_page('example/index.html').path.to_s
|
9
|
+
assert_equal 'example/relative-example', build_page('example/relative-example.html').path.to_s
|
10
|
+
end
|
11
|
+
|
12
|
+
it "finds absolute paths, stripping index.html and .html" do
|
13
|
+
assert_equal ['/example/'], build_page('index.html').links.map(&:to_s)
|
14
|
+
end
|
15
|
+
|
16
|
+
it "finds relative paths, stripping index.html and .html" do
|
17
|
+
assert build_page('example/index.html').links.map(&:to_s).include?('relative-example')
|
18
|
+
end
|
19
|
+
|
20
|
+
it "finds and preserves external URLs" do
|
21
|
+
assert build_page('example/index.html').links.map(&:to_s).include?('https://www.example.net/external-example.html')
|
22
|
+
end
|
23
|
+
|
24
|
+
it "preserves params and anchors on internal links" do
|
25
|
+
page = build_page('example/relative-example.html')
|
26
|
+
assert page.links.map(&:to_s).include?('/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1')
|
27
|
+
end
|
28
|
+
|
29
|
+
it "includes links with empty href" do
|
30
|
+
assert build_page('example/relative-example.html').links.map(&:to_s).include?('')
|
31
|
+
end
|
32
|
+
|
33
|
+
it "excludes links with no href" do
|
34
|
+
refute build_page('example/relative-example.html').links.include?(nil)
|
35
|
+
end
|
36
|
+
|
37
|
+
it "excludes links with an href containing only an anchor reference" do
|
38
|
+
refute build_page('example/relative-example.html').links.map(&:to_s).include?('#section-2')
|
39
|
+
end
|
40
|
+
|
41
|
+
it "excludes mailto links" do
|
42
|
+
refute build_page('example/relative-example.html').links.map(&:to_s).include?('mailto:test@example.com')
|
43
|
+
end
|
44
|
+
|
45
|
+
def build_page(path)
|
46
|
+
site_mirror = FIXTURES_ROOT+'www.example.com'
|
47
|
+
BadLinkFinder::Page.new(site_mirror, path)
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require 'bad_link_finder/result_cache'
|
3
|
+
|
4
|
+
describe BadLinkFinder::ResultCache do
|
5
|
+
|
6
|
+
before do
|
7
|
+
@cache = BadLinkFinder::ResultCache.new
|
8
|
+
end
|
9
|
+
|
10
|
+
it "returns a cache hit for URLs which differ only by anchor" do
|
11
|
+
@cache.store('http://www.example.com#test123', 'value')
|
12
|
+
assert_equal 'value', @cache.fetch('http://www.example.com#test567')
|
13
|
+
|
14
|
+
@cache.store('http://www.example.com?test=true#test123', 'value')
|
15
|
+
assert_equal 'value', @cache.fetch('http://www.example.com?test=true#test567')
|
16
|
+
|
17
|
+
@cache.store('http://www.example.com?test=true#test123', 'value')
|
18
|
+
refute_equal 'value', @cache.fetch('http://www.example.com?test=false#test567')
|
19
|
+
end
|
20
|
+
|
21
|
+
describe "#store" do
|
22
|
+
it "returns the item stored" do
|
23
|
+
assert_equal 'value', @cache.store('key', 'value')
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
describe "#fetch" do
|
28
|
+
it "returns fetched items on a hit" do
|
29
|
+
@cache.store('key', 'value')
|
30
|
+
assert_equal 'value', @cache.fetch('key')
|
31
|
+
end
|
32
|
+
|
33
|
+
it "returns nil on a miss" do
|
34
|
+
assert_nil @cache.fetch('missing-key')
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require 'bad_link_finder/site'
|
3
|
+
|
4
|
+
describe BadLinkFinder::Site do
|
5
|
+
|
6
|
+
before do
|
7
|
+
@site_mirror = FIXTURES_ROOT+'www.example.com'
|
8
|
+
end
|
9
|
+
|
10
|
+
describe '#each' do
|
11
|
+
it "loads all files from a directory and passes on the host" do
|
12
|
+
site_map = [
|
13
|
+
'',
|
14
|
+
'example/',
|
15
|
+
'example/relative-example'
|
16
|
+
]
|
17
|
+
|
18
|
+
assert_same_elements site_map, BadLinkFinder::Site.new(@site_mirror).map { |page| page.path.to_s }
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
metadata
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bad_link_finder
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Elliot Crosby-McCullough
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-11-25 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: mechanize
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.6'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.6'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: minitest
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: webmock
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ! '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: bundler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.3'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ~>
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.3'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: gem_publisher
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 1.3.0
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 1.3.0
|
97
|
+
description: Crawls a static site mirror testing all links. Reports links which don't
|
98
|
+
return 200 or redirect to a 200.
|
99
|
+
email:
|
100
|
+
- elliot.cm@gmail.com
|
101
|
+
executables:
|
102
|
+
- bad_link_finder
|
103
|
+
extensions: []
|
104
|
+
extra_rdoc_files: []
|
105
|
+
files:
|
106
|
+
- bin/bad_link_finder
|
107
|
+
- lib/bad_link_finder/csv_builder.rb
|
108
|
+
- lib/bad_link_finder/link.rb
|
109
|
+
- lib/bad_link_finder/page.rb
|
110
|
+
- lib/bad_link_finder/page_checker.rb
|
111
|
+
- lib/bad_link_finder/result_cache.rb
|
112
|
+
- lib/bad_link_finder/site.rb
|
113
|
+
- lib/bad_link_finder/site_checker.rb
|
114
|
+
- lib/bad_link_finder/version.rb
|
115
|
+
- lib/bad_link_finder.rb
|
116
|
+
- README.md
|
117
|
+
- LICENCE.txt
|
118
|
+
- test/fixtures/www.example.com/example/index.html
|
119
|
+
- test/fixtures/www.example.com/example/relative-example.html
|
120
|
+
- test/fixtures/www.example.com/index.html
|
121
|
+
- test/integration/bad_link_finder_test.rb
|
122
|
+
- test/support/matchers.rb
|
123
|
+
- test/test_helper.rb
|
124
|
+
- test/unit/csv_builder_test.rb
|
125
|
+
- test/unit/link_test.rb
|
126
|
+
- test/unit/page_checker_test.rb
|
127
|
+
- test/unit/page_test.rb
|
128
|
+
- test/unit/result_cache_test.rb
|
129
|
+
- test/unit/site_test.rb
|
130
|
+
homepage: http://github.com/alphagov/bad_link_finder
|
131
|
+
licenses:
|
132
|
+
- MIT
|
133
|
+
metadata: {}
|
134
|
+
post_install_message:
|
135
|
+
rdoc_options: []
|
136
|
+
require_paths:
|
137
|
+
- lib
|
138
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
139
|
+
requirements:
|
140
|
+
- - ! '>='
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: '0'
|
143
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
144
|
+
requirements:
|
145
|
+
- - ! '>='
|
146
|
+
- !ruby/object:Gem::Version
|
147
|
+
version: 2.1.11
|
148
|
+
requirements: []
|
149
|
+
rubyforge_project:
|
150
|
+
rubygems_version: 2.1.11
|
151
|
+
signing_key:
|
152
|
+
specification_version: 4
|
153
|
+
summary: Tests links in static site mirrors
|
154
|
+
test_files:
|
155
|
+
- test/fixtures/www.example.com/example/index.html
|
156
|
+
- test/fixtures/www.example.com/example/relative-example.html
|
157
|
+
- test/fixtures/www.example.com/index.html
|
158
|
+
- test/integration/bad_link_finder_test.rb
|
159
|
+
- test/support/matchers.rb
|
160
|
+
- test/test_helper.rb
|
161
|
+
- test/unit/csv_builder_test.rb
|
162
|
+
- test/unit/link_test.rb
|
163
|
+
- test/unit/page_checker_test.rb
|
164
|
+
- test/unit/page_test.rb
|
165
|
+
- test/unit/result_cache_test.rb
|
166
|
+
- test/unit/site_test.rb
|