bad_link_finder 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/bad_link_finder/csv_builder.rb +3 -3
- data/lib/bad_link_finder/page.rb +14 -3
- data/lib/bad_link_finder/site_checker.rb +9 -1
- data/lib/bad_link_finder/version.rb +1 -1
- data/test/fixtures/www.example.com/example/relative-example.html +17 -7
- data/test/integration/bad_link_finder_test.rb +2 -1
- data/test/unit/csv_builder_test.rb +11 -5
- data/test/unit/page_test.rb +37 -27
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NDQ1NTEwMzllMDZmMTJiNWUxYWYwMTcyNGI4NTg1YWVjMDQ1ZDNlYg==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ZTA2ZWJkMGI3N2Q5ZTgyODc0ZDU3YjBmYWNjODc2ODkzMzBhMDFlYg==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YWUwODk5YTZjYTg4NWEyZjQ5MzdkMTJlOTIyZmMzYjA3NDcyNGFjYjBkMDdl
|
10
|
+
YTMyYWFlMjI3MWQ3NjkyOTliZDMzZTJkZmIxY2MwZjk3MzIyM2EwNzNjYzU0
|
11
|
+
ZGJhODYxYzk5MmY0MGFlMDIwNWRlNDg3YmI2ZWY0YzI5MWEzN2I=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NjZlNGE5YzgwY2NiMjA5YTNiNjc5NTcyZTE3ZjY5ZWFlNTU2M2EyYjgzZmEz
|
14
|
+
NTBiZmY3YzQwNTM5YmFjNWI4OTA1NzY3N2E5Mjk5OWU5ZTA3MjRhMjA1MDMz
|
15
|
+
ZjM3NGU3NzkyYjkxYmNkMWEzMGQzYzRlOTEyMDZhZDAyODk5ZjA=
|
data/lib/bad_link_finder/csv_builder.rb
CHANGED
@@ -8,12 +8,12 @@ module BadLinkFinder
|
|
8
8
|
|
9
9
|
def to_s
|
10
10
|
@to_s ||= CSV.generate(encoding: 'UTF-8') do |csv|
|
11
|
-
csv << ['page_url', 'link', 'error_message', 'raw_error_message']
|
11
|
+
csv << ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message']
|
12
12
|
|
13
|
-
@bad_link_map.each do |page_url, bad_links|
|
13
|
+
@bad_link_map.each do |page_info, bad_links|
|
14
14
|
bad_links.each do |bad_link|
|
15
15
|
exception_message = bad_link.exception.message if bad_link.exception
|
16
|
-
csv << [page_url, bad_link.link, bad_link.error_message, exception_message]
|
16
|
+
csv << [page_info[:url], page_info[:id], bad_link.link, bad_link.error_message, exception_message]
|
17
17
|
end
|
18
18
|
end
|
19
19
|
end
|
data/lib/bad_link_finder/page.rb
CHANGED
@@ -6,13 +6,24 @@ module BadLinkFinder
|
|
6
6
|
@path = strip_html_ending(path)
|
7
7
|
|
8
8
|
file = mirror_dir + path
|
9
|
-
doc = Nokogiri::HTML(file.read)
|
10
|
-
|
9
|
+
@doc = Nokogiri::HTML(file.read)
|
10
|
+
end
|
11
|
+
|
12
|
+
attr_reader :path
|
13
|
+
|
14
|
+
def links
|
15
|
+
@links ||= @doc.css('a').map do |a|
|
11
16
|
strip_html_ending(a['href']) unless ignore_link?(a['href'])
|
12
17
|
end.compact
|
13
18
|
end
|
14
19
|
|
15
|
-
|
20
|
+
def id
|
21
|
+
@id ||= begin
|
22
|
+
if (article = @doc.xpath('(//article[not(ancestor::article)])').first)
|
23
|
+
article['id']
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
16
27
|
|
17
28
|
protected
|
18
29
|
|
data/lib/bad_link_finder/site_checker.rb
CHANGED
@@ -17,7 +17,15 @@ module BadLinkFinder
|
|
17
17
|
puts "Checking page #{page.path} as #{page_checker.page_url}"
|
18
18
|
|
19
19
|
bad_links = page_checker.bad_links
|
20
|
-
|
20
|
+
|
21
|
+
if bad_links.any?
|
22
|
+
page_info = {
|
23
|
+
id: page.id,
|
24
|
+
url: page_checker.page_url
|
25
|
+
}
|
26
|
+
|
27
|
+
bad_link_map[page_info] = bad_links
|
28
|
+
end
|
21
29
|
end
|
22
30
|
|
23
31
|
return bad_link_map
|
data/test/fixtures/www.example.com/example/relative-example.html
CHANGED
@@ -2,13 +2,23 @@
|
|
2
2
|
<html>
|
3
3
|
<head><title>Example site</title></head>
|
4
4
|
<body>
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
<article id='correct-article-id'>
|
6
|
+
<!-- Included -->
|
7
|
+
<a href='/example/index.html?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1'></a>
|
8
|
+
<a href=''></a>
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
<article id='incorrect-sub-article-id'>
|
11
|
+
<p>Maecenas sed diam eget risus varius blandit sit amet non magna.</p>
|
12
|
+
</article>
|
13
|
+
|
14
|
+
<!-- Excluded -->
|
15
|
+
<a></a>
|
16
|
+
<a href='#section-2'></a>
|
17
|
+
<a href='mailto:test@example.com'></a>
|
18
|
+
</article>
|
19
|
+
|
20
|
+
<article id='incorrect-second-article-id'>
|
21
|
+
Praesent commodo cursus magna, vel scelerisque nisl consectetur et.
|
22
|
+
</article>
|
13
23
|
</body>
|
14
24
|
</html>
|
data/test/integration/bad_link_finder_test.rb
CHANGED
@@ -20,7 +20,8 @@ describe BadLinkFinder do
|
|
20
20
|
|
21
21
|
csv_string = File.read(ENV['REPORT_OUTPUT_FILE'])
|
22
22
|
|
23
|
-
assert_match 'http://www.example.com/example/', csv_string
|
23
|
+
assert_match 'http://www.example.com/example/relative-example', csv_string
|
24
|
+
assert_match 'correct-article-id', csv_string
|
24
25
|
end
|
25
26
|
|
26
27
|
it "complains if key variables are missing" do
|
data/test/unit/csv_builder_test.rb
CHANGED
@@ -7,11 +7,11 @@ describe BadLinkFinder::CSVBuilder do
|
|
7
7
|
|
8
8
|
it "flattens out the bad links map into a CSV structure" do
|
9
9
|
bad_link_map = {
|
10
|
-
'http://www.example.com/example/' => [
|
10
|
+
{url: 'http://www.example.com/example/', id: 'some-article-id'} => [
|
11
11
|
mock_link(link: 'https://www.example.net/external-example.html', error_message: "This link returned a 404", exception: TestException.new('404 not found')),
|
12
12
|
mock_link(link: 'relative-example', error_message: "Nope")
|
13
13
|
],
|
14
|
-
'http://www.example.com/example/relative-example' => [
|
14
|
+
{url: 'http://www.example.com/example/relative-example'} => [
|
15
15
|
mock_link(
|
16
16
|
link: '/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1',
|
17
17
|
error_message: "What even is this?",
|
@@ -25,13 +25,19 @@ describe BadLinkFinder::CSVBuilder do
|
|
25
25
|
parsed_csv = CSV.parse(csv_builder.to_s)
|
26
26
|
|
27
27
|
headers = parsed_csv.shift
|
28
|
-
assert_equal ['page_url', 'link', 'error_message', 'raw_error_message'], headers
|
28
|
+
assert_equal ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message'], headers
|
29
29
|
|
30
30
|
assert_equal bad_link_map.values.flatten.count, parsed_csv.count
|
31
31
|
|
32
|
-
bad_link_map.each do |page_url, links|
|
32
|
+
bad_link_map.each do |page_info, links|
|
33
33
|
links.each do |link|
|
34
|
-
assert parsed_csv.include?([page_url, link.link, link.error_message, (link.exception.message if link.exception)])
|
34
|
+
assert parsed_csv.include?([
|
35
|
+
page_info[:url],
|
36
|
+
page_info[:id],
|
37
|
+
link.link,
|
38
|
+
link.error_message,
|
39
|
+
(link.exception.message if link.exception),
|
40
|
+
])
|
35
41
|
end
|
36
42
|
end
|
37
43
|
end
|
data/test/unit/page_test.rb
CHANGED
@@ -3,43 +3,53 @@ require 'bad_link_finder/page'
|
|
3
3
|
|
4
4
|
describe BadLinkFinder::Page do
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
6
|
+
describe '#path' do
|
7
|
+
it "strips index.html and .html" do
|
8
|
+
assert_equal '', build_page('index.html').path.to_s
|
9
|
+
assert_equal 'example/', build_page('example/index.html').path.to_s
|
10
|
+
assert_equal 'example/relative-example', build_page('example/relative-example.html').path.to_s
|
11
|
+
end
|
10
12
|
end
|
11
13
|
|
12
|
-
|
13
|
-
|
14
|
-
|
14
|
+
describe '#links' do
|
15
|
+
it "finds absolute paths, stripping index.html and .html" do
|
16
|
+
assert_equal ['/example/'], build_page('index.html').links.map(&:to_s)
|
17
|
+
end
|
15
18
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
+
it "finds relative paths, stripping index.html and .html" do
|
20
|
+
assert build_page('example/index.html').links.map(&:to_s).include?('relative-example')
|
21
|
+
end
|
19
22
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
+
it "finds and preserves external URLs" do
|
24
|
+
assert build_page('example/index.html').links.map(&:to_s).include?('https://www.example.net/external-example.html')
|
25
|
+
end
|
23
26
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
27
|
+
it "preserves params and anchors on internal links" do
|
28
|
+
page = build_page('example/relative-example.html')
|
29
|
+
assert page.links.map(&:to_s).include?('/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1')
|
30
|
+
end
|
28
31
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
+
it "includes links with empty href" do
|
33
|
+
assert build_page('example/relative-example.html').links.map(&:to_s).include?('')
|
34
|
+
end
|
32
35
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
+
it "excludes links with no href" do
|
37
|
+
refute build_page('example/relative-example.html').links.include?(nil)
|
38
|
+
end
|
39
|
+
|
40
|
+
it "excludes links with an href containing only an anchor reference" do
|
41
|
+
refute build_page('example/relative-example.html').links.map(&:to_s).include?('#section-2')
|
42
|
+
end
|
36
43
|
|
37
|
-
|
38
|
-
|
44
|
+
it "excludes mailto links" do
|
45
|
+
refute build_page('example/relative-example.html').links.map(&:to_s).include?('mailto:test@example.com')
|
46
|
+
end
|
39
47
|
end
|
40
48
|
|
41
|
-
|
42
|
-
|
49
|
+
describe '#page_id' do
|
50
|
+
it "returns the id of the first topmost article" do
|
51
|
+
assert_equal 'correct-article-id', build_page('example/relative-example.html').id
|
52
|
+
end
|
43
53
|
end
|
44
54
|
|
45
55
|
def build_page(path)
|