bad_link_finder 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/bad_link_finder/csv_builder.rb +13 -12
- data/lib/bad_link_finder/link.rb +1 -1
- data/lib/bad_link_finder/page_checker.rb +12 -5
- data/lib/bad_link_finder/site_checker.rb +8 -11
- data/lib/bad_link_finder/version.rb +1 -1
- data/lib/bad_link_finder.rb +9 -6
- data/test/unit/csv_builder_test.rb +85 -28
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
M2ZkZmE5YjJiNTMxYThlYmFlYWE2NGM5ZWU5MTExYTA5ZWFiMzJjNw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
YTEwMTBlYWNmMmVlZjY1MTI2YjQ2OWU2ZjVhZTRkZDU1YmE3MWIxYw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
M2Y0ZDhkOGRhOTU2ZmIyY2EwOTY4Njk3YTljOTFkMDZjNDZlZDE5YjU3NDI4
|
10
|
+
NmNjMDM1NWQ1NThkY2ZhMzdmNjRhODY3NGY2NGQ1NGI2MzEyNGFmOTM2ZGY1
|
11
|
+
ZGU5NTI4YWUzYWI3NGM1YzRhYWM2MTYxZjhlNWIyOTUzNzlhZGU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MWQzMjY3NzMyMDBmMThlMWRmZGZkMGE4YjVmNDRjYTg0ZjA4NDQwNzYyY2Mx
|
14
|
+
ODM5NTFjNjU0MTQ1NzY1ZTM0YTg5OGE3MDAyMTJjYTM0OWU5YmQ5MzZkYTA5
|
15
|
+
ZjNlMzQ0ZDcxNDZjYmE0Y2U1YTU1ZmE2NTAyZGIzYmQ2MThjMzE=
|
data/lib/bad_link_finder/csv_builder.rb
CHANGED
@@ -2,21 +2,22 @@ require 'csv'
|
|
2
2
|
|
3
3
|
module BadLinkFinder
|
4
4
|
class CSVBuilder
|
5
|
-
def initialize(
|
6
|
-
@
|
5
|
+
def initialize(csv_output_file)
|
6
|
+
@csv = CSV.new(csv_output_file, encoding: 'UTF-8')
|
7
|
+
|
8
|
+
@csv << ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message']
|
7
9
|
end
|
8
10
|
|
9
|
-
def
|
10
|
-
|
11
|
-
csv << ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message']
|
11
|
+
def <<(csv_data)
|
12
|
+
link = csv_data[:link]
|
12
13
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
14
|
+
@csv << [
|
15
|
+
csv_data[:url],
|
16
|
+
csv_data[:id],
|
17
|
+
link.link,
|
18
|
+
link.error_message,
|
19
|
+
(link.exception.message if link.exception)
|
20
|
+
]
|
20
21
|
end
|
21
22
|
end
|
22
23
|
end
|
data/lib/bad_link_finder/link.rb
CHANGED
@@ -29,7 +29,7 @@ module BadLinkFinder
|
|
29
29
|
rescue Mechanize::RobotsDisallowedError => exception
|
30
30
|
record_error("This link is blocked by robots.txt or nofollow attributes", exception)
|
31
31
|
rescue Mechanize::Error, Net::HTTP::Persistent::Error, Timeout::Error, Errno::EINVAL,
|
32
|
-
Errno::ECONNRESET, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
32
|
+
Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
33
33
|
Net::ProtocolError, OpenSSL::SSL::SSLError, SocketError => exception # Thanks Net::HTTP
|
34
34
|
record_error("The server failed to serve this page properly", exception)
|
35
35
|
end
|
data/lib/bad_link_finder/page_checker.rb
CHANGED
@@ -11,12 +11,19 @@ module BadLinkFinder
|
|
11
11
|
|
12
12
|
attr_reader :page_url
|
13
13
|
|
14
|
-
def
|
15
|
-
@bad_links
|
16
|
-
|
14
|
+
def each_bad_link(&block)
|
15
|
+
if @bad_links
|
16
|
+
@bad_links.each(&block)
|
17
|
+
else
|
18
|
+
@bad_links = @page.links.map do |raw_link|
|
19
|
+
link = @result_cache.fetch(raw_link) || @result_cache.store(raw_link, BadLinkFinder::Link.new(@page_url, raw_link))
|
17
20
|
|
18
|
-
|
19
|
-
|
21
|
+
unless link.valid?
|
22
|
+
yield link
|
23
|
+
next link
|
24
|
+
end
|
25
|
+
end.compact
|
26
|
+
end
|
20
27
|
end
|
21
28
|
end
|
22
29
|
end
|
data/lib/bad_link_finder/site_checker.rb
CHANGED
@@ -4,31 +4,28 @@ require 'bad_link_finder/page_checker'
|
|
4
4
|
|
5
5
|
module BadLinkFinder
|
6
6
|
class SiteChecker
|
7
|
-
def initialize(mirror_dir, host)
|
7
|
+
def initialize(mirror_dir, host, csv_builder)
|
8
8
|
@mirror_dir = File.expand_path(mirror_dir)
|
9
9
|
@host = host
|
10
|
+
@csv_builder = csv_builder
|
10
11
|
@result_cache = BadLinkFinder::ResultCache.new
|
11
12
|
end
|
12
13
|
|
13
14
|
def run
|
14
|
-
|
15
|
-
BadLinkFinder::Site.new(@mirror_dir).map do |page|
|
15
|
+
BadLinkFinder::Site.new(@mirror_dir).each do |page|
|
16
16
|
page_checker = BadLinkFinder::PageChecker.new(@host, page, @result_cache)
|
17
17
|
puts "Checking page #{page.path} as #{page_checker.page_url}"
|
18
18
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
page_info = {
|
19
|
+
page_checker.each_bad_link do |link|
|
20
|
+
@csv_builder << {
|
21
|
+
url: page_checker.page_url,
|
23
22
|
id: page.id,
|
24
|
-
|
23
|
+
link: link
|
25
24
|
}
|
26
|
-
|
27
|
-
bad_link_map[page_info] = bad_links
|
28
25
|
end
|
29
26
|
end
|
30
27
|
|
31
|
-
|
28
|
+
nil
|
32
29
|
end
|
33
30
|
end
|
34
31
|
end
|
data/lib/bad_link_finder.rb
CHANGED
@@ -10,14 +10,17 @@ module BadLinkFinder
|
|
10
10
|
|
11
11
|
raise EnvironmentVariableError.new("MIRROR_DIR '#{ENV['MIRROR_DIR']}' does not exist") unless Dir.exist?(ENV['MIRROR_DIR'])
|
12
12
|
|
13
|
-
bad_link_map = BadLinkFinder::SiteChecker.new(ENV['MIRROR_DIR'], ENV['SITE_HOST']).run
|
14
|
-
csv_builder = CSVBuilder.new(bad_link_map)
|
15
|
-
|
16
13
|
report_path = Pathname.new(ENV['REPORT_OUTPUT_FILE'])
|
17
14
|
report_path.parent.mkpath
|
18
|
-
|
19
|
-
|
20
|
-
|
15
|
+
|
16
|
+
csv_file = report_path.open('w')
|
17
|
+
csv_builder = BadLinkFinder::CSVBuilder.new(csv_file)
|
18
|
+
|
19
|
+
BadLinkFinder::SiteChecker.new(ENV['MIRROR_DIR'], ENV['SITE_HOST'], csv_builder).run
|
20
|
+
|
21
|
+
csv_file.close
|
22
|
+
|
23
|
+
nil
|
21
24
|
end
|
22
25
|
|
23
26
|
class EnvironmentVariableError < ArgumentError; end
|
data/test/unit/csv_builder_test.rb
CHANGED
@@ -1,44 +1,101 @@
|
|
1
1
|
require 'test_helper'
|
2
2
|
require 'bad_link_finder/csv_builder'
|
3
|
+
|
4
|
+
require 'tempfile'
|
3
5
|
require 'ostruct'
|
4
6
|
require 'csv'
|
5
7
|
|
6
8
|
describe BadLinkFinder::CSVBuilder do
|
7
9
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
10
|
+
before do
|
11
|
+
@report_output_file = Tempfile.new('csv')
|
12
|
+
@report_output_file.unlink
|
13
|
+
end
|
14
|
+
|
15
|
+
after do
|
16
|
+
@report_output_file.close
|
17
|
+
end
|
18
|
+
|
19
|
+
it "writes headers to the output file on creation" do
|
20
|
+
BadLinkFinder::CSVBuilder.new(@report_output_file)
|
21
|
+
@report_output_file.rewind
|
22
|
+
parsed_csv = CSV.parse(@report_output_file.read)
|
23
|
+
|
24
|
+
assert_equal ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message'], parsed_csv.shift
|
25
|
+
assert_empty parsed_csv
|
26
|
+
end
|
27
|
+
|
28
|
+
describe '#<<' do
|
29
|
+
it "writes a link to the CSV" do
|
30
|
+
csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
|
31
|
+
|
32
|
+
csv_builder << {
|
33
|
+
url: 'http://www.example.com/example/',
|
34
|
+
id: 'some-article-id',
|
35
|
+
link: mock_link(link: 'https://www.example.net/external-example.html', error_message: "This link returned a 404", exception: TestException.new('404 not found'))
|
36
|
+
}
|
37
|
+
|
38
|
+
@report_output_file.rewind
|
39
|
+
parsed_csv = CSV.parse(@report_output_file.read)
|
40
|
+
|
41
|
+
parsed_csv.shift # drop headers
|
42
|
+
|
43
|
+
assert_equal [
|
44
|
+
'http://www.example.com/example/',
|
45
|
+
'some-article-id',
|
46
|
+
'https://www.example.net/external-example.html',
|
47
|
+
'This link returned a 404',
|
48
|
+
'404 not found'
|
49
|
+
], parsed_csv.shift
|
50
|
+
end
|
51
|
+
|
52
|
+
it "ignores missing exceptions" do
|
53
|
+
csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
|
54
|
+
|
55
|
+
csv_builder << {
|
56
|
+
url: 'http://www.example.com/example/',
|
57
|
+
id: 'some-article-id',
|
58
|
+
link: mock_link(link: 'relative-example', error_message: "Nope")
|
59
|
+
}
|
60
|
+
|
61
|
+
@report_output_file.rewind
|
62
|
+
parsed_csv = CSV.parse(@report_output_file.read)
|
63
|
+
|
64
|
+
parsed_csv.shift # drop headers
|
65
|
+
|
66
|
+
assert_equal [
|
67
|
+
'http://www.example.com/example/',
|
68
|
+
'some-article-id',
|
69
|
+
'relative-example',
|
70
|
+
'Nope',
|
71
|
+
nil
|
72
|
+
], parsed_csv.shift
|
73
|
+
end
|
74
|
+
|
75
|
+
it "ignores missing ids" do
|
76
|
+
csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
|
77
|
+
|
78
|
+
csv_builder << {
|
79
|
+
url: 'http://www.example.com/example/relative-example',
|
80
|
+
link: mock_link(
|
16
81
|
link: '/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1',
|
17
|
-
error_message:
|
82
|
+
error_message: 'What even is this?',
|
18
83
|
exception: TestException.new('Test exception')
|
19
84
|
)
|
20
|
-
|
21
|
-
}
|
22
|
-
|
23
|
-
csv_builder = BadLinkFinder::CSVBuilder.new(bad_link_map)
|
24
|
-
|
25
|
-
parsed_csv = CSV.parse(csv_builder.to_s)
|
85
|
+
}
|
26
86
|
|
27
|
-
|
28
|
-
|
87
|
+
@report_output_file.rewind
|
88
|
+
parsed_csv = CSV.parse(@report_output_file.read)
|
29
89
|
|
30
|
-
|
90
|
+
parsed_csv.shift # drop headers
|
31
91
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
(link.exception.message if link.exception),
|
40
|
-
])
|
41
|
-
end
|
92
|
+
assert_equal [
|
93
|
+
'http://www.example.com/example/relative-example',
|
94
|
+
nil,
|
95
|
+
'/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1',
|
96
|
+
'What even is this?',
|
97
|
+
'Test exception'
|
98
|
+
], parsed_csv.shift
|
42
99
|
end
|
43
100
|
end
|
44
101
|
|