bad_link_finder 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/bad_link_finder/csv_builder.rb +13 -12
- data/lib/bad_link_finder/link.rb +1 -1
- data/lib/bad_link_finder/page_checker.rb +12 -5
- data/lib/bad_link_finder/site_checker.rb +8 -11
- data/lib/bad_link_finder/version.rb +1 -1
- data/lib/bad_link_finder.rb +9 -6
- data/test/unit/csv_builder_test.rb +85 -28
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
M2ZkZmE5YjJiNTMxYThlYmFlYWE2NGM5ZWU5MTExYTA5ZWFiMzJjNw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
YTEwMTBlYWNmMmVlZjY1MTI2YjQ2OWU2ZjVhZTRkZDU1YmE3MWIxYw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
M2Y0ZDhkOGRhOTU2ZmIyY2EwOTY4Njk3YTljOTFkMDZjNDZlZDE5YjU3NDI4
|
10
|
+
NmNjMDM1NWQ1NThkY2ZhMzdmNjRhODY3NGY2NGQ1NGI2MzEyNGFmOTM2ZGY1
|
11
|
+
ZGU5NTI4YWUzYWI3NGM1YzRhYWM2MTYxZjhlNWIyOTUzNzlhZGU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MWQzMjY3NzMyMDBmMThlMWRmZGZkMGE4YjVmNDRjYTg0ZjA4NDQwNzYyY2Mx
|
14
|
+
ODM5NTFjNjU0MTQ1NzY1ZTM0YTg5OGE3MDAyMTJjYTM0OWU5YmQ5MzZkYTA5
|
15
|
+
ZjNlMzQ0ZDcxNDZjYmE0Y2U1YTU1ZmE2NTAyZGIzYmQ2MThjMzE=
|
@@ -2,21 +2,22 @@ require 'csv'
|
|
2
2
|
|
3
3
|
module BadLinkFinder
|
4
4
|
class CSVBuilder
|
5
|
-
def initialize(
|
6
|
-
@
|
5
|
+
def initialize(csv_output_file)
|
6
|
+
@csv = CSV.new(csv_output_file, encoding: 'UTF-8')
|
7
|
+
|
8
|
+
@csv << ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message']
|
7
9
|
end
|
8
10
|
|
9
|
-
def
|
10
|
-
|
11
|
-
csv << ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message']
|
11
|
+
def <<(csv_data)
|
12
|
+
link = csv_data[:link]
|
12
13
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
14
|
+
@csv << [
|
15
|
+
csv_data[:url],
|
16
|
+
csv_data[:id],
|
17
|
+
link.link,
|
18
|
+
link.error_message,
|
19
|
+
(link.exception.message if link.exception)
|
20
|
+
]
|
20
21
|
end
|
21
22
|
end
|
22
23
|
end
|
data/lib/bad_link_finder/link.rb
CHANGED
@@ -29,7 +29,7 @@ module BadLinkFinder
|
|
29
29
|
rescue Mechanize::RobotsDisallowedError => exception
|
30
30
|
record_error("This link is blocked by robots.txt or nofollow attributes", exception)
|
31
31
|
rescue Mechanize::Error, Net::HTTP::Persistent::Error, Timeout::Error, Errno::EINVAL,
|
32
|
-
Errno::ECONNRESET, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
32
|
+
Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
33
33
|
Net::ProtocolError, OpenSSL::SSL::SSLError, SocketError => exception # Thanks Net::HTTP
|
34
34
|
record_error("The server failed to serve this page properly", exception)
|
35
35
|
end
|
@@ -11,12 +11,19 @@ module BadLinkFinder
|
|
11
11
|
|
12
12
|
attr_reader :page_url
|
13
13
|
|
14
|
-
def
|
15
|
-
@bad_links
|
16
|
-
|
14
|
+
def each_bad_link(&block)
|
15
|
+
if @bad_links
|
16
|
+
@bad_links.each(&block)
|
17
|
+
else
|
18
|
+
@bad_links = @page.links.map do |raw_link|
|
19
|
+
link = @result_cache.fetch(raw_link) || @result_cache.store(raw_link, BadLinkFinder::Link.new(@page_url, raw_link))
|
17
20
|
|
18
|
-
|
19
|
-
|
21
|
+
unless link.valid?
|
22
|
+
yield link
|
23
|
+
next link
|
24
|
+
end
|
25
|
+
end.compact
|
26
|
+
end
|
20
27
|
end
|
21
28
|
end
|
22
29
|
end
|
@@ -4,31 +4,28 @@ require 'bad_link_finder/page_checker'
|
|
4
4
|
|
5
5
|
module BadLinkFinder
|
6
6
|
class SiteChecker
|
7
|
-
def initialize(mirror_dir, host)
|
7
|
+
def initialize(mirror_dir, host, csv_builder)
|
8
8
|
@mirror_dir = File.expand_path(mirror_dir)
|
9
9
|
@host = host
|
10
|
+
@csv_builder = csv_builder
|
10
11
|
@result_cache = BadLinkFinder::ResultCache.new
|
11
12
|
end
|
12
13
|
|
13
14
|
def run
|
14
|
-
|
15
|
-
BadLinkFinder::Site.new(@mirror_dir).map do |page|
|
15
|
+
BadLinkFinder::Site.new(@mirror_dir).each do |page|
|
16
16
|
page_checker = BadLinkFinder::PageChecker.new(@host, page, @result_cache)
|
17
17
|
puts "Checking page #{page.path} as #{page_checker.page_url}"
|
18
18
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
page_info = {
|
19
|
+
page_checker.each_bad_link do |link|
|
20
|
+
@csv_builder << {
|
21
|
+
url: page_checker.page_url,
|
23
22
|
id: page.id,
|
24
|
-
|
23
|
+
link: link
|
25
24
|
}
|
26
|
-
|
27
|
-
bad_link_map[page_info] = bad_links
|
28
25
|
end
|
29
26
|
end
|
30
27
|
|
31
|
-
|
28
|
+
nil
|
32
29
|
end
|
33
30
|
end
|
34
31
|
end
|
data/lib/bad_link_finder.rb
CHANGED
@@ -10,14 +10,17 @@ module BadLinkFinder
|
|
10
10
|
|
11
11
|
raise EnvironmentVariableError.new("MIRROR_DIR '#{ENV['MIRROR_DIR']}' does not exist") unless Dir.exist?(ENV['MIRROR_DIR'])
|
12
12
|
|
13
|
-
bad_link_map = BadLinkFinder::SiteChecker.new(ENV['MIRROR_DIR'], ENV['SITE_HOST']).run
|
14
|
-
csv_builder = CSVBuilder.new(bad_link_map)
|
15
|
-
|
16
13
|
report_path = Pathname.new(ENV['REPORT_OUTPUT_FILE'])
|
17
14
|
report_path.parent.mkpath
|
18
|
-
|
19
|
-
|
20
|
-
|
15
|
+
|
16
|
+
csv_file = report_path.open('w')
|
17
|
+
csv_builder = BadLinkFinder::CSVBuilder.new(csv_file)
|
18
|
+
|
19
|
+
BadLinkFinder::SiteChecker.new(ENV['MIRROR_DIR'], ENV['SITE_HOST'], csv_builder).run
|
20
|
+
|
21
|
+
csv_file.close
|
22
|
+
|
23
|
+
nil
|
21
24
|
end
|
22
25
|
|
23
26
|
class EnvironmentVariableError < ArgumentError; end
|
@@ -1,44 +1,101 @@
|
|
1
1
|
require 'test_helper'
|
2
2
|
require 'bad_link_finder/csv_builder'
|
3
|
+
|
4
|
+
require 'tempfile'
|
3
5
|
require 'ostruct'
|
4
6
|
require 'csv'
|
5
7
|
|
6
8
|
describe BadLinkFinder::CSVBuilder do
|
7
9
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
10
|
+
before do
|
11
|
+
@report_output_file = Tempfile.new('csv')
|
12
|
+
@report_output_file.unlink
|
13
|
+
end
|
14
|
+
|
15
|
+
after do
|
16
|
+
@report_output_file.close
|
17
|
+
end
|
18
|
+
|
19
|
+
it "writes headers to the output file on creation" do
|
20
|
+
BadLinkFinder::CSVBuilder.new(@report_output_file)
|
21
|
+
@report_output_file.rewind
|
22
|
+
parsed_csv = CSV.parse(@report_output_file.read)
|
23
|
+
|
24
|
+
assert_equal ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message'], parsed_csv.shift
|
25
|
+
assert_empty parsed_csv
|
26
|
+
end
|
27
|
+
|
28
|
+
describe '#<<' do
|
29
|
+
it "writes a link to the CSV" do
|
30
|
+
csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
|
31
|
+
|
32
|
+
csv_builder << {
|
33
|
+
url: 'http://www.example.com/example/',
|
34
|
+
id: 'some-article-id',
|
35
|
+
link: mock_link(link: 'https://www.example.net/external-example.html', error_message: "This link returned a 404", exception: TestException.new('404 not found'))
|
36
|
+
}
|
37
|
+
|
38
|
+
@report_output_file.rewind
|
39
|
+
parsed_csv = CSV.parse(@report_output_file.read)
|
40
|
+
|
41
|
+
parsed_csv.shift # drop headers
|
42
|
+
|
43
|
+
assert_equal [
|
44
|
+
'http://www.example.com/example/',
|
45
|
+
'some-article-id',
|
46
|
+
'https://www.example.net/external-example.html',
|
47
|
+
'This link returned a 404',
|
48
|
+
'404 not found'
|
49
|
+
], parsed_csv.shift
|
50
|
+
end
|
51
|
+
|
52
|
+
it "ignores missing exceptions" do
|
53
|
+
csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
|
54
|
+
|
55
|
+
csv_builder << {
|
56
|
+
url: 'http://www.example.com/example/',
|
57
|
+
id: 'some-article-id',
|
58
|
+
link: mock_link(link: 'relative-example', error_message: "Nope")
|
59
|
+
}
|
60
|
+
|
61
|
+
@report_output_file.rewind
|
62
|
+
parsed_csv = CSV.parse(@report_output_file.read)
|
63
|
+
|
64
|
+
parsed_csv.shift # drop headers
|
65
|
+
|
66
|
+
assert_equal [
|
67
|
+
'http://www.example.com/example/',
|
68
|
+
'some-article-id',
|
69
|
+
'relative-example',
|
70
|
+
'Nope',
|
71
|
+
nil
|
72
|
+
], parsed_csv.shift
|
73
|
+
end
|
74
|
+
|
75
|
+
it "ignores missing ids" do
|
76
|
+
csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
|
77
|
+
|
78
|
+
csv_builder << {
|
79
|
+
url: 'http://www.example.com/example/relative-example',
|
80
|
+
link: mock_link(
|
16
81
|
link: '/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1',
|
17
|
-
error_message:
|
82
|
+
error_message: 'What even is this?',
|
18
83
|
exception: TestException.new('Test exception')
|
19
84
|
)
|
20
|
-
|
21
|
-
}
|
22
|
-
|
23
|
-
csv_builder = BadLinkFinder::CSVBuilder.new(bad_link_map)
|
24
|
-
|
25
|
-
parsed_csv = CSV.parse(csv_builder.to_s)
|
85
|
+
}
|
26
86
|
|
27
|
-
|
28
|
-
|
87
|
+
@report_output_file.rewind
|
88
|
+
parsed_csv = CSV.parse(@report_output_file.read)
|
29
89
|
|
30
|
-
|
90
|
+
parsed_csv.shift # drop headers
|
31
91
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
(link.exception.message if link.exception),
|
40
|
-
])
|
41
|
-
end
|
92
|
+
assert_equal [
|
93
|
+
'http://www.example.com/example/relative-example',
|
94
|
+
nil,
|
95
|
+
'/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1',
|
96
|
+
'What even is this?',
|
97
|
+
'Test exception'
|
98
|
+
], parsed_csv.shift
|
42
99
|
end
|
43
100
|
end
|
44
101
|
|