bad_link_finder 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MjQxZjYzNjJjOWExYjY1MDE0Zjk4OGE0YjgwMWQ0YzUxZmIzOWZlMg==
4
+ M2ZkZmE5YjJiNTMxYThlYmFlYWE2NGM5ZWU5MTExYTA5ZWFiMzJjNw==
5
5
  data.tar.gz: !binary |-
6
- YjM2ODVlNWE1YmE4ZTllZWUzNDM2NWRhM2I2YjJkNmQyOWYzNjUxZg==
6
+ YTEwMTBlYWNmMmVlZjY1MTI2YjQ2OWU2ZjVhZTRkZDU1YmE3MWIxYw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- N2VhZjg2OWZmZGUzOWQ0YjhmMTEwZWQ1YjE2YjBiOTg5ZTJiMTA0M2ZjM2M3
10
- NWY4OTY4ZWMxNGQxMjQ1ZmE3ZTYxNTQyOGJmYjU4OTZkOGI0NmY4NzQzZjhh
11
- MDZlYmZiZDdhM2M3M2M0MzA2NWU2NDRlNDBjZTk0YWFiZWJiYzM=
9
+ M2Y0ZDhkOGRhOTU2ZmIyY2EwOTY4Njk3YTljOTFkMDZjNDZlZDE5YjU3NDI4
10
+ NmNjMDM1NWQ1NThkY2ZhMzdmNjRhODY3NGY2NGQ1NGI2MzEyNGFmOTM2ZGY1
11
+ ZGU5NTI4YWUzYWI3NGM1YzRhYWM2MTYxZjhlNWIyOTUzNzlhZGU=
12
12
  data.tar.gz: !binary |-
13
- ZDIzNjUwYmM0OGRiOTFiNWU0NTQ2MThiMGViOGE5NWJlYWFlMDJjMGI1ZDQ0
14
- YjIxZjNlMWY5OWYzMDI1YjEwYWIwYzI1ZWFmMjJkMGUyZWRhOTE4ZGZiMmNh
15
- MjA3MmNkNGIwMjE1NTFjNzVkMzQ1NzgzNzNhMTE3ZTZmZjdkMzQ=
13
+ MWQzMjY3NzMyMDBmMThlMWRmZGZkMGE4YjVmNDRjYTg0ZjA4NDQwNzYyY2Mx
14
+ ODM5NTFjNjU0MTQ1NzY1ZTM0YTg5OGE3MDAyMTJjYTM0OWU5YmQ5MzZkYTA5
15
+ ZjNlMzQ0ZDcxNDZjYmE0Y2U1YTU1ZmE2NTAyZGIzYmQ2MThjMzE=
@@ -2,21 +2,22 @@ require 'csv'
2
2
 
3
3
  module BadLinkFinder
4
4
  class CSVBuilder
5
- def initialize(bad_link_map)
6
- @bad_link_map = bad_link_map
5
+ def initialize(csv_output_file)
6
+ @csv = CSV.new(csv_output_file, encoding: 'UTF-8')
7
+
8
+ @csv << ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message']
7
9
  end
8
10
 
9
- def to_s
10
- @to_s ||= CSV.generate(encoding: 'UTF-8') do |csv|
11
- csv << ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message']
11
+ def <<(csv_data)
12
+ link = csv_data[:link]
12
13
 
13
- @bad_link_map.each do |page_info, bad_links|
14
- bad_links.each do |bad_link|
15
- exception_message = bad_link.exception.message if bad_link.exception
16
- csv << [page_info[:url], page_info[:id], bad_link.link, bad_link.error_message, exception_message]
17
- end
18
- end
19
- end
14
+ @csv << [
15
+ csv_data[:url],
16
+ csv_data[:id],
17
+ link.link,
18
+ link.error_message,
19
+ (link.exception.message if link.exception)
20
+ ]
20
21
  end
21
22
  end
22
23
  end
@@ -29,7 +29,7 @@ module BadLinkFinder
29
29
  rescue Mechanize::RobotsDisallowedError => exception
30
30
  record_error("This link is blocked by robots.txt or nofollow attributes", exception)
31
31
  rescue Mechanize::Error, Net::HTTP::Persistent::Error, Timeout::Error, Errno::EINVAL,
32
- Errno::ECONNRESET, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
32
+ Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
33
33
  Net::ProtocolError, OpenSSL::SSL::SSLError, SocketError => exception # Thanks Net::HTTP
34
34
  record_error("The server failed to serve this page properly", exception)
35
35
  end
@@ -11,12 +11,19 @@ module BadLinkFinder
11
11
 
12
12
  attr_reader :page_url
13
13
 
14
- def bad_links
15
- @bad_links ||= @page.links.map do |raw_link|
16
- link = @result_cache.fetch(raw_link) || @result_cache.store(raw_link, BadLinkFinder::Link.new(@page_url, raw_link))
14
+ def each_bad_link(&block)
15
+ if @bad_links
16
+ @bad_links.each(&block)
17
+ else
18
+ @bad_links = @page.links.map do |raw_link|
19
+ link = @result_cache.fetch(raw_link) || @result_cache.store(raw_link, BadLinkFinder::Link.new(@page_url, raw_link))
17
20
 
18
- link unless link.valid?
19
- end.compact
21
+ unless link.valid?
22
+ yield link
23
+ next link
24
+ end
25
+ end.compact
26
+ end
20
27
  end
21
28
  end
22
29
  end
@@ -4,31 +4,28 @@ require 'bad_link_finder/page_checker'
4
4
 
5
5
  module BadLinkFinder
6
6
  class SiteChecker
7
- def initialize(mirror_dir, host)
7
+ def initialize(mirror_dir, host, csv_builder)
8
8
  @mirror_dir = File.expand_path(mirror_dir)
9
9
  @host = host
10
+ @csv_builder = csv_builder
10
11
  @result_cache = BadLinkFinder::ResultCache.new
11
12
  end
12
13
 
13
14
  def run
14
- bad_link_map = {}
15
- BadLinkFinder::Site.new(@mirror_dir).map do |page|
15
+ BadLinkFinder::Site.new(@mirror_dir).each do |page|
16
16
  page_checker = BadLinkFinder::PageChecker.new(@host, page, @result_cache)
17
17
  puts "Checking page #{page.path} as #{page_checker.page_url}"
18
18
 
19
- bad_links = page_checker.bad_links
20
-
21
- if bad_links.any?
22
- page_info = {
19
+ page_checker.each_bad_link do |link|
20
+ @csv_builder << {
21
+ url: page_checker.page_url,
23
22
  id: page.id,
24
- url: page_checker.page_url
23
+ link: link
25
24
  }
26
-
27
- bad_link_map[page_info] = bad_links
28
25
  end
29
26
  end
30
27
 
31
- return bad_link_map
28
+ nil
32
29
  end
33
30
  end
34
31
  end
@@ -1,3 +1,3 @@
1
1
  module BadLinkFinder
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -10,14 +10,17 @@ module BadLinkFinder
10
10
 
11
11
  raise EnvironmentVariableError.new("MIRROR_DIR '#{ENV['MIRROR_DIR']}' does not exist") unless Dir.exist?(ENV['MIRROR_DIR'])
12
12
 
13
- bad_link_map = BadLinkFinder::SiteChecker.new(ENV['MIRROR_DIR'], ENV['SITE_HOST']).run
14
- csv_builder = CSVBuilder.new(bad_link_map)
15
-
16
13
  report_path = Pathname.new(ENV['REPORT_OUTPUT_FILE'])
17
14
  report_path.parent.mkpath
18
- report_path.open('w') do |file|
19
- file.write(csv_builder)
20
- end
15
+
16
+ csv_file = report_path.open('w')
17
+ csv_builder = BadLinkFinder::CSVBuilder.new(csv_file)
18
+
19
+ BadLinkFinder::SiteChecker.new(ENV['MIRROR_DIR'], ENV['SITE_HOST'], csv_builder).run
20
+
21
+ csv_file.close
22
+
23
+ nil
21
24
  end
22
25
 
23
26
  class EnvironmentVariableError < ArgumentError; end
@@ -1,44 +1,101 @@
1
1
  require 'test_helper'
2
2
  require 'bad_link_finder/csv_builder'
3
+
4
+ require 'tempfile'
3
5
  require 'ostruct'
4
6
  require 'csv'
5
7
 
6
8
  describe BadLinkFinder::CSVBuilder do
7
9
 
8
- it "flattens out the bad links map into a CSV structure" do
9
- bad_link_map = {
10
- {url: 'http://www.example.com/example/', id: 'some-article-id'} => [
11
- mock_link(link: 'https://www.example.net/external-example.html', error_message: "This link returned a 404", exception: TestException.new('404 not found')),
12
- mock_link(link: 'relative-example', error_message: "Nope")
13
- ],
14
- {url: 'http://www.example.com/example/relative-example'} => [
15
- mock_link(
10
+ before do
11
+ @report_output_file = Tempfile.new('csv')
12
+ @report_output_file.unlink
13
+ end
14
+
15
+ after do
16
+ @report_output_file.close
17
+ end
18
+
19
+ it "writes headers to the output file on creation" do
20
+ BadLinkFinder::CSVBuilder.new(@report_output_file)
21
+ @report_output_file.rewind
22
+ parsed_csv = CSV.parse(@report_output_file.read)
23
+
24
+ assert_equal ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message'], parsed_csv.shift
25
+ assert_empty parsed_csv
26
+ end
27
+
28
+ describe '#<<' do
29
+ it "writes a link to the CSV" do
30
+ csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
31
+
32
+ csv_builder << {
33
+ url: 'http://www.example.com/example/',
34
+ id: 'some-article-id',
35
+ link: mock_link(link: 'https://www.example.net/external-example.html', error_message: "This link returned a 404", exception: TestException.new('404 not found'))
36
+ }
37
+
38
+ @report_output_file.rewind
39
+ parsed_csv = CSV.parse(@report_output_file.read)
40
+
41
+ parsed_csv.shift # drop headers
42
+
43
+ assert_equal [
44
+ 'http://www.example.com/example/',
45
+ 'some-article-id',
46
+ 'https://www.example.net/external-example.html',
47
+ 'This link returned a 404',
48
+ '404 not found'
49
+ ], parsed_csv.shift
50
+ end
51
+
52
+ it "ignores missing exceptions" do
53
+ csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
54
+
55
+ csv_builder << {
56
+ url: 'http://www.example.com/example/',
57
+ id: 'some-article-id',
58
+ link: mock_link(link: 'relative-example', error_message: "Nope")
59
+ }
60
+
61
+ @report_output_file.rewind
62
+ parsed_csv = CSV.parse(@report_output_file.read)
63
+
64
+ parsed_csv.shift # drop headers
65
+
66
+ assert_equal [
67
+ 'http://www.example.com/example/',
68
+ 'some-article-id',
69
+ 'relative-example',
70
+ 'Nope',
71
+ nil
72
+ ], parsed_csv.shift
73
+ end
74
+
75
+ it "ignores missing ids" do
76
+ csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
77
+
78
+ csv_builder << {
79
+ url: 'http://www.example.com/example/relative-example',
80
+ link: mock_link(
16
81
  link: '/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1',
17
- error_message: "What even is this?",
82
+ error_message: 'What even is this?',
18
83
  exception: TestException.new('Test exception')
19
84
  )
20
- ]
21
- }
22
-
23
- csv_builder = BadLinkFinder::CSVBuilder.new(bad_link_map)
24
-
25
- parsed_csv = CSV.parse(csv_builder.to_s)
85
+ }
26
86
 
27
- headers = parsed_csv.shift
28
- assert_equal ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message'], headers
87
+ @report_output_file.rewind
88
+ parsed_csv = CSV.parse(@report_output_file.read)
29
89
 
30
- assert_equal bad_link_map.values.flatten.count, parsed_csv.count
90
+ parsed_csv.shift # drop headers
31
91
 
32
- bad_link_map.each do |page_info, links|
33
- links.each do |link|
34
- assert parsed_csv.include?([
35
- page_info[:url],
36
- page_info[:id],
37
- link.link,
38
- link.error_message,
39
- (link.exception.message if link.exception),
40
- ])
41
- end
92
+ assert_equal [
93
+ 'http://www.example.com/example/relative-example',
94
+ nil,
95
+ '/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1',
96
+ 'What even is this?',
97
+ 'Test exception'
98
+ ], parsed_csv.shift
42
99
  end
43
100
  end
44
101
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bad_link_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elliot Crosby-McCullough