bad_link_finder 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MjQxZjYzNjJjOWExYjY1MDE0Zjk4OGE0YjgwMWQ0YzUxZmIzOWZlMg==
4
+ M2ZkZmE5YjJiNTMxYThlYmFlYWE2NGM5ZWU5MTExYTA5ZWFiMzJjNw==
5
5
  data.tar.gz: !binary |-
6
- YjM2ODVlNWE1YmE4ZTllZWUzNDM2NWRhM2I2YjJkNmQyOWYzNjUxZg==
6
+ YTEwMTBlYWNmMmVlZjY1MTI2YjQ2OWU2ZjVhZTRkZDU1YmE3MWIxYw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- N2VhZjg2OWZmZGUzOWQ0YjhmMTEwZWQ1YjE2YjBiOTg5ZTJiMTA0M2ZjM2M3
10
- NWY4OTY4ZWMxNGQxMjQ1ZmE3ZTYxNTQyOGJmYjU4OTZkOGI0NmY4NzQzZjhh
11
- MDZlYmZiZDdhM2M3M2M0MzA2NWU2NDRlNDBjZTk0YWFiZWJiYzM=
9
+ M2Y0ZDhkOGRhOTU2ZmIyY2EwOTY4Njk3YTljOTFkMDZjNDZlZDE5YjU3NDI4
10
+ NmNjMDM1NWQ1NThkY2ZhMzdmNjRhODY3NGY2NGQ1NGI2MzEyNGFmOTM2ZGY1
11
+ ZGU5NTI4YWUzYWI3NGM1YzRhYWM2MTYxZjhlNWIyOTUzNzlhZGU=
12
12
  data.tar.gz: !binary |-
13
- ZDIzNjUwYmM0OGRiOTFiNWU0NTQ2MThiMGViOGE5NWJlYWFlMDJjMGI1ZDQ0
14
- YjIxZjNlMWY5OWYzMDI1YjEwYWIwYzI1ZWFmMjJkMGUyZWRhOTE4ZGZiMmNh
15
- MjA3MmNkNGIwMjE1NTFjNzVkMzQ1NzgzNzNhMTE3ZTZmZjdkMzQ=
13
+ MWQzMjY3NzMyMDBmMThlMWRmZGZkMGE4YjVmNDRjYTg0ZjA4NDQwNzYyY2Mx
14
+ ODM5NTFjNjU0MTQ1NzY1ZTM0YTg5OGE3MDAyMTJjYTM0OWU5YmQ5MzZkYTA5
15
+ ZjNlMzQ0ZDcxNDZjYmE0Y2U1YTU1ZmE2NTAyZGIzYmQ2MThjMzE=
@@ -2,21 +2,22 @@ require 'csv'
2
2
 
3
3
  module BadLinkFinder
4
4
  class CSVBuilder
5
- def initialize(bad_link_map)
6
- @bad_link_map = bad_link_map
5
+ def initialize(csv_output_file)
6
+ @csv = CSV.new(csv_output_file, encoding: 'UTF-8')
7
+
8
+ @csv << ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message']
7
9
  end
8
10
 
9
- def to_s
10
- @to_s ||= CSV.generate(encoding: 'UTF-8') do |csv|
11
- csv << ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message']
11
+ def <<(csv_data)
12
+ link = csv_data[:link]
12
13
 
13
- @bad_link_map.each do |page_info, bad_links|
14
- bad_links.each do |bad_link|
15
- exception_message = bad_link.exception.message if bad_link.exception
16
- csv << [page_info[:url], page_info[:id], bad_link.link, bad_link.error_message, exception_message]
17
- end
18
- end
19
- end
14
+ @csv << [
15
+ csv_data[:url],
16
+ csv_data[:id],
17
+ link.link,
18
+ link.error_message,
19
+ (link.exception.message if link.exception)
20
+ ]
20
21
  end
21
22
  end
22
23
  end
@@ -29,7 +29,7 @@ module BadLinkFinder
29
29
  rescue Mechanize::RobotsDisallowedError => exception
30
30
  record_error("This link is blocked by robots.txt or nofollow attributes", exception)
31
31
  rescue Mechanize::Error, Net::HTTP::Persistent::Error, Timeout::Error, Errno::EINVAL,
32
- Errno::ECONNRESET, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
32
+ Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
33
33
  Net::ProtocolError, OpenSSL::SSL::SSLError, SocketError => exception # Thanks Net::HTTP
34
34
  record_error("The server failed to serve this page properly", exception)
35
35
  end
@@ -11,12 +11,19 @@ module BadLinkFinder
11
11
 
12
12
  attr_reader :page_url
13
13
 
14
- def bad_links
15
- @bad_links ||= @page.links.map do |raw_link|
16
- link = @result_cache.fetch(raw_link) || @result_cache.store(raw_link, BadLinkFinder::Link.new(@page_url, raw_link))
14
+ def each_bad_link(&block)
15
+ if @bad_links
16
+ @bad_links.each(&block)
17
+ else
18
+ @bad_links = @page.links.map do |raw_link|
19
+ link = @result_cache.fetch(raw_link) || @result_cache.store(raw_link, BadLinkFinder::Link.new(@page_url, raw_link))
17
20
 
18
- link unless link.valid?
19
- end.compact
21
+ unless link.valid?
22
+ yield link
23
+ next link
24
+ end
25
+ end.compact
26
+ end
20
27
  end
21
28
  end
22
29
  end
@@ -4,31 +4,28 @@ require 'bad_link_finder/page_checker'
4
4
 
5
5
  module BadLinkFinder
6
6
  class SiteChecker
7
- def initialize(mirror_dir, host)
7
+ def initialize(mirror_dir, host, csv_builder)
8
8
  @mirror_dir = File.expand_path(mirror_dir)
9
9
  @host = host
10
+ @csv_builder = csv_builder
10
11
  @result_cache = BadLinkFinder::ResultCache.new
11
12
  end
12
13
 
13
14
  def run
14
- bad_link_map = {}
15
- BadLinkFinder::Site.new(@mirror_dir).map do |page|
15
+ BadLinkFinder::Site.new(@mirror_dir).each do |page|
16
16
  page_checker = BadLinkFinder::PageChecker.new(@host, page, @result_cache)
17
17
  puts "Checking page #{page.path} as #{page_checker.page_url}"
18
18
 
19
- bad_links = page_checker.bad_links
20
-
21
- if bad_links.any?
22
- page_info = {
19
+ page_checker.each_bad_link do |link|
20
+ @csv_builder << {
21
+ url: page_checker.page_url,
23
22
  id: page.id,
24
- url: page_checker.page_url
23
+ link: link
25
24
  }
26
-
27
- bad_link_map[page_info] = bad_links
28
25
  end
29
26
  end
30
27
 
31
- return bad_link_map
28
+ nil
32
29
  end
33
30
  end
34
31
  end
@@ -1,3 +1,3 @@
1
1
  module BadLinkFinder
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -10,14 +10,17 @@ module BadLinkFinder
10
10
 
11
11
  raise EnvironmentVariableError.new("MIRROR_DIR '#{ENV['MIRROR_DIR']}' does not exist") unless Dir.exist?(ENV['MIRROR_DIR'])
12
12
 
13
- bad_link_map = BadLinkFinder::SiteChecker.new(ENV['MIRROR_DIR'], ENV['SITE_HOST']).run
14
- csv_builder = CSVBuilder.new(bad_link_map)
15
-
16
13
  report_path = Pathname.new(ENV['REPORT_OUTPUT_FILE'])
17
14
  report_path.parent.mkpath
18
- report_path.open('w') do |file|
19
- file.write(csv_builder)
20
- end
15
+
16
+ csv_file = report_path.open('w')
17
+ csv_builder = BadLinkFinder::CSVBuilder.new(csv_file)
18
+
19
+ BadLinkFinder::SiteChecker.new(ENV['MIRROR_DIR'], ENV['SITE_HOST'], csv_builder).run
20
+
21
+ csv_file.close
22
+
23
+ nil
21
24
  end
22
25
 
23
26
  class EnvironmentVariableError < ArgumentError; end
@@ -1,44 +1,101 @@
1
1
  require 'test_helper'
2
2
  require 'bad_link_finder/csv_builder'
3
+
4
+ require 'tempfile'
3
5
  require 'ostruct'
4
6
  require 'csv'
5
7
 
6
8
  describe BadLinkFinder::CSVBuilder do
7
9
 
8
- it "flattens out the bad links map into a CSV structure" do
9
- bad_link_map = {
10
- {url: 'http://www.example.com/example/', id: 'some-article-id'} => [
11
- mock_link(link: 'https://www.example.net/external-example.html', error_message: "This link returned a 404", exception: TestException.new('404 not found')),
12
- mock_link(link: 'relative-example', error_message: "Nope")
13
- ],
14
- {url: 'http://www.example.com/example/relative-example'} => [
15
- mock_link(
10
+ before do
11
+ @report_output_file = Tempfile.new('csv')
12
+ @report_output_file.unlink
13
+ end
14
+
15
+ after do
16
+ @report_output_file.close
17
+ end
18
+
19
+ it "writes headers to the output file on creation" do
20
+ BadLinkFinder::CSVBuilder.new(@report_output_file)
21
+ @report_output_file.rewind
22
+ parsed_csv = CSV.parse(@report_output_file.read)
23
+
24
+ assert_equal ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message'], parsed_csv.shift
25
+ assert_empty parsed_csv
26
+ end
27
+
28
+ describe '#<<' do
29
+ it "writes a link to the CSV" do
30
+ csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
31
+
32
+ csv_builder << {
33
+ url: 'http://www.example.com/example/',
34
+ id: 'some-article-id',
35
+ link: mock_link(link: 'https://www.example.net/external-example.html', error_message: "This link returned a 404", exception: TestException.new('404 not found'))
36
+ }
37
+
38
+ @report_output_file.rewind
39
+ parsed_csv = CSV.parse(@report_output_file.read)
40
+
41
+ parsed_csv.shift # drop headers
42
+
43
+ assert_equal [
44
+ 'http://www.example.com/example/',
45
+ 'some-article-id',
46
+ 'https://www.example.net/external-example.html',
47
+ 'This link returned a 404',
48
+ '404 not found'
49
+ ], parsed_csv.shift
50
+ end
51
+
52
+ it "ignores missing exceptions" do
53
+ csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
54
+
55
+ csv_builder << {
56
+ url: 'http://www.example.com/example/',
57
+ id: 'some-article-id',
58
+ link: mock_link(link: 'relative-example', error_message: "Nope")
59
+ }
60
+
61
+ @report_output_file.rewind
62
+ parsed_csv = CSV.parse(@report_output_file.read)
63
+
64
+ parsed_csv.shift # drop headers
65
+
66
+ assert_equal [
67
+ 'http://www.example.com/example/',
68
+ 'some-article-id',
69
+ 'relative-example',
70
+ 'Nope',
71
+ nil
72
+ ], parsed_csv.shift
73
+ end
74
+
75
+ it "ignores missing ids" do
76
+ csv_builder = BadLinkFinder::CSVBuilder.new(@report_output_file)
77
+
78
+ csv_builder << {
79
+ url: 'http://www.example.com/example/relative-example',
80
+ link: mock_link(
16
81
  link: '/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1',
17
- error_message: "What even is this?",
82
+ error_message: 'What even is this?',
18
83
  exception: TestException.new('Test exception')
19
84
  )
20
- ]
21
- }
22
-
23
- csv_builder = BadLinkFinder::CSVBuilder.new(bad_link_map)
24
-
25
- parsed_csv = CSV.parse(csv_builder.to_s)
85
+ }
26
86
 
27
- headers = parsed_csv.shift
28
- assert_equal ['page_url', 'page_id', 'link', 'error_message', 'raw_error_message'], headers
87
+ @report_output_file.rewind
88
+ parsed_csv = CSV.parse(@report_output_file.read)
29
89
 
30
- assert_equal bad_link_map.values.flatten.count, parsed_csv.count
90
+ parsed_csv.shift # drop headers
31
91
 
32
- bad_link_map.each do |page_info, links|
33
- links.each do |link|
34
- assert parsed_csv.include?([
35
- page_info[:url],
36
- page_info[:id],
37
- link.link,
38
- link.error_message,
39
- (link.exception.message if link.exception),
40
- ])
41
- end
92
+ assert_equal [
93
+ 'http://www.example.com/example/relative-example',
94
+ nil,
95
+ '/example/?test=true&redirect=http://www.example.com/in-param-url/index.html#section-1',
96
+ 'What even is this?',
97
+ 'Test exception'
98
+ ], parsed_csv.shift
42
99
  end
43
100
  end
44
101
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bad_link_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elliot Crosby-McCullough