broken_link_finder 0.9.3 → 0.11.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 0e03d27fce04a04ff97aa40ea895b01272926915367712a865787cda0efc3a0a
- data.tar.gz: 487abab06c5664fb4024e58c7e1ba43209d076100d5d7da275db2f534aee4c4b
+ metadata.gz: 42e88495f7e7742db433223408b4a380c1d48e98a5a43e6da5303d3e7b024454
+ data.tar.gz: eae7fc953f0d8aa1bb1f9d5b53183cd68a15a9f83ab341f51023744b2d148063
  SHA512:
- metadata.gz: 2b90b76061107fcde9b79e6b737710ee067be255182bdd7726ba2343aa64f0357b9901217f78cd0e3928244610cb9ad37007db8d316bbbef2a50355213d5764b
- data.tar.gz: 5d727113d883f94e263933c726ea0eb0a41da343844c044d8220e042b877ac84b5c806e2f2d5570dd0dd9eb09a56f4603676b9194bc62201a01cb9fba7b14e23
+ metadata.gz: 4496db994bfba83deeb14a1b870f43e2cfd2afa94f30b6596ee610f23103b55ae0d84a6443a3204b02ed8875c0daf0d8e9c565aaebd21173d5c4353509dac3c8
+ data.tar.gz: 2d70ee94d7128e6e212bc385e1045fd465c121f58b9a0d036d392ae1cbb5cd9ef5ea47e29eda85b6f17a0b0f5547902ca818967b3ffb4ad87c7d0b271da5323a
@@ -1 +1 @@
- 2.5.3
+ 2.7.0
@@ -9,6 +9,57 @@
  - ...
  ---

+ ## v0.11.1
+ ### Added
+ - ...
+ ### Changed/Removed
+ - Updated the wgit gem to version 0.9.0, which contains improvements and bug fixes.
+ ### Fixed
+ - ...
+ ---
+
+ ## v0.11.0
+ ### Added
+ - Additional crawl statistics.
+ - Exit code handling to the executable: `0` for success, `1` for an error scenario.
+ ### Changed/Removed
+ - Updated the report formats slightly, bringing various improvements such as the total number of links crawled.
+ ### Fixed
+ - Bug in the HTML report; the summary URL is now an `<a>` link.
+ - Bug in `Finder@broken_link_map` URLs and the `Finder#crawl_stats[:url]` URL during redirects.
+ - Bug causing an error when crawling unparsable/invalid URLs.
+ ---
+
+ ## v0.10.0
+ ### Added
+ - A `--html` flag to the `crawl` executable command which produces an HTML report (instead of text).
+ - A 'retry' mechanism for any broken links found. This is essentially a verification step before generating a report.
+ - `Finder#crawl_stats` for info such as crawl duration, total links crawled, etc.
+ ### Changed/Removed
+ - The API has changed somewhat. See the [docs](https://www.rubydoc.info/gems/broken_link_finder) for the up-to-date code signatures if you're using `broken_link_finder` outside of its executable.
+ ### Fixed
+ - ...
+ ---
+
+ ## v0.9.5
+ ### Added
+ - ...
+ ### Changed/Removed
+ - Now using optimistic dependency versioning.
+ - Updated `wgit` to version 0.5.1, containing improvements and bug fixes.
+ ### Fixed
+ - ...
+ ---
+
+ ## v0.9.4
+ ### Added
+ - ...
+ ### Changed/Removed
+ - Updated the `wgit` gem to version 0.5.0, which contains improvements and bug fixes.
+ ### Fixed
+ - ...
+ ---
+
  ## v0.9.3
  ### Added
  - ...
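
Taken together, the v0.10.0 and v0.11.x entries change how the gem is driven from Ruby. Below is a minimal sketch of the resulting API, based on the method signatures shown in the `finder.rb` diff further down; the target URL is illustrative only.

```ruby
require 'broken_link_finder'

# sort: and max_threads: are the two Finder options; both have defaults.
finder = BrokenLinkFinder.new(sort: :page, max_threads: 50)
finder.crawl_site('http://txti.es') # Or finder.crawl_page(url) for a single page.

# Text report to STDOUT by default; type: :html produces the HTML report added in v0.10.0.
finder.report(type: :text, broken_verbose: true, ignored_verbose: false)

# Crawl statistics (duration, total links crawled, etc.) were added in v0.10.0.
puts finder.crawl_stats.inspect
```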
@@ -1,53 +1,64 @@
  PATH
  remote: .
  specs:
- broken_link_finder (0.9.3)
- thor (~> 0.20.3)
- thread (~> 0.2.0)
- wgit (~> 0.4.1)
+ broken_link_finder (0.11.1)
+ thor (~> 0.20)
+ thread (~> 0.2)
+ wgit (~> 0.9)

  GEM
  remote: https://rubygems.org/
  specs:
- addressable (2.6.0)
- public_suffix (>= 2.0.2, < 4.0)
- bson (4.5.0)
- byebug (11.0.1)
- coderay (1.1.2)
+ addressable (2.7.0)
+ public_suffix (>= 2.0.2, < 5.0)
+ bson (4.10.0)
+ byebug (11.1.3)
+ cliver (0.3.2)
+ coderay (1.1.3)
+ concurrent-ruby (1.1.6)
  crack (0.4.3)
  safe_yaml (~> 1.0.0)
  ethon (0.12.0)
  ffi (>= 1.3.0)
- ffi (1.11.1)
- hashdiff (1.0.0)
- maxitest (3.4.0)
- minitest (>= 5.0.0, < 5.13.0)
- method_source (0.9.2)
+ ferrum (0.9)
+ addressable (~> 2.5)
+ cliver (~> 0.3)
+ concurrent-ruby (~> 1.1)
+ websocket-driver (>= 0.6, < 0.8)
+ ffi (1.13.1)
+ hashdiff (1.0.1)
+ maxitest (3.6.0)
+ minitest (>= 5.0.0, < 5.14.0)
+ method_source (1.0.0)
  mini_portile2 (2.4.0)
- minitest (5.12.2)
- mongo (2.9.2)
- bson (>= 4.4.2, < 5.0.0)
- nokogiri (1.10.4)
+ minitest (5.13.0)
+ mongo (2.13.0)
+ bson (>= 4.8.2, < 5.0.0)
+ nokogiri (1.10.10)
  mini_portile2 (~> 2.4.0)
- pry (0.12.2)
- coderay (~> 1.1.0)
- method_source (~> 0.9.0)
- public_suffix (3.1.0)
- rake (10.5.0)
+ pry (0.13.1)
+ coderay (~> 1.1)
+ method_source (~> 1.0)
+ public_suffix (4.0.5)
+ rake (13.0.1)
  safe_yaml (1.0.5)
  thor (0.20.3)
  thread (0.2.2)
- typhoeus (1.3.1)
+ typhoeus (1.4.0)
  ethon (>= 0.9.0)
- webmock (3.7.6)
+ webmock (3.8.3)
  addressable (>= 2.3.6)
  crack (>= 0.3.2)
  hashdiff (>= 0.4.0, < 2.0.0)
- wgit (0.4.1)
- addressable (~> 2.6.0)
- mongo (~> 2.9.0)
- nokogiri (~> 1.10.3)
- typhoeus (~> 1.3.1)
+ websocket-driver (0.7.3)
+ websocket-extensions (>= 0.1.0)
+ websocket-extensions (0.1.5)
+ wgit (0.9.0)
+ addressable (~> 2.6)
+ ferrum (~> 0.8)
+ mongo (~> 2.9)
+ nokogiri (~> 1.10)
+ typhoeus (~> 1.3)

  PLATFORMS
  ruby
@@ -58,11 +69,11 @@ DEPENDENCIES
  byebug (~> 11.0)
  maxitest (~> 3.3)
  pry (~> 0.12)
- rake (~> 10.0)
+ rake (~> 13.0)
  webmock (~> 3.6)

  RUBY VERSION
- ruby 2.5.3p105
+ ruby 2.7.0p0

  BUNDLED WITH
- 2.0.1
+ 2.1.4
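
The lock file reflects the move to optimistic dependency versioning noted in the v0.9.5 changelog entry above. A hypothetical consumer `Gemfile` entry pinning this release line would look like:

```ruby
# Hypothetical Gemfile entry; any 0.11.x release of the gem would satisfy it.
gem 'broken_link_finder', '~> 0.11'
```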
data/README.md CHANGED
@@ -1,8 +1,10 @@
  # Broken Link Finder

- Does what it says on the tin; Finds a website's broken links.
+ Does what it says on the tin - finds a website's broken links.

- Simply point it at a website and it will crawl all of its webpages searching for and identifing any broken links. You will then be presented with a nice concise summary of the broken links found.
+ Simply point it at a website and it will crawl all of its webpages, searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
+
+ Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!

  ## How It Works

@@ -10,7 +12,7 @@ Any HTML page element with a `href` or `src` attribute is considered a link. For
  - An empty HTML response body is returned.
  - A response status code of `404 Not Found` is returned.
- - The HTML response body doesn't contain an element ID matching that of the link's anchor e.g. `http://server.com#about` must contain an element with `id="about"` or the link is considered broken.
+ - The HTML response body doesn't contain an element ID matching that of the link's fragment e.g. `http://server.com#about` must contain an element with `id="about"` or the link is considered broken.
  - The link redirects more than 5 times consecutively.

  **Note**: Not all link types are supported.
@@ -55,7 +57,7 @@ Installing this gem installs the `broken_link_finder` executable into your `$PATH
      $ broken_link_finder crawl http://txti.es

- Adding the `-r` flag would crawl the entire `txti.es` site, not just its index page.
+ Adding the `--recursive` flag would crawl the entire `txti.es` site, not just its index page.

  See the [output](#Output) section below for an example of a site with broken links.

@@ -73,9 +75,9 @@ Below is a simple script which crawls a website and outputs its broken links to
  require 'broken_link_finder'

  finder = BrokenLinkFinder.new
- finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
- finder.pretty_print_link_report # Or use Finder#broken_links and Finder#ignored_links
- # for direct access to the link Hashes.
+ finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
+ finder.report # Or use Finder#broken_links and Finder#ignored_links
+ # for direct access to the link Hashes.
  ```

  Then execute the script with:
@@ -89,28 +91,33 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
  If broken links are found then the output will look something like:

  ```text
- Found 6 broken link(s) across 2 page(s):
+ Crawled http://txti.es
+ 7 page(s) containing 32 unique link(s) in 6.82 seconds
+
+ Found 6 unique broken link(s) across 2 page(s):

  The following broken links were found on 'http://txti.es/about':
  http://twitter.com/thebarrytone
+ /doesntexist
  http://twitter.com/nwbld
- http://twitter.com/txties
- https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU
+ twitter.com/txties

  The following broken links were found on 'http://txti.es/how':
  http://en.wikipedia.org/wiki/Markdown
  http://imgur.com

- Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+ Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:

- The following links were ignored on http://txti.es:
+ The following links were ignored on 'http://txti.es':
  tel:+13174562564
  mailto:big.jim@jmail.com

- The following links were ignored on http://txti.es/contact:
+ The following links were ignored on 'http://txti.es/contact':
  ftp://server.com
  ```

+ You can provide the `--html` flag if you'd prefer an HTML-based report.
+
  ## Contributing

  Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
@@ -126,9 +133,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
  To install this gem onto your local machine, run `bundle exec rake install`.

  To release a new gem version:
- - Update the version number in `version.rb` and add the new version to the `CHANGELOG`
- - Run `bundle install`
- - Run `bundle exec rake test` ensuring all tests pass
- - Run `bundle exec rake compile` ensuring no warnings
- - Run `bundle exec rake install && rbenv rehash` and manually test the executable
- - Run `bundle exec rake release[origin]`
+ - Update the deps in the `*.gemspec`, if necessary.
+ - Update the version number in `version.rb` and add the new version to the `CHANGELOG`.
+ - Run `bundle install`.
+ - Run `bundle exec rake test` ensuring all tests pass.
+ - Run `bundle exec rake compile` ensuring no warnings.
+ - Run `bundle exec rake install && rbenv rehash`.
+ - Manually test the executable.
+ - Run `bundle exec rake release[origin]`.
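
The README now documents both the `--recursive` and `--html` flags. A small, hypothetical wrapper script showing them together follows; the site URL and output filename are illustrative, and since the report is written to STDOUT it is redirected to a file here.

```ruby
# Hypothetical wrapper around the executable; assumes it is installed on $PATH.
site   = 'http://txti.es'
output = 'report.html'

# --recursive crawls the whole site; --html switches the report format.
system("broken_link_finder crawl #{site} --recursive --html > #{output}")
```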
@@ -10,15 +10,19 @@ finder = BrokenLinkFinder::Finder.new
  puts Benchmark.measure { finder.crawl_site url }
  puts "Links crawled: #{finder.total_links_crawled}"

- # http://txti.es page crawl
- # Pre threading: 17.5 seconds
- # Post threading: 7.5 seconds
+ # http://txti.es page crawl with threading
+ # Pre: 17.5 seconds
+ # Post: 7.5 seconds

- # http://txti.es post threading - page vs site crawl
+ # http://txti.es with threading - page vs site crawl
  # Page: 9.526981
  # Site: 9.732416
  # Multi-threading crawl_site now yields the same time as a single page

- # Large site crawl - post all link recording functionality
+ # Large site crawl - all link recording functionality
  # Pre: 608 seconds with 7665 links crawled
  # Post: 355 seconds with 1099 links crawled
+
+ # Large site crawl - retry mechanism
+ # Pre: 140 seconds
+ # Post: 170 seconds
@@ -5,20 +5,10 @@ require 'bundler/setup'
  require 'pry'
  require 'byebug'
  require 'broken_link_finder'
+ require 'logger'

- # Monkey patch and log all HTTP requests made during the console.
- module Typhoeus
- singleton_class.class_eval do
- alias_method :orig_get, :get
- end
-
- def self.get(base_url, options = {})
- puts "[typhoeus] Sending GET: #{base_url}"
- resp = orig_get(base_url, options)
- puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
- resp
- end
- end
+ # Logs all HTTP requests.
+ Wgit.logger.level = Logger::DEBUG

  # Call reload to load all recent code changes.
  def reload
@@ -33,12 +23,14 @@ end
  # You can add fixtures and/or initialization code here...
  reload

- url = 'http://txti.es/'
- by_page = Finder.new
- by_link = Finder.new sort: :link
- finder = by_page
+ def url; @url ||= 'http://txti.es/'; end
+ def by_page; @by_page ||= Finder.new; end
+ def by_link; @by_link ||= Finder.new(sort: :link); end
+ def finder; @finder ||= by_page; end

  # Start the console.
- puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION}"
+ puts
+ puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+ puts

- binding.pry
+ Pry.start
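
The console now enables request logging through Wgit's own logger rather than monkey patching Typhoeus. The same debug output can be switched on in any script using the gem, as sketched below.

```ruby
require 'logger'
require 'broken_link_finder' # Loads wgit as a dependency.

# Log every HTTP request made by the underlying crawler, as bin/console now does.
Wgit.logger.level = Logger::DEBUG
```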
data/bin/setup CHANGED
@@ -5,4 +5,4 @@ set -vx
  bundle install

- # Do any other automated setup that you need to do here
+ # Do any other automated setup that you need to do here...
@@ -15,7 +15,10 @@ Gem::Specification.new do |spec|
  spec.homepage = 'https://github.com/michaeltelford/broken-link-finder'
  spec.license = 'MIT'
  spec.metadata = {
- 'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder'
+ 'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder',
+ 'changelog_uri' => 'https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md',
+ 'bug_tracker_uri' => 'https://github.com/michaeltelford/broken-link-finder/issues',
+ 'documentation_uri' => 'https://www.rubydoc.info/gems/broken_link_finder'
  }

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
@@ -41,10 +44,10 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency 'byebug', '~> 11.0'
  spec.add_development_dependency 'maxitest', '~> 3.3'
  spec.add_development_dependency 'pry', '~> 0.12'
- spec.add_development_dependency 'rake', '~> 10.0'
+ spec.add_development_dependency 'rake', '~> 13.0'
  spec.add_development_dependency 'webmock', '~> 3.6'

- spec.add_runtime_dependency 'thor', '~> 0.20.3'
- spec.add_runtime_dependency 'thread', '~> 0.2.0'
- spec.add_runtime_dependency 'wgit', '~> 0.4.1'
+ spec.add_runtime_dependency 'thor', '~> 0.20'
+ spec.add_runtime_dependency 'thread', '~> 0.2'
+ spec.add_runtime_dependency 'wgit', '~> 0.9'
  end
@@ -9,12 +9,14 @@ class BrokenLinkFinderCLI < Thor
  desc 'crawl [URL]', 'Find broken links at the URL'
  option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
  option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+ option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
  option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
  option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
  option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
  def crawl(url)
  url = "http://#{url}" unless url.start_with?('http')

+ report_type = options[:html] ? :html : :text
  sort_by = options[:sort_by_link] ? :link : :page
  max_threads = options[:threads]
  broken_verbose = !options[:concise]
@@ -22,17 +24,24 @@ class BrokenLinkFinderCLI < Thor

  finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
  options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
- finder.pretty_print_link_report(
- broken_verbose: broken_verbose,
+ finder.report(
+ type: report_type,
+ broken_verbose: broken_verbose,
  ignored_verbose: ignored_verbose
  )
- rescue Exception => e
+
+ exit 0
+ rescue StandardError => e
  puts "An error has occurred: #{e.message}"
+
+ exit 1
  end

  desc 'version', 'Display the currently installed version'
  def version
  puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+ exit 0
  end
  end

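
With the new exit code handling, the executable can gate a script or CI step. Below is a hypothetical Ruby example using the documented codes (`0` for success, `1` for an error scenario); note that, per the code above, finding broken links still counts as a successful run.

```ruby
# Hypothetical: run the executable and fail loudly if it exited with code 1.
ok = system('broken_link_finder', 'crawl', 'http://txti.es', '--recursive')
abort 'broken_link_finder reported an error' unless ok
```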
@@ -2,8 +2,13 @@
  require 'wgit'
  require 'wgit/core_ext'
+ require 'thread/pool'
+ require 'set'

  require_relative './broken_link_finder/wgit_extensions'
  require_relative './broken_link_finder/version'
- require_relative './broken_link_finder/reporter'
+ require_relative './broken_link_finder/link_manager'
+ require_relative './broken_link_finder/reporter/reporter'
+ require_relative './broken_link_finder/reporter/text_reporter'
+ require_relative './broken_link_finder/reporter/html_reporter'
  require_relative './broken_link_finder/finder'
@@ -1,234 +1,227 @@
  # frozen_string_literal: true

- require_relative 'reporter'
- require 'thread/pool'
- require 'set'
-
  module BrokenLinkFinder
- DEFAULT_MAX_THREADS = 100
+ DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
+ SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.

  # Alias for BrokenLinkFinder::Finder.new.
  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
  Finder.new(sort: sort, max_threads: max_threads)
  end

+ # Class responsible for finding broken links on a page or site.
  class Finder
- attr_reader :sort, :broken_links, :ignored_links, :total_links_crawled, :max_threads
+ # The collection key - either :page or :link.
+ attr_reader :sort
+
+ # The max number of threads created during #crawl_site - one thread per page.
+ attr_reader :max_threads

- # Creates a new Finder instance.
- def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
+ # Returns a new Finder instance.
+ def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
  raise "Sort by either :page or :link, not #{sort}" \
  unless %i[page link].include?(sort)

  @sort = sort
  @max_threads = max_threads
- @lock = Mutex.new
  @crawler = Wgit::Crawler.new
+ @manager = BrokenLinkFinder::LinkManager.new(@sort)
+ end
+
+ # Returns the current broken links.
+ def broken_links
+ @manager.broken_links
+ end

- clear_links
+ # Returns the current ignored links.
+ def ignored_links
+ @manager.ignored_links
  end

- # Clear/empty the link collection Hashes.
- def clear_links
- @broken_links = {}
- @ignored_links = {}
- @total_links_crawled = 0
- @all_broken_links = Set.new
- @all_intact_links = Set.new
+ # Returns the current crawl stats.
+ def crawl_stats
+ @manager.crawl_stats
  end

- # Finds broken links within a single page and appends them to the
- # @broken_links array. Returns true if at least one broken link was found.
+ # Finds broken links within a single page and records them.
+ # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
  def crawl_url(url)
- clear_links
+ @manager.empty

- url = url.to_url
- doc = @crawler.crawl(url)
+ start = Time.now
+ url = url.to_url
+
+ # We dup the url to avoid recording any redirects.
+ doc = @crawler.crawl(url.dup)

  # Ensure the given page url is valid.
  raise "Invalid or broken URL: #{url}" unless doc

  # Get all page links and determine which are broken.
  find_broken_links(doc)
+ retry_broken_links

- sort_links
- set_total_links_crawled
+ @manager.sort
+ @manager.tally(url: url, pages_crawled: [url], start: start)

- @broken_links.any?
+ broken_links.any?
  end

- # Finds broken links within an entire site and appends them to the
- # @broken_links array. Returns a tuple containing a Boolean of true if
- # at least one broken link was found and an Array of all pages crawled.
+ # Finds broken links within an entire site and records them.
+ # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
- def crawl_site(url)
- clear_links
+ def crawl_site(url, allow_paths: nil, disallow_paths: nil)
+ @manager.empty

- url = url.to_url
- pool = Thread.pool(@max_threads)
- crawled_pages = []
+ start = Time.now
+ url = url.to_url
+ pool = Thread.pool(@max_threads)
+ crawled = Set.new

  # Crawl the site's HTML web pages looking for links.
- externals = @crawler.crawl_site(url) do |doc|
- crawled_pages << doc.url
+ # We dup the url to avoid recording any redirects.
+ paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+ externals = @crawler.crawl_site(url.dup, **paths) do |doc|
+ crawled << doc.url
  next unless doc

  # Start a thread for each page, checking for broken links.
  pool.process { find_broken_links(doc) }
  end

+ # Wait for all threads to finish, even if url was invalid.
+ pool.shutdown
+
  # Ensure the given website url is valid.
  raise "Invalid or broken URL: #{url}" unless externals

- # Wait for all threads to finish.
- pool.shutdown
+ retry_broken_links

- sort_links
- set_total_links_crawled
+ @manager.sort
+ @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)

- [@broken_links.any?, crawled_pages.uniq]
+ broken_links.any?
+ ensure
+ pool.shutdown if defined?(pool)
  end

- # Pretty prints the link report into a stream e.g. STDOUT or a file,
+ # Outputs the link report into a stream e.g. STDOUT or a file,
  # anything that respond_to? :puts. Defaults to STDOUT.
- # Returns true if there were broken links and vice versa.
- def pretty_print_link_report(
- stream = STDOUT,
- broken_verbose: true,
- ignored_verbose: false
- )
- reporter = BrokenLinkFinder::Reporter.new(
- stream, @sort, @broken_links, @ignored_links
- )
- reporter.pretty_print_link_report(
- broken_verbose: broken_verbose,
- ignored_verbose: ignored_verbose
- )
-
- @broken_links.any?
+ def report(stream = STDOUT, type: :text,
+ broken_verbose: true, ignored_verbose: false)
+ klass = case type
+ when :text
+ BrokenLinkFinder::TextReporter
+ when :html
+ BrokenLinkFinder::HTMLReporter
+ else
+ raise "The type: must be :text or :html, not: :#{type}"
+ end
+
+ reporter = klass.new(stream, @sort,
+ broken_links, ignored_links,
+ @manager.broken_link_map, crawl_stats)
+ reporter.call(broken_verbose: broken_verbose,
+ ignored_verbose: ignored_verbose)
  end

  private

  # Finds which links are unsupported or broken and records the details.
- def find_broken_links(doc)
- links = get_supported_links(doc)
+ def find_broken_links(page)
+ record_unparsable_links(page) # Record them as broken.
+
+ links = get_supported_links(page)

  # Iterate over the supported links checking if they're broken or not.
  links.each do |link|
- # Check if the link has already been processed previously.
- next if @all_intact_links.include?(link)
+ # Skip if the link has been encountered previously.
+ next if @manager.all_intact_links.include?(link)

- if @all_broken_links.include?(link)
- append_broken_link(doc.url, link)
+ if @manager.all_broken_links.include?(link)
+ # The link has already been proven broken so simply record it.
+ @manager.append_broken_link(page, link, map: false)
  next
  end

- # The link hasn't been processed before so we crawl it.
- link_doc = crawl_link(doc, link)
+ # The link hasn't been encountered before so we crawl it.
+ link_doc = crawl_link(page, link)

- # Determine if the crawled link is broken or not.
- if link_doc.nil? ||
- @crawler.last_response.code == 404 ||
- has_broken_anchor(link_doc)
- append_broken_link(doc.url, link)
+ # Determine if the crawled link is broken or not and record it.
+ if link_broken?(link_doc)
+ @manager.append_broken_link(page, link)
  else
- @lock.synchronize { @all_intact_links << link }
+ @manager.append_intact_link(link)
  end
  end

  nil
  end

- # Report and reject any non supported links. Any link that is absolute and
- # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
- def get_supported_links(doc)
- doc.all_links
- .reject do |link|
- if link.is_absolute? && !link.start_with?('http')
- append_ignored_link(doc.url, link)
- true
- end
- end
- end
+ # Implements a retry mechanism for each of the broken links found.
+ # Removes any broken links found to be working OK.
+ def retry_broken_links
+ sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.

- # Makes the link absolute and crawls it, returning its Wgit::Document.
- def crawl_link(doc, link)
- link = get_absolute_link(doc, link)
- @crawler.crawl(link)
- end
+ @manager.broken_link_map.select! do |link, href|
+ # Don't retry unparsable links (which are Strings).
+ next(true) unless href.is_a?(Wgit::Url)

- # Returns the link in absolute form so it can be crawled.
- def get_absolute_link(doc, link)
- link.is_relative? ? doc.base_url(link: link).concat(link) : link
- end
+ doc = @crawler.crawl(href.dup)

- # Returns true if the link is/contains a broken anchor.
- def has_broken_anchor(doc)
- raise 'link document is nil' unless doc
-
- anchor = doc.url.anchor
- return false if anchor.nil? || (anchor == '#')
-
- anchor = anchor[1..-1] if anchor.start_with?('#')
- doc.xpath("//*[@id='#{anchor}']").empty?
+ if link_broken?(doc)
+ true
+ else
+ @manager.remove_broken_link(link)
+ false
+ end
+ end
  end

- # Append key => [value] to @broken_links.
- def append_broken_link(url, link)
- key, value = get_key_value(url, link)
-
- @lock.synchronize do
- @broken_links[key] = [] unless @broken_links[key]
- @broken_links[key] << value
-
- @all_broken_links << link
+ # Record each unparsable link as a broken link.
+ def record_unparsable_links(doc)
+ doc.unparsable_links.each do |link|
+ # We map the link ourselves because link is a String, not a Wgit::Url.
+ @manager.append_broken_link(doc, link, map: false)
+ @manager.broken_link_map[link] = link
  end
  end

- # Append key => [value] to @ignored_links.
- def append_ignored_link(url, link)
- key, value = get_key_value(url, link)
-
- @lock.synchronize do
- @ignored_links[key] = [] unless @ignored_links[key]
- @ignored_links[key] << value
+ # Report and reject any non supported links. Any link that is absolute and
+ # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
+ def get_supported_links(doc)
+ doc.all_links.reject do |link|
+ if link.is_absolute? && !link.start_with?('http')
+ @manager.append_ignored_link(doc.url, link)
+ true
+ end
  end
  end

- # Returns the correct key value depending on the @sort type.
- # @sort == :page ? [url, link] : [link, url]
- def get_key_value(url, link)
- case @sort
- when :page
- [url, link]
- when :link
- [link, url]
- else
- raise "Unsupported sort type: #{sort}"
- end
+ # Make the link absolute and crawl it, returning its Wgit::Document.
+ def crawl_link(doc, link)
+ link = link.make_absolute(doc)
+ @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
  end

- # Sort keys and values alphabetically.
- def sort_links
- @broken_links.values.map(&:uniq!)
- @ignored_links.values.map(&:uniq!)
+ # Return if the crawled link is broken or not.
+ def link_broken?(doc)
+ doc.nil? || @crawler.last_response.not_found? || has_broken_anchor(doc)
+ end

- @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
- @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+ # Returns true if the link is/contains a broken anchor/fragment.
+ # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
+ def has_broken_anchor(doc)
+ raise 'The link document is nil' unless doc

- @broken_links.each { |_k, v| v.sort! }
- @ignored_links.each { |_k, v| v.sort! }
- end
+ fragment = doc.url.fragment
+ return false if fragment.nil? || fragment.empty?

- # Sets and returns the total number of links crawled.
- def set_total_links_crawled
- @total_links_crawled = @all_broken_links.size + @all_intact_links.size
+ doc.xpath("//*[@id='#{fragment}']").empty?
  end

- alias crawl_page crawl_url
- alias crawl_r crawl_site
- alias pretty_print_link_summary pretty_print_link_report
+ alias crawl_page crawl_url
+ alias crawl_r crawl_site
  end
  end
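
To close, here is a hedged end-to-end sketch of the refactored `Finder` API shown above; the URL, path filter and filename are illustrative only.

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder::Finder.new(sort: :link, max_threads: 30)

# allow_paths:/disallow_paths: are forwarded to Wgit::Crawler#crawl_site.
finder.crawl_site('http://txti.es', disallow_paths: ['about'])

# report accepts any stream responding to :puts, so a File works as well as STDOUT.
File.open('report.html', 'w') { |file| finder.report(file, type: :html) }

puts finder.broken_links.keys # Keyed by link because sort: :link was given.
puts finder.crawl_stats[:url]
```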