broken_link_finder 0.9.3 → 0.11.1
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +51 -0
- data/Gemfile.lock +44 -33
- data/README.md +28 -19
- data/benchmark.rb +9 -5
- data/bin/console +11 -19
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +8 -5
- data/exe/broken_link_finder +12 -3
- data/lib/broken_link_finder.rb +6 -1
- data/lib/broken_link_finder/finder.rb +134 -141
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
- data/lib/broken_link_finder/reporter/reporter.rb +76 -0
- data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +25 -5
- metadata +18 -13
- data/lib/broken_link_finder/reporter.rb +0 -116
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 42e88495f7e7742db433223408b4a380c1d48e98a5a43e6da5303d3e7b024454
+  data.tar.gz: eae7fc953f0d8aa1bb1f9d5b53183cd68a15a9f83ab341f51023744b2d148063
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4496db994bfba83deeb14a1b870f43e2cfd2afa94f30b6596ee610f23103b55ae0d84a6443a3204b02ed8875c0daf0d8e9c565aaebd21173d5c4353509dac3c8
+  data.tar.gz: 2d70ee94d7128e6e212bc385e1045fd465c121f58b9a0d036d392ae1cbb5cd9ef5ea47e29eda85b6f17a0b0f5547902ca818967b3ffb4ad87c7d0b271da5323a
data/.ruby-version
CHANGED
@@ -1 +1 @@
-2.
+2.7.0
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,57 @@
 - ...
 ---
 
+## v0.11.1
+### Added
+- ...
+### Changed/Removed
+- Updated the wgit gem to version 0.9.0, which contains improvements and bug fixes.
+### Fixed
+- ...
+---
+
+## v0.11.0
+### Added
+- Additional crawl statistics.
+- Exit code handling to the executable: `0` for success, `1` for an error scenario.
+### Changed/Removed
+- Updated the report formats slightly, bringing various improvements such as the total number of links crawled etc.
+### Fixed
+- Bug in the HTML report: the summary URL is now an `<a>` link.
+- Bug in `Finder@broken_link_map` URLs and the `Finder#crawl_stats[:url]` URL during redirects.
+- Bug causing an error when crawling unparsable/invalid URLs.
+---
+
+## v0.10.0
+### Added
+- A `--html` flag for the `crawl` executable command, which produces an HTML report (instead of text).
+- A 'retry' mechanism for any broken links found. This is essentially a verification step before generating a report.
+- `Finder#crawl_stats` for info such as crawl duration, total links crawled etc.
+### Changed/Removed
+- The API has changed somewhat. See the [docs](https://www.rubydoc.info/gems/broken_link_finder) for the up-to-date code signatures if you're using `broken_link_finder` outside of its executable.
+### Fixed
+- ...
+---
+
+## v0.9.5
+### Added
+- ...
+### Changed/Removed
+- Now using optimistic dependency versioning.
+- Updated `wgit` to version 0.5.1, containing improvements and bug fixes.
+### Fixed
+- ...
+---
+
+## v0.9.4
+### Added
+- ...
+### Changed/Removed
+- Updated the `wgit` gem to version 0.5.0, which contains improvements and bug fixes.
+### Fixed
+- ...
+---
+
 ## v0.9.3
 ### Added
 - ...
data/Gemfile.lock
CHANGED
@@ -1,53 +1,64 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.
-      thor (~> 0.20
-      thread (~> 0.2
-      wgit (~> 0.
+    broken_link_finder (0.11.1)
+      thor (~> 0.20)
+      thread (~> 0.2)
+      wgit (~> 0.9)
 
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.
-      public_suffix (>= 2.0.2, <
-    bson (4.
-    byebug (11.
-
+    addressable (2.7.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    bson (4.10.0)
+    byebug (11.1.3)
+    cliver (0.3.2)
+    coderay (1.1.3)
+    concurrent-ruby (1.1.6)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
     ethon (0.12.0)
       ffi (>= 1.3.0)
-
-
-
-
-
+    ferrum (0.9)
+      addressable (~> 2.5)
+      cliver (~> 0.3)
+      concurrent-ruby (~> 1.1)
+      websocket-driver (>= 0.6, < 0.8)
+    ffi (1.13.1)
+    hashdiff (1.0.1)
+    maxitest (3.6.0)
+      minitest (>= 5.0.0, < 5.14.0)
+    method_source (1.0.0)
     mini_portile2 (2.4.0)
-    minitest (5.
-    mongo (2.
-      bson (>= 4.
-    nokogiri (1.10.
+    minitest (5.13.0)
+    mongo (2.13.0)
+      bson (>= 4.8.2, < 5.0.0)
+    nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
-    pry (0.
-      coderay (~> 1.1
-      method_source (~>
-    public_suffix (
-    rake (
+    pry (0.13.1)
+      coderay (~> 1.1)
+      method_source (~> 1.0)
+    public_suffix (4.0.5)
+    rake (13.0.1)
     safe_yaml (1.0.5)
     thor (0.20.3)
     thread (0.2.2)
-    typhoeus (1.
+    typhoeus (1.4.0)
       ethon (>= 0.9.0)
-    webmock (3.
+    webmock (3.8.3)
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff (>= 0.4.0, < 2.0.0)
-
-
-
-
-
+    websocket-driver (0.7.3)
+      websocket-extensions (>= 0.1.0)
+    websocket-extensions (0.1.5)
+    wgit (0.9.0)
+      addressable (~> 2.6)
+      ferrum (~> 0.8)
+      mongo (~> 2.9)
+      nokogiri (~> 1.10)
+      typhoeus (~> 1.3)
 
 PLATFORMS
   ruby
 
@@ -58,11 +69,11 @@ DEPENDENCIES
   byebug (~> 11.0)
   maxitest (~> 3.3)
   pry (~> 0.12)
-  rake (~>
+  rake (~> 13.0)
   webmock (~> 3.6)
 
 RUBY VERSION
-   ruby 2.
+   ruby 2.7.0p0
 
 BUNDLED WITH
-   2.
+   2.1.4
data/README.md
CHANGED
@@ -1,8 +1,10 @@
 # Broken Link Finder
 
-Does what it says on the tin
+Does what it says on the tin - finds a website's broken links.
 
-Simply point it at a website and it will crawl all of its webpages searching for and identifing
+Simply point it at a website and it will crawl all of its webpages, searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
+
+Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!
 
 ## How It Works
 
@@ -10,7 +12,7 @@ Any HTML page element with a `href` or `src` attribute is considered a link. For
 
 - An empty HTML response body is returned.
 - A response status code of `404 Not Found` is returned.
-- The HTML response body doesn't contain an element ID matching that of the link's
+- The HTML response body doesn't contain an element ID matching that of the link's fragment, e.g. `http://server.com#about` must contain an element with `id="about"`, or the link is considered broken.
 - The link redirects more than 5 times consecutively.
 
 **Note**: Not all link types are supported.
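The fragment rule above mirrors what `Finder#has_broken_anchor` does internally (see the `finder.rb` diff further down). Below is a minimal sketch of the same check using `wgit` directly; the URL is illustrative:

```ruby
require 'wgit'

# An illustrative link whose fragment must resolve to an element ID.
url = Wgit::Url.new('http://server.com/about#top')
doc = Wgit::Crawler.new.crawl(url)

# Broken unless the crawl succeeded and an element with id="top" exists.
broken = doc.nil? || doc.xpath("//*[@id='#{url.fragment}']").empty?
puts broken ? 'broken anchor' : 'anchor OK'
```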
@@ -55,7 +57,7 @@ Installing this gem installs the `broken_link_finder` executable into your `$PAT
 
     $ broken_link_finder crawl http://txti.es
 
-Adding the
+Adding the `--recursive` flag would crawl the entire `txti.es` site, not just its index page.
 
 See the [output](#Output) section below for an example of a site with broken links.
 
@@ -73,9 +75,9 @@ Below is a simple script which crawls a website and outputs its broken links to
 require 'broken_link_finder'
 
 finder = BrokenLinkFinder.new
-finder.crawl_site 'http://txti.es'
-finder.
-
+finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
+finder.report # Or use Finder#broken_links and Finder#ignored_links
+              # for direct access to the link Hashes.
 ```
 
 Then execute the script with:
@@ -89,28 +91,33 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
 If broken links are found then the output will look something like:
 
 ```text
-
+Crawled http://txti.es
+7 page(s) containing 32 unique link(s) in 6.82 seconds
+
+Found 6 unique broken link(s) across 2 page(s):
 
 The following broken links were found on 'http://txti.es/about':
 http://twitter.com/thebarrytone
+/doesntexist
 http://twitter.com/nwbld
-
-https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU
+twitter.com/txties
 
 The following broken links were found on 'http://txti.es/how':
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
 
-Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
 
-The following links were ignored on http://txti.es:
+The following links were ignored on 'http://txti.es':
 tel:+13174562564
 mailto:big.jim@jmail.com
 
-The following links were ignored on http://txti.es/contact:
+The following links were ignored on 'http://txti.es/contact':
 ftp://server.com
 ```
 
+You can provide the `--html` flag if you'd prefer an HTML-based report.
+
 ## Contributing
 
 Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
@@ -126,9 +133,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
 
 To install this gem onto your local machine, run `bundle exec rake install`.
 
 To release a new gem version:
-- Update the
-- 
-- Run `bundle
-- Run `bundle exec rake
-- Run `bundle exec rake
-- Run `bundle exec rake
+- Update the deps in the `*.gemspec`, if necessary.
+- Update the version number in `version.rb` and add the new version to the `CHANGELOG`.
+- Run `bundle install`.
+- Run `bundle exec rake test`, ensuring all tests pass.
+- Run `bundle exec rake compile`, ensuring no warnings.
+- Run `bundle exec rake install && rbenv rehash`.
+- Manually test the executable.
+- Run `bundle exec rake release[origin]`.
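As the updated README notes, the new `--html` flag swaps the text report for an HTML one; combined with shell redirection this yields a standalone report file (the filename is illustrative):

    $ broken_link_finder crawl --recursive --html http://txti.es > report.html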
data/benchmark.rb
CHANGED
@@ -10,15 +10,19 @@ finder = BrokenLinkFinder::Finder.new
 
 puts Benchmark.measure { finder.crawl_site url }
 puts "Links crawled: #{finder.total_links_crawled}"
 
-# http://txti.es page crawl
-# Pre
-# Post
+# http://txti.es page crawl with threading
+# Pre: 17.5 seconds
+# Post: 7.5 seconds
 
-# http://txti.es
+# http://txti.es with threading - page vs site crawl
 # Page: 9.526981
 # Site: 9.732416
 # Multi-threading crawl_site now yields the same time as a single page
 
-# Large site crawl -
+# Large site crawl - all link recording functionality
 # Pre: 608 seconds with 7665 links crawled
 # Post: 355 seconds with 1099 links crawled
+
+# Large site crawl - retry mechanism
+# Pre: 140 seconds
+# Post: 170 seconds
data/bin/console
CHANGED
@@ -5,20 +5,10 @@ require 'bundler/setup'
 require 'pry'
 require 'byebug'
 require 'broken_link_finder'
+require 'logger'
 
-#
-
-  singleton_class.class_eval do
-    alias_method :orig_get, :get
-  end
-
-  def self.get(base_url, options = {})
-    puts "[typhoeus] Sending GET: #{base_url}"
-    resp = orig_get(base_url, options)
-    puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
-    resp
-  end
-end
+# Logs all HTTP requests.
+Wgit.logger.level = Logger::DEBUG
 
 # Call reload to load all recent code changes.
 def reload
 
@@ -33,12 +23,14 @@ end
 # You can add fixtures and/or initialization code here...
 reload
 
-url
-by_page
-by_link
-finder
+def url;     @url     ||= 'http://txti.es/'; end
+def by_page; @by_page ||= Finder.new; end
+def by_link; @by_link ||= Finder.new(sort: :link); end
+def finder;  @finder  ||= by_page; end
 
 # Start the console.
-puts
+puts
+puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+puts
 
-
+Pry.start
data/bin/setup
CHANGED
data/broken_link_finder.gemspec
CHANGED
@@ -15,7 +15,10 @@ Gem::Specification.new do |spec|
   spec.homepage = 'https://github.com/michaeltelford/broken-link-finder'
   spec.license = 'MIT'
   spec.metadata = {
-    'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder'
+    'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder',
+    'changelog_uri' => 'https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md',
+    'bug_tracker_uri' => 'https://github.com/michaeltelford/broken-link-finder/issues',
+    'documentation_uri' => 'https://www.rubydoc.info/gems/broken_link_finder'
   }
 
   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
 
@@ -41,10 +44,10 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'byebug', '~> 11.0'
   spec.add_development_dependency 'maxitest', '~> 3.3'
   spec.add_development_dependency 'pry', '~> 0.12'
-  spec.add_development_dependency 'rake', '~>
+  spec.add_development_dependency 'rake', '~> 13.0'
   spec.add_development_dependency 'webmock', '~> 3.6'
 
-  spec.add_runtime_dependency 'thor', '~> 0.20
-  spec.add_runtime_dependency 'thread', '~> 0.2
-  spec.add_runtime_dependency 'wgit', '~> 0.
+  spec.add_runtime_dependency 'thor', '~> 0.20'
+  spec.add_runtime_dependency 'thread', '~> 0.2'
+  spec.add_runtime_dependency 'wgit', '~> 0.9'
 end
data/exe/broken_link_finder
CHANGED
@@ -9,12 +9,14 @@ class BrokenLinkFinderCLI < Thor
   desc 'crawl [URL]', 'Find broken links at the URL'
   option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
   option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+  option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
   option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
   option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
   option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
   def crawl(url)
     url = "http://#{url}" unless url.start_with?('http')
 
+    report_type = options[:html] ? :html : :text
     sort_by = options[:sort_by_link] ? :link : :page
     max_threads = options[:threads]
     broken_verbose = !options[:concise]
 
@@ -22,17 +24,24 @@ class BrokenLinkFinderCLI < Thor
 
     finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
-    finder.
-
+    finder.report(
+      type: report_type,
+      broken_verbose: broken_verbose,
       ignored_verbose: ignored_verbose
     )
-
+
+    exit 0
+  rescue StandardError => e
     puts "An error has occurred: #{e.message}"
+
+    exit 1
   end
 
   desc 'version', 'Display the currently installed version'
   def version
     puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+    exit 0
   end
 end
 
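The new `exit` handling makes the executable script-friendly; a sketch of checking the status from a shell:

    $ broken_link_finder crawl http://txti.es
    $ echo $?    # => 0 on success, 1 if an error occurred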
data/lib/broken_link_finder.rb
CHANGED
@@ -2,8 +2,13 @@
 
 require 'wgit'
 require 'wgit/core_ext'
+require 'thread/pool'
+require 'set'
 
 require_relative './broken_link_finder/wgit_extensions'
 require_relative './broken_link_finder/version'
-require_relative './broken_link_finder/
+require_relative './broken_link_finder/link_manager'
+require_relative './broken_link_finder/reporter/reporter'
+require_relative './broken_link_finder/reporter/text_reporter'
+require_relative './broken_link_finder/reporter/html_reporter'
 require_relative './broken_link_finder/finder'
|
@@ -1,234 +1,227 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative 'reporter'
|
4
|
-
require 'thread/pool'
|
5
|
-
require 'set'
|
6
|
-
|
7
3
|
module BrokenLinkFinder
|
8
|
-
DEFAULT_MAX_THREADS = 100
|
4
|
+
DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
|
5
|
+
SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.
|
9
6
|
|
10
7
|
# Alias for BrokenLinkFinder::Finder.new.
|
11
8
|
def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
12
9
|
Finder.new(sort: sort, max_threads: max_threads)
|
13
10
|
end
|
14
11
|
|
12
|
+
# Class responsible for finding broken links on a page or site.
|
15
13
|
class Finder
|
16
|
-
|
14
|
+
# The collection key - either :page or :link.
|
15
|
+
attr_reader :sort
|
16
|
+
|
17
|
+
# The max number of threads created during #crawl_site - one thread per page.
|
18
|
+
attr_reader :max_threads
|
17
19
|
|
18
|
-
#
|
19
|
-
def initialize(sort: :page, max_threads:
|
20
|
+
# Returns a new Finder instance.
|
21
|
+
def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
20
22
|
raise "Sort by either :page or :link, not #{sort}" \
|
21
23
|
unless %i[page link].include?(sort)
|
22
24
|
|
23
25
|
@sort = sort
|
24
26
|
@max_threads = max_threads
|
25
|
-
@lock = Mutex.new
|
26
27
|
@crawler = Wgit::Crawler.new
|
28
|
+
@manager = BrokenLinkFinder::LinkManager.new(@sort)
|
29
|
+
end
|
30
|
+
|
31
|
+
# Returns the current broken links.
|
32
|
+
def broken_links
|
33
|
+
@manager.broken_links
|
34
|
+
end
|
27
35
|
|
28
|
-
|
36
|
+
# Returns the current ignored links.
|
37
|
+
def ignored_links
|
38
|
+
@manager.ignored_links
|
29
39
|
end
|
30
40
|
|
31
|
-
#
|
32
|
-
def
|
33
|
-
@
|
34
|
-
@ignored_links = {}
|
35
|
-
@total_links_crawled = 0
|
36
|
-
@all_broken_links = Set.new
|
37
|
-
@all_intact_links = Set.new
|
41
|
+
# Returns the current crawl stats.
|
42
|
+
def crawl_stats
|
43
|
+
@manager.crawl_stats
|
38
44
|
end
|
39
45
|
|
40
|
-
# Finds broken links within a single page and
|
41
|
-
#
|
46
|
+
# Finds broken links within a single page and records them.
|
47
|
+
# Returns true if at least one broken link was found.
|
42
48
|
# Access the broken links afterwards with Finder#broken_links.
|
43
49
|
def crawl_url(url)
|
44
|
-
|
50
|
+
@manager.empty
|
45
51
|
|
46
|
-
|
47
|
-
|
52
|
+
start = Time.now
|
53
|
+
url = url.to_url
|
54
|
+
|
55
|
+
# We dup the url to avoid recording any redirects.
|
56
|
+
doc = @crawler.crawl(url.dup)
|
48
57
|
|
49
58
|
# Ensure the given page url is valid.
|
50
59
|
raise "Invalid or broken URL: #{url}" unless doc
|
51
60
|
|
52
61
|
# Get all page links and determine which are broken.
|
53
62
|
find_broken_links(doc)
|
63
|
+
retry_broken_links
|
54
64
|
|
55
|
-
|
56
|
-
|
65
|
+
@manager.sort
|
66
|
+
@manager.tally(url: url, pages_crawled: [url], start: start)
|
57
67
|
|
58
|
-
|
68
|
+
broken_links.any?
|
59
69
|
end
|
60
70
|
|
61
|
-
# Finds broken links within an entire site and
|
62
|
-
#
|
63
|
-
# at least one broken link was found and an Array of all pages crawled.
|
71
|
+
# Finds broken links within an entire site and records them.
|
72
|
+
# Returns true if at least one broken link was found.
|
64
73
|
# Access the broken links afterwards with Finder#broken_links.
|
65
|
-
def crawl_site(url)
|
66
|
-
|
74
|
+
def crawl_site(url, allow_paths: nil, disallow_paths: nil)
|
75
|
+
@manager.empty
|
67
76
|
|
68
|
-
|
69
|
-
|
70
|
-
|
77
|
+
start = Time.now
|
78
|
+
url = url.to_url
|
79
|
+
pool = Thread.pool(@max_threads)
|
80
|
+
crawled = Set.new
|
71
81
|
|
72
82
|
# Crawl the site's HTML web pages looking for links.
|
73
|
-
|
74
|
-
|
83
|
+
# We dup the url to avoid recording any redirects.
|
84
|
+
paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
|
85
|
+
externals = @crawler.crawl_site(url.dup, **paths) do |doc|
|
86
|
+
crawled << doc.url
|
75
87
|
next unless doc
|
76
88
|
|
77
89
|
# Start a thread for each page, checking for broken links.
|
78
90
|
pool.process { find_broken_links(doc) }
|
79
91
|
end
|
80
92
|
|
93
|
+
# Wait for all threads to finish, even if url was invalid.
|
94
|
+
pool.shutdown
|
95
|
+
|
81
96
|
# Ensure the given website url is valid.
|
82
97
|
raise "Invalid or broken URL: #{url}" unless externals
|
83
98
|
|
84
|
-
|
85
|
-
pool.shutdown
|
99
|
+
retry_broken_links
|
86
100
|
|
87
|
-
|
88
|
-
|
101
|
+
@manager.sort
|
102
|
+
@manager.tally(url: url, pages_crawled: crawled.to_a, start: start)
|
89
103
|
|
90
|
-
|
104
|
+
broken_links.any?
|
105
|
+
ensure
|
106
|
+
pool.shutdown if defined?(pool)
|
91
107
|
end
|
92
108
|
|
93
|
-
#
|
109
|
+
# Outputs the link report into a stream e.g. STDOUT or a file,
|
94
110
|
# anything that respond_to? :puts. Defaults to STDOUT.
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
111
|
+
def report(stream = STDOUT, type: :text,
|
112
|
+
broken_verbose: true, ignored_verbose: false)
|
113
|
+
klass = case type
|
114
|
+
when :text
|
115
|
+
BrokenLinkFinder::TextReporter
|
116
|
+
when :html
|
117
|
+
BrokenLinkFinder::HTMLReporter
|
118
|
+
else
|
119
|
+
raise "The type: must be :text or :html, not: :#{type}"
|
120
|
+
end
|
121
|
+
|
122
|
+
reporter = klass.new(stream, @sort,
|
123
|
+
broken_links, ignored_links,
|
124
|
+
@manager.broken_link_map, crawl_stats)
|
125
|
+
reporter.call(broken_verbose: broken_verbose,
|
126
|
+
ignored_verbose: ignored_verbose)
|
110
127
|
end
|
111
128
|
|
112
129
|
private
|
113
130
|
|
114
131
|
# Finds which links are unsupported or broken and records the details.
|
115
|
-
def find_broken_links(
|
116
|
-
|
132
|
+
def find_broken_links(page)
|
133
|
+
record_unparsable_links(page) # Record them as broken.
|
134
|
+
|
135
|
+
links = get_supported_links(page)
|
117
136
|
|
118
137
|
# Iterate over the supported links checking if they're broken or not.
|
119
138
|
links.each do |link|
|
120
|
-
#
|
121
|
-
next if @all_intact_links.include?(link)
|
139
|
+
# Skip if the link has been encountered previously.
|
140
|
+
next if @manager.all_intact_links.include?(link)
|
122
141
|
|
123
|
-
if @all_broken_links.include?(link)
|
124
|
-
|
142
|
+
if @manager.all_broken_links.include?(link)
|
143
|
+
# The link has already been proven broken so simply record it.
|
144
|
+
@manager.append_broken_link(page, link, map: false)
|
125
145
|
next
|
126
146
|
end
|
127
147
|
|
128
|
-
# The link hasn't been
|
129
|
-
link_doc = crawl_link(
|
148
|
+
# The link hasn't been encountered before so we crawl it.
|
149
|
+
link_doc = crawl_link(page, link)
|
130
150
|
|
131
|
-
# Determine if the crawled link is broken or not.
|
132
|
-
if
|
133
|
-
|
134
|
-
has_broken_anchor(link_doc)
|
135
|
-
append_broken_link(doc.url, link)
|
151
|
+
# Determine if the crawled link is broken or not and record it.
|
152
|
+
if link_broken?(link_doc)
|
153
|
+
@manager.append_broken_link(page, link)
|
136
154
|
else
|
137
|
-
@
|
155
|
+
@manager.append_intact_link(link)
|
138
156
|
end
|
139
157
|
end
|
140
158
|
|
141
159
|
nil
|
142
160
|
end
|
143
161
|
|
144
|
-
#
|
145
|
-
#
|
146
|
-
def
|
147
|
-
|
148
|
-
.reject do |link|
|
149
|
-
if link.is_absolute? && !link.start_with?('http')
|
150
|
-
append_ignored_link(doc.url, link)
|
151
|
-
true
|
152
|
-
end
|
153
|
-
end
|
154
|
-
end
|
162
|
+
# Implements a retry mechanism for each of the broken links found.
|
163
|
+
# Removes any broken links found to be working OK.
|
164
|
+
def retry_broken_links
|
165
|
+
sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
|
155
166
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
@crawler.crawl(link)
|
160
|
-
end
|
167
|
+
@manager.broken_link_map.select! do |link, href|
|
168
|
+
# Don't retry unparsable links (which are Strings).
|
169
|
+
next(true) unless href.is_a?(Wgit::Url)
|
161
170
|
|
162
|
-
|
163
|
-
def get_absolute_link(doc, link)
|
164
|
-
link.is_relative? ? doc.base_url(link: link).concat(link) : link
|
165
|
-
end
|
171
|
+
doc = @crawler.crawl(href.dup)
|
166
172
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
anchor = anchor[1..-1] if anchor.start_with?('#')
|
175
|
-
doc.xpath("//*[@id='#{anchor}']").empty?
|
173
|
+
if link_broken?(doc)
|
174
|
+
true
|
175
|
+
else
|
176
|
+
@manager.remove_broken_link(link)
|
177
|
+
false
|
178
|
+
end
|
179
|
+
end
|
176
180
|
end
|
177
181
|
|
178
|
-
#
|
179
|
-
def
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
@
|
184
|
-
@broken_links[key] << value
|
185
|
-
|
186
|
-
@all_broken_links << link
|
182
|
+
# Record each unparsable link as a broken link.
|
183
|
+
def record_unparsable_links(doc)
|
184
|
+
doc.unparsable_links.each do |link|
|
185
|
+
# We map the link ourselves because link is a String, not a Wgit::Url.
|
186
|
+
@manager.append_broken_link(doc, link, map: false)
|
187
|
+
@manager.broken_link_map[link] = link
|
187
188
|
end
|
188
189
|
end
|
189
190
|
|
190
|
-
#
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
191
|
+
# Report and reject any non supported links. Any link that is absolute and
|
192
|
+
# doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
|
193
|
+
def get_supported_links(doc)
|
194
|
+
doc.all_links.reject do |link|
|
195
|
+
if link.is_absolute? && !link.start_with?('http')
|
196
|
+
@manager.append_ignored_link(doc.url, link)
|
197
|
+
true
|
198
|
+
end
|
197
199
|
end
|
198
200
|
end
|
199
201
|
|
200
|
-
#
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
when :page
|
205
|
-
[url, link]
|
206
|
-
when :link
|
207
|
-
[link, url]
|
208
|
-
else
|
209
|
-
raise "Unsupported sort type: #{sort}"
|
210
|
-
end
|
202
|
+
# Make the link absolute and crawl it, returning its Wgit::Document.
|
203
|
+
def crawl_link(doc, link)
|
204
|
+
link = link.make_absolute(doc)
|
205
|
+
@crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
|
211
206
|
end
|
212
207
|
|
213
|
-
#
|
214
|
-
def
|
215
|
-
@
|
216
|
-
|
208
|
+
# Return if the crawled link is broken or not.
|
209
|
+
def link_broken?(doc)
|
210
|
+
doc.nil? || @crawler.last_response.not_found? || has_broken_anchor(doc)
|
211
|
+
end
|
217
212
|
|
218
|
-
|
219
|
-
|
213
|
+
# Returns true if the link is/contains a broken anchor/fragment.
|
214
|
+
# E.g. /about#top should contain a HTML element with an @id of 'top' etc.
|
215
|
+
def has_broken_anchor(doc)
|
216
|
+
raise 'The link document is nil' unless doc
|
220
217
|
|
221
|
-
|
222
|
-
|
223
|
-
end
|
218
|
+
fragment = doc.url.fragment
|
219
|
+
return false if fragment.nil? || fragment.empty?
|
224
220
|
|
225
|
-
|
226
|
-
def set_total_links_crawled
|
227
|
-
@total_links_crawled = @all_broken_links.size + @all_intact_links.size
|
221
|
+
doc.xpath("//*[@id='#{fragment}']").empty?
|
228
222
|
end
|
229
223
|
|
230
|
-
alias crawl_page
|
231
|
-
alias crawl_r
|
232
|
-
alias pretty_print_link_summary pretty_print_link_report
|
224
|
+
alias crawl_page crawl_url
|
225
|
+
alias crawl_r crawl_site
|
233
226
|
end
|
234
227
|
end
|
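Pulling the `finder.rb` changes together, here is a sketch of the reworked public API as exposed by this diff. The path-filter values and the `crawl_stats` keys flagged in the comments are assumptions for illustration:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new(sort: :page, max_threads: 100) # The defaults.

# Crawl an entire site; allow/disallow path filters are optional (values illustrative).
finder.crawl_site('http://txti.es', allow_paths: nil, disallow_paths: nil)

# Write the new HTML report to a file - any stream responding to :puts works.
File.open('report.html', 'w') do |file|
  finder.report(file, type: :html, broken_verbose: true, ignored_verbose: false)
end

finder.crawl_stats  # => Hash of stats e.g. :url etc. (other keys assumed).
finder.broken_links # => Hash of broken links, keyed by page or link (per sort:).
```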