broken_link_finder 0.10.0 → 0.12.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a53784c1bd2f75c18b3492ea782b4cc2e229a94f89afcf33b60ef633512554e
4
- data.tar.gz: 393dca220b7f00d72314c93e7b877e0412afdf784fa2e563bbecb2dc6c6b29f7
3
+ metadata.gz: 88b1e96f1de644a1a3c06ba7cc0ee1b53f75a3de6686b343e55028e8fa69da9f
4
+ data.tar.gz: e399ca05a4b0b9b2c0644b2846fa9dc6be6acd664e1bdc58758eb9ca7a5543cd
5
5
  SHA512:
6
- metadata.gz: c0d304e5b0a9258265c5c084c0a6e5819c169ba8eb02b3c6317a37784a9ca12982b0fc520c3cca1060fde60126ee936708d7891c69133c5d72c9c0287a79b3f5
7
- data.tar.gz: c21a4aec2c077e2617fb625debad28f746148ad98229a27a590a4412601e30759c709aa3a6e6d80e81c16160e16968fc0392181fc9c75e4da06578452f7c5ab6
6
+ metadata.gz: 57a1604358b0297b66604d1fc5a60a9d1bda05aa9bd5f6b91135ddc2aec4a6eb703c00ef4d905ac156170b190bf500481ce56cf6319f07e8b57447cca4c6a210
7
+ data.tar.gz: f4b88e66c9c4fcd2bcbca2fe882abdede7c531e1d5e752a2ac986e39cf51d87714852dcb6e7e8e4870b623d54b468cc8f3ec88c253e7182c1fe89c0af91366a4
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.5.3
1
+ 3.0.2
data/CHANGELOG.md CHANGED
@@ -9,6 +9,46 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.12.1
13
+ ### Added
14
+ - Support for Ruby 3.
15
+ ### Changed/Removed
16
+ - Removed support for Ruby 2.5 (as it's too old).
17
+ ### Fixed
18
+ - ...
19
+ ---
20
+
21
+ ## v0.12.0
22
+ ### Added
23
+ - `BrokenLinkFinder::link_xpath` and `link_xpath=` methods so you can customise how links are extracted from each crawled page using the API.
24
+ - An `--xpath` (or just `-x`) command line flag so you can customise how links are extracted when using the command line.
25
+ ### Changed/Removed
26
+ - Changed the default way in which links are extracted from a page. Previously, any element with an `href` or `src` attribute was extracted and checked; now only those links inside the `<body>` are extracted and checked, ignoring the `<head>` section entirely. You can change this behaviour back with `BrokenLinkFinder::link_xpath = '//*/@href | //*/@src'` before you perform a crawl. Alternatively, if using the command line, use the `--xpath '//*/@href | //*/@src'` option (quoted so the shell doesn't interpret the `|`).
27
+ ### Fixed
28
+ - [Scheme relative bug](https://github.com/michaeltelford/broken_link_finder/issues/16) by upgrading to `wgit v0.10.0`.
29
+ ---
30
+
31
+ ## v0.11.1
32
+ ### Added
33
+ - ...
34
+ ### Changed/Removed
35
+ - Updated the wgit gem to version 0.9.0, which contains improvements and bug fixes.
36
+ ### Fixed
37
+ - ...
38
+ ---
39
+
40
+ ## v0.11.0
41
+ ### Added
42
+ - Additional crawl statistics.
43
+ - Exit code handling to the executable: `0` for success, `1` for an error scenario.
44
+ ### Changed/Removed
45
+ - Updated the report formats slightly, bringing various improvements such as the total number of links crawled.
46
+ ### Fixed
47
+ - Bug in the HTML report; the summary URL is now an `<a>` link.
48
+ - Bug in `Finder@broken_link_map` URLs and `Finder#crawl_stats[:url]` URL during redirects.
49
+ - Bug causing an error when crawling unparsable/invalid URLs.
50
+ ---
51
+
12
52
  ## v0.10.0
13
53
  ### Added
14
54
  - A `--html` flag to the `crawl` executable command which produces a HTML report (instead of text).
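
The v0.12.0 entry above changes which links are extracted by default (only those inside `<body>`). As a rough, illustrative sketch of the options it mentions, assuming the `BrokenLinkFinder.link_xpath=` accessor and `crawl_page` alias shown later in this diff:

```ruby
require 'broken_link_finder'

# Restore the pre-v0.12.0 behaviour of checking every href/src attribute,
# including those inside <head> (value taken from the changelog entry above).
BrokenLinkFinder.link_xpath = '//*/@href | //*/@src'

finder = BrokenLinkFinder.new
finder.crawl_page 'http://txti.es' # => true if at least one broken link was found
```

From the command line, the equivalent is the `--xpath` flag described in the same entry.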
data/Gemfile CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  source 'https://rubygems.org'
4
4
 
5
- ruby '~> 2.5'
5
+ ruby '>= 2.6', '< 4'
6
6
 
7
7
  # Specify your gem's dependencies in broken_link_finder.gemspec
8
8
  gemspec
data/Gemfile.lock CHANGED
@@ -1,50 +1,63 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- broken_link_finder (0.10.0)
4
+ broken_link_finder (0.12.1)
5
5
  thor (~> 0.20)
6
6
  thread (~> 0.2)
7
- wgit (~> 0.5)
7
+ wgit (~> 0.10)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
11
11
  specs:
12
- addressable (2.6.0)
13
- public_suffix (>= 2.0.2, < 4.0)
14
- bson (4.6.0)
15
- byebug (11.0.1)
16
- coderay (1.1.2)
17
- crack (0.4.3)
18
- safe_yaml (~> 1.0.0)
19
- ethon (0.12.0)
20
- ffi (>= 1.3.0)
21
- ffi (1.11.3)
22
- hashdiff (1.0.0)
23
- maxitest (3.4.0)
24
- minitest (>= 5.0.0, < 5.13.0)
25
- method_source (0.9.2)
26
- mini_portile2 (2.4.0)
27
- minitest (5.12.2)
28
- mongo (2.11.1)
29
- bson (>= 4.6.0, < 5.0.0)
30
- nokogiri (1.10.5)
31
- mini_portile2 (~> 2.4.0)
32
- pry (0.12.2)
33
- coderay (~> 1.1.0)
34
- method_source (~> 0.9.0)
35
- public_suffix (3.1.0)
36
- rake (10.5.0)
37
- safe_yaml (1.0.5)
12
+ addressable (2.8.0)
13
+ public_suffix (>= 2.0.2, < 5.0)
14
+ bson (4.12.1)
15
+ byebug (11.1.3)
16
+ cliver (0.3.2)
17
+ coderay (1.1.3)
18
+ concurrent-ruby (1.1.9)
19
+ crack (0.4.5)
20
+ rexml
21
+ ethon (0.15.0)
22
+ ffi (>= 1.15.0)
23
+ ferrum (0.11)
24
+ addressable (~> 2.5)
25
+ cliver (~> 0.3)
26
+ concurrent-ruby (~> 1.1)
27
+ websocket-driver (>= 0.6, < 0.8)
28
+ ffi (1.15.4)
29
+ hashdiff (1.0.1)
30
+ maxitest (3.7.0)
31
+ minitest (>= 5.0.0, < 5.15.0)
32
+ method_source (1.0.0)
33
+ mini_portile2 (2.6.1)
34
+ minitest (5.14.4)
35
+ mongo (2.17.0)
36
+ bson (>= 4.8.2, < 5.0.0)
37
+ nokogiri (1.12.5)
38
+ mini_portile2 (~> 2.6.1)
39
+ racc (~> 1.4)
40
+ pry (0.14.1)
41
+ coderay (~> 1.1)
42
+ method_source (~> 1.0)
43
+ public_suffix (4.0.6)
44
+ racc (1.6.0)
45
+ rake (13.0.6)
46
+ rexml (3.2.5)
38
47
  thor (0.20.3)
39
48
  thread (0.2.2)
40
- typhoeus (1.3.1)
49
+ typhoeus (1.4.0)
41
50
  ethon (>= 0.9.0)
42
- webmock (3.7.6)
43
- addressable (>= 2.3.6)
51
+ webmock (3.14.0)
52
+ addressable (>= 2.8.0)
44
53
  crack (>= 0.3.2)
45
54
  hashdiff (>= 0.4.0, < 2.0.0)
46
- wgit (0.5.1)
55
+ websocket-driver (0.7.5)
56
+ websocket-extensions (>= 0.1.0)
57
+ websocket-extensions (0.1.5)
58
+ wgit (0.10.2)
47
59
  addressable (~> 2.6)
60
+ ferrum (~> 0.8)
48
61
  mongo (~> 2.9)
49
62
  nokogiri (~> 1.10)
50
63
  typhoeus (~> 1.3)
@@ -58,11 +71,11 @@ DEPENDENCIES
58
71
  byebug (~> 11.0)
59
72
  maxitest (~> 3.3)
60
73
  pry (~> 0.12)
61
- rake (~> 10.0)
74
+ rake (~> 13.0)
62
75
  webmock (~> 3.6)
63
76
 
64
77
  RUBY VERSION
65
- ruby 2.5.3p105
78
+ ruby 3.0.2p107
66
79
 
67
80
  BUNDLED WITH
68
- 2.0.2
81
+ 2.2.22
data/README.md CHANGED
@@ -1,14 +1,16 @@
1
1
  # Broken Link Finder
2
2
 
3
- Does what it says on the tin; Finds a website's broken links.
3
+ Does what it says on the tin - finds a website's broken links.
4
4
 
5
- Simply point it at a website and it will crawl all of its webpages searching for and identifing any broken links. You will then be presented with a concise summary of the broken links found.
5
+ Simply point it at a website and it will crawl all of its webpages searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
6
6
 
7
- Because `libcurl` is used under the hood, Broken Link Finder is fast!
7
+ Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!
8
8
 
9
9
  ## How It Works
10
10
 
11
- Any HTML page element with a `href` or `src` attribute is considered a link. For each link on a given page, any of the following conditions constitutes that the link is broken:
11
+ Any HTML element within `<body>` with an `href` or `src` attribute is considered a link (although this is [configurable](#Link-Extraction)).
12
+
13
+ For each link on a given page, any of the following conditions means the link is considered broken:
12
14
 
13
15
  - An empty HTML response body is returned.
14
16
  - A response status code of `404 Not Found` is returned.
@@ -29,27 +31,27 @@ With that said, the usual array of HTTP URL features are supported including anc
29
31
 
30
32
  ## Installation
31
33
 
32
- Add this line to your application's Gemfile:
34
+ Only MRI Ruby is tested and supported, but `broken_link_finder` may work with other Ruby implementations.
33
35
 
34
- ```ruby
35
- gem 'broken_link_finder'
36
- ```
36
+ Currently, the required MRI Ruby version is:
37
37
 
38
- And then execute:
38
+ `ruby '>= 2.6', '< 4'`
39
39
 
40
- $ bundle
40
+ ### Using Bundler
41
41
 
42
- Or install it yourself as:
42
+ $ bundle add broken_link_finder
43
+
44
+ ### Using RubyGems
43
45
 
44
46
  $ gem install broken_link_finder
45
47
 
46
- Finally, verify the installation with:
48
+ ### Verify
47
49
 
48
50
  $ broken_link_finder version
49
51
 
50
52
  ## Usage
51
53
 
52
- You can check for broken links via the library or executable.
54
+ You can check for broken links via the executable or library.
53
55
 
54
56
  ### Executable
55
57
 
@@ -91,9 +93,10 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
91
93
  If broken links are found then the output will look something like:
92
94
 
93
95
  ```text
94
- Crawled http://txti.es (7 page(s) in 7.88 seconds)
96
+ Crawled http://txti.es
97
+ 7 page(s) containing 32 unique link(s) in 6.82 seconds
95
98
 
96
- Found 6 broken link(s) across 2 page(s):
99
+ Found 6 unique broken link(s) across 2 page(s):
97
100
 
98
101
  The following broken links were found on 'http://txti.es/about':
99
102
  http://twitter.com/thebarrytone
@@ -105,7 +108,7 @@ The following broken links were found on 'http://txti.es/how':
105
108
  http://en.wikipedia.org/wiki/Markdown
106
109
  http://imgur.com
107
110
 
108
- Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
111
+ Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
109
112
 
110
113
  The following links were ignored on 'http://txti.es':
111
114
  tel:+13174562564
@@ -117,6 +120,35 @@ ftp://server.com
117
120
 
118
121
  You can provide the `--html` flag if you'd prefer a HTML based report.
119
122
 
123
+ ## Link Extraction
124
+
125
+ You can customise the XPath used to extract links from each crawled page. This can be done via the executable or library.
126
+
127
+ ### Executable
128
+
129
+ Add the `--xpath` (or `-x`) flag to the `crawl` command, e.g.
130
+
131
+ $ broken_link_finder crawl http://txti.es -x //img/@src
132
+
133
+ ### Library
134
+
135
+ Set the desired XPath using the accessor methods provided:
136
+
137
+ > main.rb
138
+
139
+ ```ruby
140
+ require 'broken_link_finder'
141
+
142
+ # Set your desired xpath before crawling...
143
+ BrokenLinkFinder::link_xpath = '//img/@src'
144
+
145
+ # Now crawl as normal and only your custom targeted links will be checked.
146
+ BrokenLinkFinder.new.crawl_page 'http://txti.es'
147
+
148
+ # Go back to using the default provided xpath as needed.
149
+ BrokenLinkFinder::link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH
150
+ ```
151
+
120
152
  ## Contributing
121
153
 
122
154
  Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
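
For context alongside the executable usage above, a minimal library usage sketch based on the `Finder` API shown further down this diff (the URL and options are illustrative only):

```ruby
require 'broken_link_finder'

# sort: :link groups the report by broken link rather than by page;
# max_threads caps the threads used by crawl_site (one per crawled page).
finder = BrokenLinkFinder::Finder.new(sort: :link, max_threads: 50)

finder.crawl_site 'http://txti.es'  # or crawl_page for a single page
finder.report(STDOUT, type: :html)  # type defaults to :text
```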
data/bin/console CHANGED
@@ -23,12 +23,14 @@ end
23
23
  # You can add fixtures and/or initialization code here...
24
24
  reload
25
25
 
26
- url = 'http://txti.es/'
27
- by_page = Finder.new
28
- by_link = Finder.new sort: :link
29
- finder = by_page
26
+ def url; @url ||= 'http://txti.es/'; end
27
+ def by_page; @by_page ||= Finder.new; end
28
+ def by_link; @by_link ||= Finder.new(sort: :link); end
29
+ def finder; @finder ||= by_page; end
30
30
 
31
31
  # Start the console.
32
- puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
32
+ puts
33
+ puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
34
+ puts
33
35
 
34
- binding.pry
36
+ Pry.start
data/bin/setup CHANGED
@@ -5,4 +5,4 @@ set -vx
5
5
 
6
6
  bundle install
7
7
 
8
- # Do any other automated setup that you need to do here
8
+ # Do any other automated setup that you need to do here...
data/broken_link_finder.gemspec CHANGED
@@ -38,16 +38,16 @@ Gem::Specification.new do |spec|
38
38
  spec.require_paths = ['lib']
39
39
  spec.post_install_message = "Added the executable 'broken_link_finder' to $PATH"
40
40
 
41
- spec.required_ruby_version = '~> 2.5'
41
+ spec.required_ruby_version = '>= 2.6', '< 4'
42
42
 
43
43
  spec.add_development_dependency 'bundler', '~> 2.0'
44
44
  spec.add_development_dependency 'byebug', '~> 11.0'
45
45
  spec.add_development_dependency 'maxitest', '~> 3.3'
46
46
  spec.add_development_dependency 'pry', '~> 0.12'
47
- spec.add_development_dependency 'rake', '~> 10.0'
47
+ spec.add_development_dependency 'rake', '~> 13.0'
48
48
  spec.add_development_dependency 'webmock', '~> 3.6'
49
49
 
50
50
  spec.add_runtime_dependency 'thor', '~> 0.20'
51
51
  spec.add_runtime_dependency 'thread', '~> 0.2'
52
- spec.add_runtime_dependency 'wgit', '~> 0.5'
52
+ spec.add_runtime_dependency 'wgit', '~> 0.10'
53
53
  end
data/exe/broken_link_finder CHANGED
@@ -9,6 +9,7 @@ class BrokenLinkFinderCLI < Thor
9
9
  desc 'crawl [URL]', 'Find broken links at the URL'
10
10
  option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
11
11
  option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
12
+ option :xpath, type: :string, aliases: [:x], default: BrokenLinkFinder::DEFAULT_LINK_XPATH
12
13
  option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
13
14
  option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
14
15
  option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
@@ -22,6 +23,7 @@ class BrokenLinkFinderCLI < Thor
22
23
  broken_verbose = !options[:concise]
23
24
  ignored_verbose = options[:verbose]
24
25
 
26
+ BrokenLinkFinder.link_xpath = options[:xpath]
25
27
  finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
26
28
  options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
27
29
  finder.report(
@@ -29,13 +31,19 @@ class BrokenLinkFinderCLI < Thor
29
31
  broken_verbose: broken_verbose,
30
32
  ignored_verbose: ignored_verbose
31
33
  )
32
- rescue Exception => e
34
+
35
+ exit 0
36
+ rescue StandardError => e
33
37
  puts "An error has occurred: #{e.message}"
38
+
39
+ exit 1
34
40
  end
35
41
 
36
42
  desc 'version', 'Display the currently installed version'
37
43
  def version
38
44
  puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
45
+
46
+ exit 0
39
47
  end
40
48
  end
41
49
 
data/lib/broken_link_finder/finder.rb CHANGED
@@ -1,48 +1,59 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
- DEFAULT_MAX_THREADS = 100
4
+ DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
5
+ SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.
5
6
 
6
7
  # Alias for BrokenLinkFinder::Finder.new.
7
8
  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
8
9
  Finder.new(sort: sort, max_threads: max_threads)
9
10
  end
10
11
 
12
+ # Class responsible for finding broken links on a page or site.
11
13
  class Finder
12
- attr_reader :sort, :max_threads, :broken_links, :ignored_links, :crawl_stats
14
+ # The collection key - either :page or :link.
15
+ attr_reader :sort
13
16
 
14
- # Creates a new Finder instance.
15
- def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
17
+ # The max number of threads created during #crawl_site - one thread per page.
18
+ attr_reader :max_threads
19
+
20
+ # Returns a new Finder instance.
21
+ def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
16
22
  raise "Sort by either :page or :link, not #{sort}" \
17
23
  unless %i[page link].include?(sort)
18
24
 
19
25
  @sort = sort
20
26
  @max_threads = max_threads
21
- @lock = Mutex.new
22
27
  @crawler = Wgit::Crawler.new
28
+ @manager = BrokenLinkFinder::LinkManager.new(@sort)
29
+ end
30
+
31
+ # Returns the current broken links.
32
+ def broken_links
33
+ @manager.broken_links
34
+ end
23
35
 
24
- reset_crawl
36
+ # Returns the current ignored links.
37
+ def ignored_links
38
+ @manager.ignored_links
25
39
  end
26
40
 
27
- # Clear/empty the link collection Hashes.
28
- def reset_crawl
29
- @broken_links = {}
30
- @ignored_links = {}
31
- @all_broken_links = Set.new # Used to prevent crawling a link twice.
32
- @all_intact_links = Set.new # "
33
- @broken_link_map = {} # Maps a link to its absolute form.
34
- @crawl_stats = {} # Records crawl stats e.g. duration etc.
41
+ # Returns the current crawl stats.
42
+ def crawl_stats
43
+ @manager.crawl_stats
35
44
  end
36
45
 
37
- # Finds broken links within a single page and appends them to the
38
- # @broken_links array. Returns true if at least one broken link was found.
46
+ # Finds broken links within a single page and records them.
47
+ # Returns true if at least one broken link was found.
39
48
  # Access the broken links afterwards with Finder#broken_links.
40
49
  def crawl_url(url)
41
- reset_crawl
50
+ @manager.empty
42
51
 
43
52
  start = Time.now
44
53
  url = url.to_url
45
- doc = @crawler.crawl(url)
54
+
55
+ # We dup the url to avoid recording any redirects.
56
+ doc = @crawler.crawl(url.dup)
46
57
 
47
58
  # Ensure the given page url is valid.
48
59
  raise "Invalid or broken URL: #{url}" unless doc
@@ -51,18 +62,17 @@ module BrokenLinkFinder
51
62
  find_broken_links(doc)
52
63
  retry_broken_links
53
64
 
54
- sort_links
55
- set_crawl_stats(url: url, pages_crawled: [url], start: start)
65
+ @manager.sort
66
+ @manager.tally(url: url, pages_crawled: [url], start: start)
56
67
 
57
- @broken_links.any?
68
+ broken_links.any?
58
69
  end
59
70
 
60
- # Finds broken links within an entire site and appends them to the
61
- # @broken_links array. Returns a tuple containing a Boolean of true if
62
- # at least one broken link was found and an Array of all pages crawled.
71
+ # Finds broken links within an entire site and records them.
72
+ # Returns true if at least one broken link was found.
63
73
  # Access the broken links afterwards with Finder#broken_links.
64
- def crawl_site(url)
65
- reset_crawl
74
+ def crawl_site(url, allow_paths: nil, disallow_paths: nil)
75
+ @manager.empty
66
76
 
67
77
  start = Time.now
68
78
  url = url.to_url
@@ -70,7 +80,9 @@ module BrokenLinkFinder
70
80
  crawled = Set.new
71
81
 
72
82
  # Crawl the site's HTML web pages looking for links.
73
- externals = @crawler.crawl_site(url) do |doc|
83
+ # We dup the url to avoid recording any redirects.
84
+ paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
85
+ externals = @crawler.crawl_site(url.dup, **paths) do |doc|
74
86
  crawled << doc.url
75
87
  next unless doc
76
88
 
@@ -78,35 +90,39 @@ module BrokenLinkFinder
78
90
  pool.process { find_broken_links(doc) }
79
91
  end
80
92
 
93
+ # Wait for all threads to finish, even if url was invalid.
94
+ pool.shutdown
95
+
81
96
  # Ensure the given website url is valid.
82
97
  raise "Invalid or broken URL: #{url}" unless externals
83
98
 
84
- # Wait for all threads to finish.
85
- pool.shutdown
86
99
  retry_broken_links
87
100
 
88
- sort_links
89
- set_crawl_stats(url: url, pages_crawled: crawled.to_a, start: start)
101
+ @manager.sort
102
+ @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)
90
103
 
91
- @broken_links.any?
104
+ broken_links.any?
105
+ ensure
106
+ pool.shutdown if defined?(pool)
92
107
  end
93
108
 
94
- # Pretty prints the link report into a stream e.g. STDOUT or a file,
109
+ # Outputs the link report into a stream e.g. STDOUT or a file,
95
110
  # anything that respond_to? :puts. Defaults to STDOUT.
96
- def report(stream = STDOUT,
97
- type: :text, broken_verbose: true, ignored_verbose: false)
111
+ def report(stream = STDOUT, type: :text,
112
+ broken_verbose: true, ignored_verbose: false)
98
113
  klass = case type
99
114
  when :text
100
115
  BrokenLinkFinder::TextReporter
101
116
  when :html
102
117
  BrokenLinkFinder::HTMLReporter
103
118
  else
104
- raise "type: must be :text or :html, not: :#{type}"
119
+ raise "The type: must be :text or :html, not: :#{type}"
105
120
  end
106
121
 
107
- reporter = klass.new(stream, @sort, @broken_links,
108
- @ignored_links, @broken_link_map, @crawl_stats)
109
- reporter.call(broken_verbose: broken_verbose,
122
+ reporter = klass.new(stream, @sort,
123
+ broken_links, ignored_links,
124
+ @manager.broken_link_map, crawl_stats)
125
+ reporter.call(broken_verbose: broken_verbose,
110
126
  ignored_verbose: ignored_verbose)
111
127
  end
112
128
 
@@ -114,26 +130,29 @@ module BrokenLinkFinder
114
130
 
115
131
  # Finds which links are unsupported or broken and records the details.
116
132
  def find_broken_links(page)
133
+ record_unparsable_links(page) # Record them as broken.
134
+
117
135
  links = get_supported_links(page)
118
136
 
119
137
  # Iterate over the supported links checking if they're broken or not.
120
138
  links.each do |link|
121
- # Skip if the link has been processed previously.
122
- next if @all_intact_links.include?(link)
139
+ # Skip if the link has been encountered previously.
140
+ next if @manager.all_intact_links.include?(link)
123
141
 
124
- if @all_broken_links.include?(link)
125
- append_broken_link(page.url, link) # Record on which page.
142
+ if @manager.all_broken_links.include?(link)
143
+ # The link has already been proven broken so simply record it.
144
+ @manager.append_broken_link(page, link, map: false)
126
145
  next
127
146
  end
128
147
 
129
- # The link hasn't been processed before so we crawl it.
148
+ # The link hasn't been encountered before so we crawl it.
130
149
  link_doc = crawl_link(page, link)
131
150
 
132
- # Determine if the crawled link is broken or not.
151
+ # Determine if the crawled link is broken or not and record it.
133
152
  if link_broken?(link_doc)
134
- append_broken_link(page.url, link, doc: page)
153
+ @manager.append_broken_link(page, link)
135
154
  else
136
- @lock.synchronize { @all_intact_links << link }
155
+ @manager.append_intact_link(link)
137
156
  end
138
157
  end
139
158
 
@@ -143,30 +162,47 @@ module BrokenLinkFinder
143
162
  # Implements a retry mechanism for each of the broken links found.
144
163
  # Removes any broken links found to be working OK.
145
164
  def retry_broken_links
146
- sleep(0.5) # Give the servers a break, then retry the links.
165
+ sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
166
+
167
+ @manager.broken_link_map.select! do |link, href|
168
+ # Don't retry unparsable links (which are Strings).
169
+ next(true) unless href.is_a?(Wgit::Url)
170
+
171
+ doc = @crawler.crawl(href.dup)
147
172
 
148
- @broken_link_map.each do |link, href|
149
- doc = @crawler.crawl(href)
150
- remove_broken_link(link) unless link_broken?(doc)
173
+ if link_broken?(doc)
174
+ true
175
+ else
176
+ @manager.remove_broken_link(link)
177
+ false
178
+ end
179
+ end
180
+ end
181
+
182
+ # Record each unparsable link as a broken link.
183
+ def record_unparsable_links(doc)
184
+ doc.unparsable_links.each do |link|
185
+ # We map the link ourselves because link is a String, not a Wgit::Url.
186
+ @manager.append_broken_link(doc, link, map: false)
187
+ @manager.broken_link_map[link] = link
151
188
  end
152
189
  end
153
190
 
154
191
  # Report and reject any non supported links. Any link that is absolute and
155
192
  # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
156
193
  def get_supported_links(doc)
157
- doc.all_links
158
- .reject do |link|
159
- if link.is_absolute? && !link.start_with?('http')
160
- append_ignored_link(doc.url, link)
161
- true
162
- end
163
- end
194
+ doc.all_links.reject do |link|
195
+ if link.is_absolute? && !link.start_with?('http')
196
+ @manager.append_ignored_link(doc.url, link)
197
+ true
198
+ end
199
+ end
164
200
  end
165
201
 
166
202
  # Make the link absolute and crawl it, returning its Wgit::Document.
167
203
  def crawl_link(doc, link)
168
- link = link.prefix_base(doc)
169
- @crawler.crawl(link)
204
+ link = link.make_absolute(doc)
205
+ @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
170
206
  end
171
207
 
172
208
  # Return if the crawled link is broken or not.
@@ -175,8 +211,9 @@ module BrokenLinkFinder
175
211
  end
176
212
 
177
213
  # Returns true if the link is/contains a broken anchor/fragment.
214
+ # E.g. for /about#top, the page should contain an HTML element with an @id of 'top'.
178
215
  def has_broken_anchor(doc)
179
- raise 'link document is nil' unless doc
216
+ raise 'The link document is nil' unless doc
180
217
 
181
218
  fragment = doc.url.fragment
182
219
  return false if fragment.nil? || fragment.empty?
@@ -184,80 +221,6 @@ module BrokenLinkFinder
184
221
  doc.xpath("//*[@id='#{fragment}']").empty?
185
222
  end
186
223
 
187
- # Append key => [value] to @broken_links.
188
- # If doc: is provided then the link will be recorded in absolute form.
189
- def append_broken_link(url, link, doc: nil)
190
- key, value = get_key_value(url, link)
191
-
192
- @lock.synchronize do
193
- @broken_links[key] = [] unless @broken_links[key]
194
- @broken_links[key] << value
195
-
196
- @all_broken_links << link
197
-
198
- @broken_link_map[link] = link.prefix_base(doc) if doc
199
- end
200
- end
201
-
202
- # Remove the broken_link from the necessary collections.
203
- def remove_broken_link(link)
204
- @lock.synchronize do
205
- if @sort == :page
206
- @broken_links.each { |_k, links| links.delete(link) }
207
- @broken_links.delete_if { |_k, links| links.empty? }
208
- else
209
- @broken_links.delete(link)
210
- end
211
-
212
- @all_broken_links.delete(link)
213
- @all_intact_links << link
214
- end
215
- end
216
-
217
- # Append key => [value] to @ignored_links.
218
- def append_ignored_link(url, link)
219
- key, value = get_key_value(url, link)
220
-
221
- @lock.synchronize do
222
- @ignored_links[key] = [] unless @ignored_links[key]
223
- @ignored_links[key] << value
224
- end
225
- end
226
-
227
- # Returns the correct key value depending on the @sort type.
228
- # @sort == :page ? [url, link] : [link, url]
229
- def get_key_value(url, link)
230
- case @sort
231
- when :page
232
- [url, link]
233
- when :link
234
- [link, url]
235
- else
236
- raise "Unsupported sort type: #{sort}"
237
- end
238
- end
239
-
240
- # Sort keys and values alphabetically.
241
- def sort_links
242
- @broken_links.values.map(&:uniq!)
243
- @ignored_links.values.map(&:uniq!)
244
-
245
- @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
246
- @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
247
-
248
- @broken_links.each { |_k, v| v.sort! }
249
- @ignored_links.each { |_k, v| v.sort! }
250
- end
251
-
252
- # Sets and returns the total number of links crawled.
253
- def set_crawl_stats(url:, pages_crawled:, start:)
254
- @crawl_stats[:url] = url
255
- @crawl_stats[:pages_crawled] = pages_crawled
256
- @crawl_stats[:num_pages] = pages_crawled.size
257
- @crawl_stats[:num_links] = @all_broken_links.size + @all_intact_links.size
258
- @crawl_stats[:duration] = Time.now - start
259
- end
260
-
261
224
  alias crawl_page crawl_url
262
225
  alias crawl_r crawl_site
263
226
  end
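
The reworked `crawl_site` above now accepts `allow_paths:` and `disallow_paths:`, which are forwarded to `Wgit::Crawler#crawl_site`. A hedged sketch of how that might be used; the site and path values are hypothetical:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder::Finder.new

# Skip any pages whose path matches the given (hypothetical) values.
finder.crawl_site 'http://example.com', disallow_paths: ['archive', 'tags']

puts finder.crawl_stats[:num_links] # total unique links crawled
puts finder.broken_links            # Hash of page URL => [broken links]
```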
data/lib/broken_link_finder/link_manager.rb ADDED
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ module BrokenLinkFinder
4
+ # Class responsible for handling the link collection logic.
5
+ class LinkManager
6
+ # Used for mapping pages to broken links.
7
+ attr_reader :broken_links
8
+
9
+ # Used for mapping pages to ignored links.
10
+ attr_reader :ignored_links
11
+
12
+ # Used to record crawl statistics e.g. duration etc.
13
+ attr_reader :crawl_stats
14
+
15
+ # Used to map a link (as is) to its absolute (crawlable) form.
16
+ attr_reader :broken_link_map
17
+
18
+ # Used to prevent crawling a broken link twice.
19
+ attr_reader :all_broken_links
20
+
21
+ # Used to prevent crawling an intact link twice.
22
+ attr_reader :all_intact_links
23
+
24
+ # Used for building crawl statistics.
25
+ attr_reader :all_ignored_links
26
+
27
+ # Returns a new LinkManager instance with empty link collections.
28
+ def initialize(sort)
29
+ raise "Sort by either :page or :link, not #{sort}" \
30
+ unless %i[page link].include?(sort)
31
+
32
+ @sort = sort
33
+ @lock = Mutex.new
34
+
35
+ empty # Initialises the link collections.
36
+ end
37
+
38
+ # Initialise/empty the link collection objects.
39
+ def empty
40
+ @broken_links = {}
41
+ @ignored_links = {}
42
+ @crawl_stats = {}
43
+ @broken_link_map = {}
44
+ @all_broken_links = Set.new
45
+ @all_intact_links = Set.new
46
+ @all_ignored_links = Set.new
47
+ end
48
+
49
+ # Append key => [value] to the broken link collections.
50
+ # If map: true, then the link will also be recorded in @broken_link_map.
51
+ def append_broken_link(doc, link, map: true)
52
+ key, value = get_key_value(doc.url, link)
53
+
54
+ @lock.synchronize do
55
+ @broken_links[key] = [] unless @broken_links[key]
56
+ @broken_links[key] << value
57
+
58
+ @all_broken_links << link
59
+
60
+ @broken_link_map[link] = link.make_absolute(doc) if map
61
+ end
62
+ end
63
+
64
+ # Remove the broken link from the necessary collections.
65
+ def remove_broken_link(link)
66
+ @lock.synchronize do
67
+ if @sort == :page
68
+ @broken_links.each { |_k, links| links.delete(link) }
69
+ @broken_links.delete_if { |_k, links| links.empty? }
70
+ else
71
+ @broken_links.delete(link)
72
+ end
73
+
74
+ @all_broken_links.delete(link)
75
+ @all_intact_links << link
76
+ end
77
+ end
78
+
79
+ # Append key => [value] to the ignored link collections.
80
+ def append_ignored_link(url, link)
81
+ key, value = get_key_value(url, link)
82
+
83
+ @lock.synchronize do
84
+ @ignored_links[key] = [] unless @ignored_links[key]
85
+ @ignored_links[key] << value
86
+
87
+ @all_ignored_links << link
88
+ end
89
+ end
90
+
91
+ # Append link to @all_intact_links.
92
+ def append_intact_link(link)
93
+ @lock.synchronize { @all_intact_links << link }
94
+ end
95
+
96
+ # Sorts the link collection's keys and values alphabetically.
97
+ def sort
98
+ @broken_links.values.map(&:uniq!)
99
+ @ignored_links.values.map(&:uniq!)
100
+
101
+ @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
102
+ @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
103
+
104
+ @broken_links.each { |_k, v| v.sort! }
105
+ @ignored_links.each { |_k, v| v.sort! }
106
+ end
107
+
108
+ # Tallies up various statistics about the crawl and its links.
109
+ def tally(url:, pages_crawled:, start:)
110
+ @crawl_stats[:url] = url
111
+ @crawl_stats[:pages_crawled] = pages_crawled
112
+ @crawl_stats[:num_pages] = pages_crawled.size
113
+ @crawl_stats[:num_links] = (
114
+ @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
115
+ )
116
+ @crawl_stats[:num_broken_links] = @all_broken_links.size
117
+ @crawl_stats[:num_intact_links] = @all_intact_links.size
118
+ @crawl_stats[:num_ignored_links] = @all_ignored_links.size
119
+ @crawl_stats[:duration] = Time.now - start
120
+ end
121
+
122
+ private
123
+
124
+ # Returns the correct key value depending on the @sort type.
125
+ # @sort == :page ? [url, link] : [link, url]
126
+ def get_key_value(url, link)
127
+ case @sort
128
+ when :page
129
+ [url, link]
130
+ when :link
131
+ [link, url]
132
+ else
133
+ raise "Unsupported sort type: #{sort}"
134
+ end
135
+ end
136
+ end
137
+ end
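
`LinkManager#get_key_value` above is what gives the two report orientations. Purely as an illustration of the resulting `Finder#broken_links` shape (example values taken from the README's sample output):

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder::Finder.new(sort: :link)
finder.crawl_page 'http://txti.es/about'

# With sort: :page (the default) the Hash is keyed by page URL:
#   { "http://txti.es/about" => ["http://twitter.com/thebarrytone", ...] }
# With sort: :link it is keyed by broken link instead:
#   { "http://twitter.com/thebarrytone" => ["http://txti.es/about", ...] }
finder.broken_links
```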
data/lib/broken_link_finder/reporter/html_reporter.rb CHANGED
@@ -1,8 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
+ # Class responsible for reporting in an HTML format.
4
5
  class HTMLReporter < Reporter
5
- # Creates a new HTMLReporter instance.
6
+ # Returns a new HTMLReporter instance.
6
7
  # stream is any Object that responds to :puts and :print.
7
8
  def initialize(stream, sort,
8
9
  broken_links, ignored_links,
@@ -28,9 +29,11 @@ module BrokenLinkFinder
28
29
  # Report a summary of the overall crawl.
29
30
  def report_crawl_summary
30
31
  puts format(
31
- '<p class="crawl_summary">Crawled %s (%s page(s) in %s seconds)</p>',
32
+ '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
33
+ @crawl_stats[:url],
32
34
  @crawl_stats[:url],
33
35
  @crawl_stats[:num_pages],
36
+ @crawl_stats[:num_links],
34
37
  @crawl_stats[:duration]&.truncate(2)
35
38
  )
36
39
  end
@@ -43,7 +46,7 @@ module BrokenLinkFinder
43
46
  puts_summary 'Good news, there are no broken links!', type: :broken
44
47
  else
45
48
  num_pages, num_links = get_hash_stats(@broken_links)
46
- puts_summary "Found #{num_links} broken link(s) across #{num_pages} page(s):", type: :broken
49
+ puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken
47
50
 
48
51
  @broken_links.each do |key, values|
49
52
  puts_group(key, type: :broken) # Puts the opening <p> element.
@@ -70,7 +73,7 @@ module BrokenLinkFinder
70
73
 
71
74
  if @ignored_links.any?
72
75
  num_pages, num_links = get_hash_stats(@ignored_links)
73
- puts_summary "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
76
+ puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
74
77
 
75
78
  @ignored_links.each do |key, values|
76
79
  puts_group(key, type: :ignored) # Puts the opening <p> element.
@@ -125,8 +128,8 @@ module BrokenLinkFinder
125
128
  end
126
129
 
127
130
  def build_url(link)
128
- return link if link.to_url.absolute?
129
- @broken_link_map.fetch(link)
131
+ href = @broken_link_map[link]
132
+ href || link
130
133
  end
131
134
 
132
135
  alias_method :report, :call
data/lib/broken_link_finder/reporter/reporter.rb CHANGED
@@ -6,7 +6,7 @@ module BrokenLinkFinder
6
6
  # The amount of pages/links to display when verbose is false.
7
7
  NUM_VALUES = 3
8
8
 
9
- # Creates a new Reporter instance.
9
+ # Returns a new Reporter instance.
10
10
  # stream is any Object that responds to :puts and :print.
11
11
  def initialize(stream, sort,
12
12
  broken_links, ignored_links,
@@ -42,8 +42,7 @@ module BrokenLinkFinder
42
42
  # Use like: `num_pages, num_links = get_hash_stats(links)`.
43
43
  def get_hash_stats(hash)
44
44
  num_keys = hash.keys.length
45
- values = hash.values.flatten
46
- num_values = sort_by_page? ? values.length : values.uniq.length
45
+ num_values = hash.values.flatten.uniq.length
47
46
 
48
47
  sort_by_page? ?
49
48
  [num_keys, num_values] :
data/lib/broken_link_finder/reporter/text_reporter.rb CHANGED
@@ -1,8 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
+ # Class responsible for reporting in a text format.
4
5
  class TextReporter < Reporter
5
- # Creates a new TextReporter instance.
6
+ # Returns a new TextReporter instance.
6
7
  # stream is any Object that responds to :puts and :print.
7
8
  def initialize(stream, sort,
8
9
  broken_links, ignored_links,
@@ -23,10 +24,11 @@ module BrokenLinkFinder
23
24
 
24
25
  # Report a summary of the overall crawl.
25
26
  def report_crawl_summary
27
+ puts "Crawled #{@crawl_stats[:url]}"
26
28
  putsn format(
27
- 'Crawled %s (%s page(s) in %s seconds)',
28
- @crawl_stats[:url],
29
+ '%s page(s) containing %s unique link(s) in %s seconds',
29
30
  @crawl_stats[:num_pages],
31
+ @crawl_stats[:num_links],
30
32
  @crawl_stats[:duration]&.truncate(2)
31
33
  )
32
34
  end
@@ -37,7 +39,7 @@ module BrokenLinkFinder
37
39
  puts 'Good news, there are no broken links!'
38
40
  else
39
41
  num_pages, num_links = get_hash_stats(@broken_links)
40
- puts "Found #{num_links} broken link(s) across #{num_pages} page(s):"
42
+ puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"
41
43
 
42
44
  @broken_links.each do |key, values|
43
45
  msg = sort_by_page? ?
@@ -61,7 +63,7 @@ module BrokenLinkFinder
61
63
  def report_ignored_links(verbose: false)
62
64
  if @ignored_links.any?
63
65
  num_pages, num_links = get_hash_stats(@ignored_links)
64
- nputs "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:"
66
+ nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"
65
67
 
66
68
  @ignored_links.each do |key, values|
67
69
  msg = sort_by_page? ?
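
As the `Reporter` classes above only need a stream that responds to `puts`, a report can be written to a file as easily as to STDOUT. A small sketch (the file name is illustrative):

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site 'http://txti.es'

# Any object responding to :puts (and :print) can act as the report stream.
File.open('broken_links.html', 'w') do |file|
  finder.report(file, type: :html, ignored_verbose: true)
end
```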
data/lib/broken_link_finder/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BrokenLinkFinder
4
- VERSION = '0.10.0'
4
+ VERSION = '0.12.1'
5
5
  end
data/lib/broken_link_finder/wgit_extensions.rb CHANGED
@@ -1,11 +1,31 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # We extract all the Document's links, not just the links to other webpages.
4
- Wgit::Document.define_extension(
3
+ # Define a method on each doc for recording unparsable links.
4
+ # Unparsable links are recorded as broken links by Finder.
5
+ class Wgit::Document
6
+ def unparsable_links
7
+ @unparsable_links ||= []
8
+ end
9
+ end
10
+
11
+ # Returns a Wgit::Url or nil (if link is unparsable).
12
+ # A proc is preferrable to a function to avoid polluting the global namespace.
13
+ parse_link = lambda do |doc, link|
14
+ Wgit::Url.new(link)
15
+ rescue StandardError
16
+ doc.unparsable_links << link
17
+ nil
18
+ end
19
+
20
+ # Define a custom extractor for all page links we're interested in checking.
21
+ Wgit::Document.define_extractor(
5
22
  :all_links,
6
- '//*/@href | //*/@src', # Any element with a href or src attribute.
23
+ lambda { BrokenLinkFinder::link_xpath },
7
24
  singleton: false,
8
25
  text_content_only: true
9
- ) do |links|
10
- links.uniq.to_urls
26
+ ) do |links, doc|
27
+ links
28
+ .uniq
29
+ .map { |link| parse_link.call(doc, link) }
30
+ .compact
11
31
  end
data/lib/broken_link_finder/xpath.rb ADDED
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module BrokenLinkFinder
4
+ # Extract all the Document's <body> links e.g. <a>, <img>, <script> etc.
5
+ DEFAULT_LINK_XPATH = '/html/body//*/@href | /html/body//*/@src'
6
+
7
+ @link_xpath = DEFAULT_LINK_XPATH
8
+
9
+ class << self
10
+ # The xpath used to extract links from a crawled page.
11
+ # Can be overridden as required.
12
+ attr_accessor :link_xpath
13
+ end
14
+ end
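
`link_xpath` above can also narrow the crawl rather than widen it. For example (the XPath value is hypothetical, not one shipped with the gem):

```ruby
require 'broken_link_finder'

# Only check anchor tags within the page body (hypothetical XPath).
BrokenLinkFinder.link_xpath = '/html/body//a/@href'
BrokenLinkFinder.new.crawl_page 'http://txti.es'

# Restore the shipped default afterwards.
BrokenLinkFinder.link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH
```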
data/lib/broken_link_finder.rb CHANGED
@@ -5,8 +5,10 @@ require 'wgit/core_ext'
5
5
  require 'thread/pool'
6
6
  require 'set'
7
7
 
8
- require_relative './broken_link_finder/wgit_extensions'
9
8
  require_relative './broken_link_finder/version'
9
+ require_relative './broken_link_finder/xpath'
10
+ require_relative './broken_link_finder/wgit_extensions'
11
+ require_relative './broken_link_finder/link_manager'
10
12
  require_relative './broken_link_finder/reporter/reporter'
11
13
  require_relative './broken_link_finder/reporter/text_reporter'
12
14
  require_relative './broken_link_finder/reporter/html_reporter'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: broken_link_finder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.12.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-11-28 00:00:00.000000000 Z
11
+ date: 2021-11-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -72,14 +72,14 @@ dependencies:
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '10.0'
75
+ version: '13.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '10.0'
82
+ version: '13.0'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: webmock
85
85
  requirement: !ruby/object:Gem::Requirement
@@ -128,14 +128,14 @@ dependencies:
128
128
  requirements:
129
129
  - - "~>"
130
130
  - !ruby/object:Gem::Version
131
- version: '0.5'
131
+ version: '0.10'
132
132
  type: :runtime
133
133
  prerelease: false
134
134
  version_requirements: !ruby/object:Gem::Requirement
135
135
  requirements:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
- version: '0.5'
138
+ version: '0.10'
139
139
  description: Finds a website's broken links using the 'wgit' gem and reports back
140
140
  to you with a summary.
141
141
  email: michael.telford@live.com
@@ -159,11 +159,13 @@ files:
159
159
  - exe/broken_link_finder
160
160
  - lib/broken_link_finder.rb
161
161
  - lib/broken_link_finder/finder.rb
162
+ - lib/broken_link_finder/link_manager.rb
162
163
  - lib/broken_link_finder/reporter/html_reporter.rb
163
164
  - lib/broken_link_finder/reporter/reporter.rb
164
165
  - lib/broken_link_finder/reporter/text_reporter.rb
165
166
  - lib/broken_link_finder/version.rb
166
167
  - lib/broken_link_finder/wgit_extensions.rb
168
+ - lib/broken_link_finder/xpath.rb
167
169
  - load.rb
168
170
  homepage: https://github.com/michaeltelford/broken-link-finder
169
171
  licenses:
@@ -180,17 +182,20 @@ require_paths:
180
182
  - lib
181
183
  required_ruby_version: !ruby/object:Gem::Requirement
182
184
  requirements:
183
- - - "~>"
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '2.6'
188
+ - - "<"
184
189
  - !ruby/object:Gem::Version
185
- version: '2.5'
190
+ version: '4'
186
191
  required_rubygems_version: !ruby/object:Gem::Requirement
187
192
  requirements:
188
193
  - - ">="
189
194
  - !ruby/object:Gem::Version
190
195
  version: '0'
191
196
  requirements: []
192
- rubygems_version: 3.0.6
193
- signing_key:
197
+ rubygems_version: 3.2.22
198
+ signing_key:
194
199
  specification_version: 4
195
200
  summary: Finds a website's broken links and reports back to you with a summary.
196
201
  test_files: []