broken_link_finder 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Gemfile.lock +9 -9
- data/README.md +4 -3
- data/bin/console +8 -6
- data/broken_link_finder.gemspec +1 -1
- data/exe/broken_link_finder +7 -1
- data/lib/broken_link_finder/finder.rb +80 -47
- data/lib/broken_link_finder/reporter/html_reporter.rb +7 -5
- data/lib/broken_link_finder/reporter/reporter.rb +1 -2
- data/lib/broken_link_finder/reporter/text_reporter.rb +5 -4
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +24 -4
- metadata +4 -4
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 77094cfe9d0790770b5c34b86bc578fc65d0e425dc089c4fda41a3c587af6e00
+  data.tar.gz: 40f7f59411744bcd010c46bf4bdc17e59dbd4bd191bc33613c2d2bf269ba79ba
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4f3f4b7720d24c393fb844ed62159870bde4dd4222a8b0ec69b4fff7b96086b909df63834ef56a1b71e2d68e4ec319357f208273b3be79d81c982602b7a53b8a
+  data.tar.gz: c5af07c99199765688672ca396e19db9093ca0cd32c5a9e37810909787892c5070c729b275fcc6a126ea71bf2bdab4c5616b98643f74a3266775d112d4a8c274
data/CHANGELOG.md
CHANGED

@@ -9,6 +9,18 @@
 - ...
 ---
 
+## v0.11.0
+### Added
+- Additional crawl statistics.
+- Exit code handling to executable. `0` for success, `1` for an error scenario.
+### Changed/Removed
+- Updated the report formats slightly bringing various improvements such as the total number of links crawled etc.
+### Fixed
+- Bug in html report, summary url is now an `<a>` link.
+- Bug in `Finder@broken_link_map` URLs and `Finder#crawl_stats[:url]` URL during redirects.
+- Bug causing an error on crawling unparsable/invalid URL's.
+---
+
 ## v0.10.0
 ### Added
 - A `--html` flag to the `crawl` executable command which produces a HTML report (instead of text).
data/Gemfile.lock
CHANGED

@@ -1,33 +1,33 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.10.0)
+    broken_link_finder (0.11.0)
       thor (~> 0.20)
       thread (~> 0.2)
-      wgit (~> 0.
+      wgit (~> 0.8)
 
 GEM
   remote: https://rubygems.org/
   specs:
     addressable (2.6.0)
       public_suffix (>= 2.0.2, < 4.0)
-    bson (4.
+    bson (4.7.1)
     byebug (11.0.1)
     coderay (1.1.2)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
     ethon (0.12.0)
       ffi (>= 1.3.0)
-    ffi (1.
+    ffi (1.12.1)
     hashdiff (1.0.0)
     maxitest (3.4.0)
       minitest (>= 5.0.0, < 5.13.0)
     method_source (0.9.2)
     mini_portile2 (2.4.0)
     minitest (5.12.2)
-    mongo (2.11.
-      bson (>= 4.
-    nokogiri (1.10.
+    mongo (2.11.3)
+      bson (>= 4.4.2, < 5.0.0)
+    nokogiri (1.10.7)
       mini_portile2 (~> 2.4.0)
     pry (0.12.2)
       coderay (~> 1.1.0)

@@ -43,7 +43,7 @@ GEM
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff (>= 0.4.0, < 2.0.0)
-    wgit (0.
+    wgit (0.8.0)
       addressable (~> 2.6)
       mongo (~> 2.9)
       nokogiri (~> 1.10)

@@ -65,4 +65,4 @@ RUBY VERSION
    ruby 2.5.3p105
 
 BUNDLED WITH
-   2.
+   2.1.4
data/README.md
CHANGED

@@ -91,9 +91,10 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
 If broken links are found then the output will look something like:
 
 ```text
-Crawled http://txti.es
+Crawled http://txti.es
+7 page(s) containing 32 unique link(s) in 6.82 seconds
 
-Found 6 broken link(s) across 2 page(s):
+Found 6 unique broken link(s) across 2 page(s):
 
 The following broken links were found on 'http://txti.es/about':
 http://twitter.com/thebarrytone

@@ -105,7 +106,7 @@ The following broken links were found on 'http://txti.es/how':
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
 
-Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
 
 The following links were ignored on 'http://txti.es':
 tel:+13174562564
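For context, a minimal library-level sketch that produces a report like the one above. This is an assumed usage example, not part of the diff; the method names (`crawl_site`, `report`) match those defined in `finder.rb` further down:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new        # Defaults to sort: :page.
finder.crawl_site('http://txti.es')  # Or crawl_url/crawl_page for a single page.
finder.report                        # Writes the text report shown above to STDOUT.
```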
data/bin/console
CHANGED

@@ -23,12 +23,14 @@ end
 # You can add fixtures and/or initialization code here...
 reload
 
-url
-by_page
-by_link
-finder
+def url; @url ||= 'http://txti.es/'; end
+def by_page; @by_page ||= Finder.new; end
+def by_link; @by_link ||= Finder.new(sort: :link); end
+def finder; @finder ||= by_page; end
 
 # Start the console.
-puts
+puts
+puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+puts
 
-
+Pry.start
data/broken_link_finder.gemspec
CHANGED
data/exe/broken_link_finder
CHANGED

@@ -29,13 +29,19 @@ class BrokenLinkFinderCLI < Thor
       broken_verbose: broken_verbose,
       ignored_verbose: ignored_verbose
     )
-
+
+    exit 0
+  rescue StandardError => e
     puts "An error has occurred: #{e.message}"
+
+    exit 1
   end
 
   desc 'version', 'Display the currently installed version'
   def version
     puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+    exit 0
   end
 end
 
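With the exit codes added above, the executable becomes scriptable; a short sketch of consuming them from Ruby (the `crawl` subcommand is the one documented in the v0.10.0 changelog entry, and the URL is illustrative):

```ruby
# Kernel#system returns true when the child process exits 0, false otherwise,
# so v0.11.0's `exit 0`/`exit 1` let a wrapper branch on crawl success.
ok = system('broken_link_finder', 'crawl', 'http://txti.es')
puts ok ? 'Crawl completed without errors' : "Crawl failed (exit #{$?.exitstatus})"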
data/lib/broken_link_finder/finder.rb
CHANGED

@@ -2,6 +2,7 @@
 
 module BrokenLinkFinder
   DEFAULT_MAX_THREADS = 100
+  SERVER_WAIT_TIME = 0.5
 
   # Alias for BrokenLinkFinder::Finder.new.
   def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)

@@ -24,25 +25,28 @@ module BrokenLinkFinder
       reset_crawl
     end
 
-    # Clear/empty the link collection
+    # Clear/empty the link collection objects.
     def reset_crawl
-      @broken_links
-      @ignored_links
-      @all_broken_links
-      @all_intact_links
-      @
-      @
+      @broken_links = {} # Used for mapping pages to broken links.
+      @ignored_links = {} # Used for mapping pages to ignored links.
+      @all_broken_links = Set.new # Used to prevent crawling a broken link twice.
+      @all_intact_links = Set.new # Used to prevent crawling an intact link twice.
+      @all_ignored_links = Set.new # Used for building crawl statistics.
+      @broken_link_map = {} # Maps a link to its absolute (crawlable) form.
+      @crawl_stats = {} # Records crawl stats e.g. duration etc.
     end
 
-    # Finds broken links within a single page and
-    #
+    # Finds broken links within a single page and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
    def crawl_url(url)
       reset_crawl
 
       start = Time.now
       url = url.to_url
-
+
+      # We dup the url to avoid recording any redirects.
+      doc = @crawler.crawl(url.dup)
 
       # Ensure the given page url is valid.
       raise "Invalid or broken URL: #{url}" unless doc

@@ -57,9 +61,8 @@ module BrokenLinkFinder
       @broken_links.any?
     end
 
-    # Finds broken links within an entire site and
-    #
-    # at least one broken link was found and an Array of all pages crawled.
+    # Finds broken links within an entire site and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
     def crawl_site(url)
       reset_crawl

@@ -70,7 +73,8 @@ module BrokenLinkFinder
       crawled = Set.new
 
       # Crawl the site's HTML web pages looking for links.
-
+      # We dup the url to avoid recording any redirects.
+      externals = @crawler.crawl_site(url.dup) do |doc|
         crawled << doc.url
         next unless doc
 

@@ -91,22 +95,23 @@ module BrokenLinkFinder
       @broken_links.any?
     end
 
-    #
+    # Outputs the link report into a stream e.g. STDOUT or a file,
     # anything that respond_to? :puts. Defaults to STDOUT.
-    def report(stream = STDOUT,
-
+    def report(stream = STDOUT, type: :text,
+               broken_verbose: true, ignored_verbose: false)
       klass = case type
               when :text
                 BrokenLinkFinder::TextReporter
               when :html
                 BrokenLinkFinder::HTMLReporter
               else
-                raise "type: must be :text or :html, not: :#{type}"
+                raise "The type: must be :text or :html, not: :#{type}"
              end
 
-      reporter = klass.new(stream, @sort,
-      @
-
+      reporter = klass.new(stream, @sort,
+                           @broken_links, @ignored_links,
+                           @broken_link_map, @crawl_stats)
+      reporter.call(broken_verbose: broken_verbose,
                     ignored_verbose: ignored_verbose)
     end
 

@@ -114,25 +119,28 @@ module BrokenLinkFinder
 
     # Finds which links are unsupported or broken and records the details.
     def find_broken_links(page)
+      process_unparsable_links(page) # Record them as broken.
+
       links = get_supported_links(page)
 
       # Iterate over the supported links checking if they're broken or not.
       links.each do |link|
-        # Skip if the link has been
+        # Skip if the link has been encountered previously.
         next if @all_intact_links.include?(link)
 
         if @all_broken_links.include?(link)
-
+          # The link has already been proven broken so simply record it.
+          append_broken_link(page, link, map: false)
           next
         end
 
-        # The link hasn't been
+        # The link hasn't been encountered before so we crawl it.
         link_doc = crawl_link(page, link)
 
-        # Determine if the crawled link is broken or not.
+        # Determine if the crawled link is broken or not and record it.
        if link_broken?(link_doc)
-          append_broken_link(page
-        else
+          append_broken_link(page, link)
+        else # Record it as being intact.
           @lock.synchronize { @all_intact_links << link }
         end
       end

@@ -140,14 +148,31 @@ module BrokenLinkFinder
       nil
     end
 
+    # Record each unparsable link as a broken link.
+    def process_unparsable_links(doc)
+      doc.unparsable_links.each do |link|
+        append_broken_link(doc, link, map: false)
+        @broken_link_map[link] = link
+      end
+    end
+
     # Implements a retry mechanism for each of the broken links found.
     # Removes any broken links found to be working OK.
     def retry_broken_links
-      sleep(
+      sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
 
-      @broken_link_map.
-
-
+      @broken_link_map.select! do |link, href|
+        # Don't retry unparsable links (which are Strings).
+        next(true) unless href.is_a?(Wgit::Url)
+
+        doc = @crawler.crawl(href.dup)
+
+        if link_broken?(doc)
+          true
+        else
+          remove_broken_link(link)
+          false
+        end
       end
     end
 

@@ -166,7 +191,7 @@ module BrokenLinkFinder
     # Make the link absolute and crawl it, returning its Wgit::Document.
     def crawl_link(doc, link)
       link = link.prefix_base(doc)
-      @crawler.crawl(link)
+      @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
     end
 
     # Return if the crawled link is broken or not.

@@ -175,8 +200,9 @@ module BrokenLinkFinder
     end
 
     # Returns true if the link is/contains a broken anchor/fragment.
+    # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
     def has_broken_anchor(doc)
-      raise 'link document is nil' unless doc
+      raise 'The link document is nil' unless doc
 
       fragment = doc.url.fragment
       return false if fragment.nil? || fragment.empty?

@@ -184,22 +210,22 @@ module BrokenLinkFinder
       doc.xpath("//*[@id='#{fragment}']").empty?
     end
 
-    # Append key => [value] to
-    # If
-    def append_broken_link(
-      key, value = get_key_value(url, link)
+    # Append key => [value] to the broken link collections.
+    # If map: true, then the link will also be recorded in @broken_link_map.
+    def append_broken_link(doc, link, map: true)
+      key, value = get_key_value(doc.url, link)
 
       @lock.synchronize do
         @broken_links[key] = [] unless @broken_links[key]
         @broken_links[key] << value
 
-        @all_broken_links
+        @all_broken_links << link
 
-        @broken_link_map[link] = link.prefix_base(doc) if
+        @broken_link_map[link] = link.prefix_base(doc) if map
       end
     end
 
-    # Remove the
+    # Remove the broken link from the necessary collections.
     def remove_broken_link(link)
       @lock.synchronize do
         if @sort == :page

@@ -214,13 +240,15 @@ module BrokenLinkFinder
       end
     end
 
-    # Append key => [value] to
+    # Append key => [value] to the ignored link collections.
     def append_ignored_link(url, link)
       key, value = get_key_value(url, link)
 
       @lock.synchronize do
         @ignored_links[key] = [] unless @ignored_links[key]
         @ignored_links[key] << value
+
+        @all_ignored_links << link
       end
     end
 

@@ -249,13 +277,18 @@ module BrokenLinkFinder
       @ignored_links.each { |_k, v| v.sort! }
     end
 
-    # Sets
+    # Sets various statistics about the crawl and its links.
     def set_crawl_stats(url:, pages_crawled:, start:)
-      @crawl_stats[:url]
-      @crawl_stats[:pages_crawled]
-      @crawl_stats[:num_pages]
-      @crawl_stats[:num_links]
-
+      @crawl_stats[:url] = url
+      @crawl_stats[:pages_crawled] = pages_crawled
+      @crawl_stats[:num_pages] = pages_crawled.size
+      @crawl_stats[:num_links] = (
+        @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
+      )
+      @crawl_stats[:num_broken_links] = @all_broken_links.size
+      @crawl_stats[:num_intact_links] = @all_intact_links.size
+      @crawl_stats[:num_ignored_links] = @all_ignored_links.size
+      @crawl_stats[:duration] = Time.now - start
     end
 
     alias crawl_page crawl_url
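The expanded `set_crawl_stats` above is what feeds the new "Additional crawl statistics" changelog entry. A sketch of reading the stats after a crawl (the key names are those set above; `Finder#crawl_stats` is the reader referenced in the changelog):

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_url('http://txti.es')

stats = finder.crawl_stats
puts "Crawled #{stats[:url]}"
puts "#{stats[:num_pages]} page(s) containing #{stats[:num_links]} " \
     "unique link(s) in #{stats[:duration]&.truncate(2)} seconds"
puts "Broken: #{stats[:num_broken_links]}, intact: #{stats[:num_intact_links]}, " \
     "ignored: #{stats[:num_ignored_links]}"
```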
data/lib/broken_link_finder/reporter/html_reporter.rb
CHANGED

@@ -28,9 +28,11 @@ module BrokenLinkFinder
     # Report a summary of the overall crawl.
     def report_crawl_summary
       puts format(
-        '<p class="crawl_summary">Crawled %s (%s
+        '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
+        @crawl_stats[:url],
         @crawl_stats[:url],
         @crawl_stats[:num_pages],
+        @crawl_stats[:num_links],
         @crawl_stats[:duration]&.truncate(2)
       )
     end

@@ -43,7 +45,7 @@ module BrokenLinkFinder
         puts_summary 'Good news, there are no broken links!', type: :broken
       else
         num_pages, num_links = get_hash_stats(@broken_links)
-        puts_summary "Found #{num_links} broken link(s) across #{num_pages} page(s):", type: :broken
+        puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken
 
         @broken_links.each do |key, values|
           puts_group(key, type: :broken) # Puts the opening <p> element.

@@ -70,7 +72,7 @@ module BrokenLinkFinder
 
       if @ignored_links.any?
         num_pages, num_links = get_hash_stats(@ignored_links)
-        puts_summary "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
+        puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
 
         @ignored_links.each do |key, values|
           puts_group(key, type: :ignored) # Puts the opening <p> element.

@@ -125,8 +127,8 @@ module BrokenLinkFinder
     end
 
     def build_url(link)
-
-
+      href = @broken_link_map[link]
+      href || link
     end
 
     alias_method :report, :call
data/lib/broken_link_finder/reporter/reporter.rb
CHANGED

@@ -42,8 +42,7 @@ module BrokenLinkFinder
     # Use like: `num_pages, num_links = get_hash_stats(links)`.
     def get_hash_stats(hash)
       num_keys = hash.keys.length
-
-      num_values = sort_by_page? ? values.length : values.uniq.length
+      num_values = hash.values.flatten.uniq.length
 
       sort_by_page? ?
         [num_keys, num_values] :
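The new `num_values` line is why the reports now say "unique": links are deduplicated across pages before counting. A small illustration with made-up data:

```ruby
# Hypothetical hash mapping each page to the broken links found on it.
hash = {
  'http://txti.es/about' => ['http://twitter.com/thebarrytone', 'http://dodgy.link'],
  'http://txti.es/how'   => ['http://dodgy.link']
}

num_keys   = hash.keys.length                # => 2 page(s)
num_values = hash.values.flatten.uniq.length # => 2 unique link(s), not 3
```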
data/lib/broken_link_finder/reporter/text_reporter.rb
CHANGED

@@ -23,10 +23,11 @@ module BrokenLinkFinder
 
     # Report a summary of the overall crawl.
     def report_crawl_summary
+      puts "Crawled #{@crawl_stats[:url]}"
       putsn format(
-        '
-        @crawl_stats[:url],
+        '%s page(s) containing %s unique link(s) in %s seconds',
         @crawl_stats[:num_pages],
+        @crawl_stats[:num_links],
         @crawl_stats[:duration]&.truncate(2)
       )
     end

@@ -37,7 +38,7 @@ module BrokenLinkFinder
         puts 'Good news, there are no broken links!'
       else
         num_pages, num_links = get_hash_stats(@broken_links)
-        puts "Found #{num_links} broken link(s) across #{num_pages} page(s):"
+        puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"
 
         @broken_links.each do |key, values|
           msg = sort_by_page? ?

@@ -61,7 +62,7 @@ module BrokenLinkFinder
     def report_ignored_links(verbose: false)
       if @ignored_links.any?
         num_pages, num_links = get_hash_stats(@ignored_links)
-        nputs "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:"
+        nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"
 
         @ignored_links.each do |key, values|
           msg = sort_by_page? ?
data/lib/broken_link_finder/wgit_extensions.rb
CHANGED

@@ -1,11 +1,31 @@
 # frozen_string_literal: true
 
-#
+# Define a method on each doc for recording unparsable links.
+# Unparsable links are recorded as broken links by Finder.
+class Wgit::Document
+  def unparsable_links
+    @unparsable_links ||= []
+  end
+end
+
+# Returns a Wgit::Url or nil (if link is unparsable).
+# A proc is preferrable to a function to avoid polluting the global namespace.
+parse_link = lambda do |doc, link|
+  Wgit::Url.new(link)
+rescue StandardError
+  doc.unparsable_links << link
+  nil
+end
+
+# We extract all the Document's links e.g. <a>, <img>, <script>, <link> etc.
 Wgit::Document.define_extension(
   :all_links,
-  '//*/@href | //*/@src', # Any element
+  '//*/@href | //*/@src', # Any element's href or src attribute URL.
   singleton: false,
   text_content_only: true
-) do |links|
-  links
+) do |links, doc|
+  links
+    .uniq
+    .map { |link| parse_link.call(doc, link) }
+    .compact
 end
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: broken_link_finder
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.11.0
 platform: ruby
 authors:
 - Michael Telford
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2020-01-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler

@@ -128,14 +128,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.8'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.8'
 description: Finds a website's broken links using the 'wgit' gem and reports back
   to you with a summary.
 email: michael.telford@live.com