broken_link_finder 0.9.4 → 0.12.0

@@ -9,30 +9,41 @@ class BrokenLinkFinderCLI < Thor
  desc 'crawl [URL]', 'Find broken links at the URL'
  option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
  option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+ option :xpath, type: :string, aliases: [:x], default: BrokenLinkFinder::DEFAULT_LINK_XPATH
+ option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
  option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
  option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
  option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
  def crawl(url)
  url = "http://#{url}" unless url.start_with?('http')

+ report_type = options[:html] ? :html : :text
  sort_by = options[:sort_by_link] ? :link : :page
  max_threads = options[:threads]
  broken_verbose = !options[:concise]
  ignored_verbose = options[:verbose]

+ BrokenLinkFinder.link_xpath = options[:xpath]
  finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
  options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
- finder.pretty_print_link_report(
- broken_verbose: broken_verbose,
+ finder.report(
+ type: report_type,
+ broken_verbose: broken_verbose,
  ignored_verbose: ignored_verbose
  )
- rescue Exception => e
+
+ exit 0
+ rescue StandardError => e
  puts "An error has occurred: #{e.message}"
+
+ exit 1
  end

  desc 'version', 'Display the currently installed version'
  def version
  puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+ exit 0
  end
  end
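
To put the new options in context, here is a rough Ruby sketch of the library calls the rewritten crawl task now makes. It only uses API visible in this diff (BrokenLinkFinder.link_xpath=, Finder#crawl_site / #crawl_page and Finder#report); the URL and XPath values are placeholders.

    require 'broken_link_finder'

    # Placeholder values standing in for the parsed Thor options.
    BrokenLinkFinder.link_xpath = '//a/@href'

    finder = BrokenLinkFinder::Finder.new(sort: :link, max_threads: 30)
    finder.crawl_site('http://example.com') # or crawl_page, without --recursive
    finder.report(type: :html, broken_verbose: true, ignored_verbose: false)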
 
@@ -2,8 +2,14 @@

  require 'wgit'
  require 'wgit/core_ext'
+ require 'thread/pool'
+ require 'set'

- require_relative './broken_link_finder/wgit_extensions'
  require_relative './broken_link_finder/version'
- require_relative './broken_link_finder/reporter'
+ require_relative './broken_link_finder/xpath'
+ require_relative './broken_link_finder/wgit_extensions'
+ require_relative './broken_link_finder/link_manager'
+ require_relative './broken_link_finder/reporter/reporter'
+ require_relative './broken_link_finder/reporter/text_reporter'
+ require_relative './broken_link_finder/reporter/html_reporter'
  require_relative './broken_link_finder/finder'
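
The single Reporter has been split into a shared base class plus text and HTML subclasses, all loaded up front so Finder#report can choose between them. As a hedged sketch only (the constructor and #call signatures are taken from the finder.rb hunk below; the empty Hash stands in for the broken link map purely for illustration), a reporter can also be driven directly:

    require 'broken_link_finder'

    finder = BrokenLinkFinder.new(sort: :page)
    finder.crawl_page('http://example.com')

    # Same constructor/call shape that Finder#report uses internally.
    reporter = BrokenLinkFinder::TextReporter.new(
      STDOUT, :page, finder.broken_links, finder.ignored_links,
      {}, finder.crawl_stats
    )
    reporter.call(broken_verbose: true, ignored_verbose: false)
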
@@ -1,228 +1,227 @@
  # frozen_string_literal: true

- require_relative 'reporter'
- require 'thread/pool'
- require 'set'
-
  module BrokenLinkFinder
- DEFAULT_MAX_THREADS = 100
+ DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
+ SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.

  # Alias for BrokenLinkFinder::Finder.new.
  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
  Finder.new(sort: sort, max_threads: max_threads)
  end

+ # Class responsible for finding broken links on a page or site.
  class Finder
- attr_reader :sort, :broken_links, :ignored_links, :total_links_crawled, :max_threads
+ # The collection key - either :page or :link.
+ attr_reader :sort
+
+ # The max number of threads created during #crawl_site - one thread per page.
+ attr_reader :max_threads

- # Creates a new Finder instance.
- def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
+ # Returns a new Finder instance.
+ def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
  raise "Sort by either :page or :link, not #{sort}" \
  unless %i[page link].include?(sort)

  @sort = sort
  @max_threads = max_threads
- @lock = Mutex.new
  @crawler = Wgit::Crawler.new
+ @manager = BrokenLinkFinder::LinkManager.new(@sort)
+ end

- clear_links
+ # Returns the current broken links.
+ def broken_links
+ @manager.broken_links
  end

- # Clear/empty the link collection Hashes.
- def clear_links
- @broken_links = {}
- @ignored_links = {}
- @total_links_crawled = 0
- @all_broken_links = Set.new
- @all_intact_links = Set.new
+ # Returns the current ignored links.
+ def ignored_links
+ @manager.ignored_links
  end

- # Finds broken links within a single page and appends them to the
- # @broken_links array. Returns true if at least one broken link was found.
+ # Returns the current crawl stats.
+ def crawl_stats
+ @manager.crawl_stats
+ end
+
+ # Finds broken links within a single page and records them.
+ # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
  def crawl_url(url)
- clear_links
+ @manager.empty
+
+ start = Time.now
+ url = url.to_url

- url = url.to_url
- doc = @crawler.crawl(url)
+ # We dup the url to avoid recording any redirects.
+ doc = @crawler.crawl(url.dup)

  # Ensure the given page url is valid.
  raise "Invalid or broken URL: #{url}" unless doc

  # Get all page links and determine which are broken.
  find_broken_links(doc)
+ retry_broken_links

- sort_links
- set_total_links_crawled
+ @manager.sort
+ @manager.tally(url: url, pages_crawled: [url], start: start)

- @broken_links.any?
+ broken_links.any?
  end

- # Finds broken links within an entire site and appends them to the
- # @broken_links array. Returns a tuple containing a Boolean of true if
- # at least one broken link was found and an Array of all pages crawled.
+ # Finds broken links within an entire site and records them.
+ # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
- def crawl_site(url)
- clear_links
+ def crawl_site(url, allow_paths: nil, disallow_paths: nil)
+ @manager.empty

- url = url.to_url
- pool = Thread.pool(@max_threads)
- crawled_pages = []
+ start = Time.now
+ url = url.to_url
+ pool = Thread.pool(@max_threads)
+ crawled = Set.new

  # Crawl the site's HTML web pages looking for links.
- externals = @crawler.crawl_site(url) do |doc|
- crawled_pages << doc.url
+ # We dup the url to avoid recording any redirects.
+ paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+ externals = @crawler.crawl_site(url.dup, **paths) do |doc|
+ crawled << doc.url
  next unless doc

  # Start a thread for each page, checking for broken links.
  pool.process { find_broken_links(doc) }
  end

+ # Wait for all threads to finish, even if url was invalid.
+ pool.shutdown
+
  # Ensure the given website url is valid.
  raise "Invalid or broken URL: #{url}" unless externals

- # Wait for all threads to finish.
- pool.shutdown
+ retry_broken_links

- sort_links
- set_total_links_crawled
+ @manager.sort
+ @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)

- [@broken_links.any?, crawled_pages.uniq]
+ broken_links.any?
+ ensure
+ pool.shutdown if defined?(pool)
  end

- # Pretty prints the link report into a stream e.g. STDOUT or a file,
+ # Outputs the link report into a stream e.g. STDOUT or a file,
  # anything that respond_to? :puts. Defaults to STDOUT.
- # Returns true if there were broken links and vice versa.
- def pretty_print_link_report(
- stream = STDOUT,
- broken_verbose: true,
- ignored_verbose: false
- )
- reporter = BrokenLinkFinder::Reporter.new(
- stream, @sort, @broken_links, @ignored_links
- )
- reporter.pretty_print_link_report(
- broken_verbose: broken_verbose,
- ignored_verbose: ignored_verbose
- )
-
- @broken_links.any?
+ def report(stream = STDOUT, type: :text,
+ broken_verbose: true, ignored_verbose: false)
+ klass = case type
+ when :text
+ BrokenLinkFinder::TextReporter
+ when :html
+ BrokenLinkFinder::HTMLReporter
+ else
+ raise "The type: must be :text or :html, not: :#{type}"
+ end
+
+ reporter = klass.new(stream, @sort,
+ broken_links, ignored_links,
+ @manager.broken_link_map, crawl_stats)
+ reporter.call(broken_verbose: broken_verbose,
+ ignored_verbose: ignored_verbose)
  end

  private

  # Finds which links are unsupported or broken and records the details.
  def find_broken_links(page)
+ record_unparsable_links(page) # Record them as broken.
+
  links = get_supported_links(page)

  # Iterate over the supported links checking if they're broken or not.
  links.each do |link|
- # Check if the link has already been processed previously.
- next if @all_intact_links.include?(link)
+ # Skip if the link has been encountered previously.
+ next if @manager.all_intact_links.include?(link)

- if @all_broken_links.include?(link)
- append_broken_link(page.url, link)
+ if @manager.all_broken_links.include?(link)
+ # The link has already been proven broken so simply record it.
+ @manager.append_broken_link(page, link, map: false)
  next
  end

- # The link hasn't been processed before so we crawl it.
+ # The link hasn't been encountered before so we crawl it.
  link_doc = crawl_link(page, link)

- # Determine if the crawled link is broken or not.
- if link_doc.nil? ||
- @crawler.last_response.not_found? ||
- has_broken_anchor(link_doc)
- append_broken_link(page.url, link)
+ # Determine if the crawled link is broken or not and record it.
+ if link_broken?(link_doc)
+ @manager.append_broken_link(page, link)
  else
- @lock.synchronize { @all_intact_links << link }
+ @manager.append_intact_link(link)
  end
  end

  nil
  end

- # Report and reject any non supported links. Any link that is absolute and
- # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
- def get_supported_links(doc)
- doc.all_links
- .reject do |link|
- if link.is_absolute? && !link.start_with?('http')
- append_ignored_link(doc.url, link)
- true
- end
- end
- end
-
- # Makes the link absolute and crawls it, returning its Wgit::Document.
- def crawl_link(doc, link)
- link = link.prefix_base(doc)
- @crawler.crawl(link)
- end
+ # Implements a retry mechanism for each of the broken links found.
+ # Removes any broken links found to be working OK.
+ def retry_broken_links
+ sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.

- # Returns true if the link is/contains a broken anchor/fragment.
- def has_broken_anchor(doc)
- raise 'link document is nil' unless doc
+ @manager.broken_link_map.select! do |link, href|
+ # Don't retry unparsable links (which are Strings).
+ next(true) unless href.is_a?(Wgit::Url)

- fragment = doc.url.fragment
- return false if fragment.nil? || fragment.empty?
+ doc = @crawler.crawl(href.dup)

- doc.xpath("//*[@id='#{fragment}']").empty?
+ if link_broken?(doc)
+ true
+ else
+ @manager.remove_broken_link(link)
+ false
+ end
+ end
  end

- # Append key => [value] to @broken_links.
- def append_broken_link(url, link)
- key, value = get_key_value(url, link)
-
- @lock.synchronize do
- @broken_links[key] = [] unless @broken_links[key]
- @broken_links[key] << value
-
- @all_broken_links << link
+ # Record each unparsable link as a broken link.
+ def record_unparsable_links(doc)
+ doc.unparsable_links.each do |link|
+ # We map the link ourselves because link is a String, not a Wgit::Url.
+ @manager.append_broken_link(doc, link, map: false)
+ @manager.broken_link_map[link] = link
  end
  end

- # Append key => [value] to @ignored_links.
- def append_ignored_link(url, link)
- key, value = get_key_value(url, link)
-
- @lock.synchronize do
- @ignored_links[key] = [] unless @ignored_links[key]
- @ignored_links[key] << value
+ # Report and reject any non supported links. Any link that is absolute and
+ # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
+ def get_supported_links(doc)
+ doc.all_links.reject do |link|
+ if link.is_absolute? && !link.start_with?('http')
+ @manager.append_ignored_link(doc.url, link)
+ true
+ end
  end
  end

- # Returns the correct key value depending on the @sort type.
- # @sort == :page ? [url, link] : [link, url]
- def get_key_value(url, link)
- case @sort
- when :page
- [url, link]
- when :link
- [link, url]
- else
- raise "Unsupported sort type: #{sort}"
- end
+ # Make the link absolute and crawl it, returning its Wgit::Document.
+ def crawl_link(doc, link)
+ link = link.make_absolute(doc)
+ @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
  end

- # Sort keys and values alphabetically.
- def sort_links
- @broken_links.values.map(&:uniq!)
- @ignored_links.values.map(&:uniq!)
+ # Return if the crawled link is broken or not.
+ def link_broken?(doc)
+ doc.nil? || @crawler.last_response.not_found? || has_broken_anchor(doc)
+ end

- @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
- @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+ # Returns true if the link is/contains a broken anchor/fragment.
+ # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
+ def has_broken_anchor(doc)
+ raise 'The link document is nil' unless doc

- @broken_links.each { |_k, v| v.sort! }
- @ignored_links.each { |_k, v| v.sort! }
- end
+ fragment = doc.url.fragment
+ return false if fragment.nil? || fragment.empty?

- # Sets and returns the total number of links crawled.
- def set_total_links_crawled
- @total_links_crawled = @all_broken_links.size + @all_intact_links.size
+ doc.xpath("//*[@id='#{fragment}']").empty?
  end

- alias crawl_page crawl_url
- alias crawl_r crawl_site
- alias pretty_print_link_summary pretty_print_link_report
+ alias crawl_page crawl_url
+ alias crawl_r crawl_site
  end
  end
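
For context on the reworked Finder above, a short hedged sketch of a recursive crawl using the new keyword arguments and crawl stats. The URL and path filters are placeholder values, and the stat keys come from LinkManager#tally in the next hunk.

    require 'broken_link_finder'

    finder = BrokenLinkFinder::Finder.new(sort: :page, max_threads: 50)

    # crawl_site now returns just a Boolean and accepts path filters.
    found = finder.crawl_site('http://example.com',
                              allow_paths: 'blog/', disallow_paths: 'admin/')

    stats = finder.crawl_stats
    puts "Crawled #{stats[:num_pages]} pages in #{stats[:duration]} seconds"
    finder.report(type: :text) if found
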
@@ -0,0 +1,137 @@
+ # frozen_string_literal: true
+
+ module BrokenLinkFinder
+ # Class responsible for handling the link collection logic.
+ class LinkManager
+ # Used for mapping pages to broken links.
+ attr_reader :broken_links
+
+ # Used for mapping pages to ignored links.
+ attr_reader :ignored_links
+
+ # Used to record crawl statistics e.g. duration etc.
+ attr_reader :crawl_stats
+
+ # Used to map a link (as is) to its absolute (crawlable) form.
+ attr_reader :broken_link_map
+
+ # Used to prevent crawling a broken link twice.
+ attr_reader :all_broken_links
+
+ # Used to prevent crawling an intact link twice.
+ attr_reader :all_intact_links
+
+ # Used for building crawl statistics.
+ attr_reader :all_ignored_links
+
+ # Returns a new LinkManager instance with empty link collections.
+ def initialize(sort)
+ raise "Sort by either :page or :link, not #{sort}" \
+ unless %i[page link].include?(sort)
+
+ @sort = sort
+ @lock = Mutex.new
+
+ empty # Initialises the link collections.
+ end
+
+ # Initialise/empty the link collection objects.
+ def empty
+ @broken_links = {}
+ @ignored_links = {}
+ @crawl_stats = {}
+ @broken_link_map = {}
+ @all_broken_links = Set.new
+ @all_intact_links = Set.new
+ @all_ignored_links = Set.new
+ end
+
+ # Append key => [value] to the broken link collections.
+ # If map: true, then the link will also be recorded in @broken_link_map.
+ def append_broken_link(doc, link, map: true)
+ key, value = get_key_value(doc.url, link)
+
+ @lock.synchronize do
+ @broken_links[key] = [] unless @broken_links[key]
+ @broken_links[key] << value
+
+ @all_broken_links << link
+
+ @broken_link_map[link] = link.make_absolute(doc) if map
+ end
+ end
+
+ # Remove the broken link from the necessary collections.
+ def remove_broken_link(link)
+ @lock.synchronize do
+ if @sort == :page
+ @broken_links.each { |_k, links| links.delete(link) }
+ @broken_links.delete_if { |_k, links| links.empty? }
+ else
+ @broken_links.delete(link)
+ end
+
+ @all_broken_links.delete(link)
+ @all_intact_links << link
+ end
+ end
+
+ # Append key => [value] to the ignored link collections.
+ def append_ignored_link(url, link)
+ key, value = get_key_value(url, link)
+
+ @lock.synchronize do
+ @ignored_links[key] = [] unless @ignored_links[key]
+ @ignored_links[key] << value
+
+ @all_ignored_links << link
+ end
+ end
+
+ # Append link to @all_intact_links.
+ def append_intact_link(link)
+ @lock.synchronize { @all_intact_links << link }
+ end
+
+ # Sorts the link collection's keys and values alphabetically.
+ def sort
+ @broken_links.values.map(&:uniq!)
+ @ignored_links.values.map(&:uniq!)
+
+ @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
+ @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+
+ @broken_links.each { |_k, v| v.sort! }
+ @ignored_links.each { |_k, v| v.sort! }
+ end
+
+ # Tally's up various statistics about the crawl and its links.
+ def tally(url:, pages_crawled:, start:)
+ @crawl_stats[:url] = url
+ @crawl_stats[:pages_crawled] = pages_crawled
+ @crawl_stats[:num_pages] = pages_crawled.size
+ @crawl_stats[:num_links] = (
+ @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
+ )
+ @crawl_stats[:num_broken_links] = @all_broken_links.size
+ @crawl_stats[:num_intact_links] = @all_intact_links.size
+ @crawl_stats[:num_ignored_links] = @all_ignored_links.size
+ @crawl_stats[:duration] = Time.now - start
+ end
+
+ private
+
+ # Returns the correct key value depending on the @sort type.
+ # @sort == :page ? [url, link] : [link, url]
+ def get_key_value(url, link)
+ case @sort
+ when :page
+ [url, link]
+ when :link
+ [link, url]
+ else
+ raise "Unsupported sort type: #{sort}"
+ end
+ end
+ end
+ end
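
Finally, a minimal hedged sketch of LinkManager used in isolation (in practice only Finder drives it); the URL and link values are placeholders.

    require 'broken_link_finder'

    manager = BrokenLinkFinder::LinkManager.new(:page)

    # Record an ignored (unsupported) link, then build the crawl statistics.
    manager.append_ignored_link('http://example.com', 'mailto:me@example.com')
    manager.sort
    manager.tally(url: 'http://example.com',
                  pages_crawled: ['http://example.com'], start: Time.now)

    manager.ignored_links                   # => {"http://example.com"=>["mailto:me@example.com"]}
    manager.crawl_stats[:num_ignored_links] # => 1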