RubyGems - broken_link_finder - Versions diffs - 0.5.0 → 0.6.0 - Mend

broken_link_finder 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/Gemfile.lock +3 -3
data/README.md +25 -8
data/broken_link_finder.gemspec +1 -1
data/exe/broken_link_finder +1 -1
data/lib/broken_link_finder.rb +1 -0
data/lib/broken_link_finder/finder.rb +63 -16
data/lib/broken_link_finder/version.rb +1 -1
data/lib/broken_link_finder/wgit_extensions.rb +21 -0
metadata +5 -4

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 364ec155bda54b8757fbce6425c8978d8d17cd91618d5650dc14f8a63e712a2f
-  data.tar.gz: b5c0e405f159aaed54725042105c519e1e9d0e085bf40aad99b114200ec8713a
+  metadata.gz: 89e6476124fe4e40b2efe0646ad6d2708f233a464ae8a075833f03c27669a719
+  data.tar.gz: a85394f3013a1c073afcdd94451bd5044331dce6c9f6988d0c4ccb1c9682783c
 SHA512:
-  metadata.gz: eaf1aa2ea2b4f9177561291a8b729e91610f9b2fbd8541ceb2fa96f039a667ffd546dd758b9e4edb4fba9cad25c55368cb4af95ffbe9d0fdc20c546e3d4e5f0b
-  data.tar.gz: 8c6d407f74d900553782d7aedcdc4027c14e436bdb73b2888830827da9e305456e653ad9398c8752e01c8aae381871eb239aee5d7394ea995c5b68b0cbd38404
+  metadata.gz: 6a7e10444cedf91d3dcf77a6e943852b02d902991616e534ade4b6e194d4508ba342fdb6c1d76b83ab586499ba0fc82aa795875c96f2233fee097089112e2ea0
+  data.tar.gz: 75ab90b82b724eed3e837e48dec0d9584723ecc0cd606c9c8d42c87ba5350daac3a35099f7e71967a4e699a9b0817127da56c62646f6d36f49297b77731d8657

data/Gemfile.lock CHANGED

@@ -1,10 +1,10 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.5.0)
+    broken_link_finder (0.6.0)
       thor (= 0.20.3)
       thread (= 0.2)
-      wgit (= 0.0.12)
+      wgit (= 0.0.13)
 GEM
   remote: https://rubygems.org/
@@ -36,7 +36,7 @@ GEM
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff
-    wgit (0.0.12)
+    wgit (0.0.13)
       mongo (~> 2.8.0)
       nokogiri (~> 1.10.3)

data/README.md CHANGED

@@ -6,15 +6,22 @@ Simply point it at a website and it will crawl all of its webpages searching for
 ## How It Works
-Any page element with a `href` or `src` attribute is considered a link. For each link on a given page, any of the following conditions (in order) constitutes that the link is broken:
+Any HTML page element with a `href` or `src` attribute is considered a link. For each link on a given page, the link is considered broken if any of the following conditions is met:
-1) A response status code of `404 Not Found` is returned.
-2) An empty HTML response body is returned.
-3) The HTML response body doesn't contain an element ID matching that of the link's anchor e.g. `http://server.com#about` must contain an element with an ID of `about` or the link is considered broken.
+- A response status code of `404 Not Found` is returned.
+- An empty HTML response body is returned.
+- The HTML response body doesn't contain an element ID matching that of the link's anchor e.g. `http://server.com#about` must contain an element with `id="about"` or the link is considered broken.
+- The link redirects more than 5 times consecutively.
+**Note**: Not all link types are supported.
+In a nutshell, only HTTP(S)-based links can be successfully verified by `broken_link_finder`. As a result, some links on a page might be (recorded and) ignored. You should verify these links manually. Examples of unsupported link types include `tel:*`, `mailto:*`, `ftp://*`, etc.
+See the [usage](#Usage) section below on how to check which links have been ignored during a crawl.
 ## Made Possible By
-This repository utilises the awesome `wgit` Ruby gem. See its [repository](https://github.com/michaeltelford/wgit) for more details.
+`broken_link_finder` relies heavily on the `wgit` Ruby gem. See its [repository](https://github.com/michaeltelford/wgit) for more details.
 ## Installation
@@ -53,9 +60,10 @@ Below is a simple script which crawls a website and outputs it's broken links to
 ```ruby
 require 'broken_link_finder'
-finder = BrokenLinkFinder::Finder.new
-finder.crawl_site "http://txti.es" # Also, see Finder#crawl_page for a single webpage.
-finder.pretty_print_broken_links # Also, see Finder#broken_links for a Hash of links.
+finder = BrokenLinkFinder.new
+finder.crawl_site "http://txti.es"    # Or use Finder#crawl_page for a single webpage.
+finder.pretty_print_link_summary      # Or use Finder#broken_links and Finder#ignored_links
+                                      # for direct access to the link Hashes.
 ```
 Then execute the script with:
@@ -78,6 +86,15 @@ https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FB
 The following broken links exist in http://txti.es/how:
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
+Below is a breakdown of the non supported (ignored) links found, you should check these manually:
+The following links were ignored on http://txti.es:
+tel:+13174562564
+mailto:big.jim@jmail.com
+The following links were ignored on http://txti.es/contact:
+ftp://server.com
 ```
 ## TODO

data/broken_link_finder.gemspec CHANGED

@@ -43,7 +43,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "byebug", "~> 11.0"
   spec.add_development_dependency "webmock", "~> 3.5"
-  spec.add_runtime_dependency "wgit", "0.0.12"
+  spec.add_runtime_dependency "wgit", "0.0.13"
   spec.add_runtime_dependency "thread", "0.2"
   spec.add_runtime_dependency "thor", "0.20.3"
 end

data/exe/broken_link_finder CHANGED

@@ -11,7 +11,7 @@ class BrokenLinkFinderCLI < Thor
     url = "http://#{url}" unless url.start_with?('http')
     finder = BrokenLinkFinder::Finder.new
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
-    finder.pretty_print_broken_links
+    finder.pretty_print_link_summary
   end
 end

data/lib/broken_link_finder.rb CHANGED

@@ -1,2 +1,3 @@
+require_relative "./broken_link_finder/wgit_extensions"
 require_relative "./broken_link_finder/version"
 require_relative "./broken_link_finder/finder"

data/lib/broken_link_finder/finder.rb CHANGED

@@ -2,22 +2,29 @@ require 'wgit'
 require 'thread/pool'
 module BrokenLinkFinder
+  # Shortcut for BrokenLinkFinder::Finder.new. Call Finder.new directly if you
+  # want to override the max_threads argument.
+  def self.new
+    Finder.new
+  end
   class Finder
     DEFAULT_MAX_THREADS = 30.freeze
-    attr_reader :broken_links
+    attr_reader :broken_links, :ignored_links
     # Create a new Finder instance.
     def initialize(max_threads: DEFAULT_MAX_THREADS)
       @max_threads = max_threads
       @lock = Mutex.new
       @crawler = Wgit::Crawler.new
-      @broken_links = {}
+      clear_links
     end
-    # Clear/empty the @broken_links Hash.
-    def clear_broken_links
+    # Clear/empty the link collection Hashes.
+    def clear_links
       @broken_links = {}
+      @ignored_links = {}
     end
     # Finds broken links within an entire site and appends them to the
@@ -25,11 +32,12 @@ module BrokenLinkFinder
     # at least one broken link was found and an Array of all pages crawled.
     # Access the broken links with Finder#broken_links.
     def crawl_site(url)
-      clear_broken_links
+      clear_links
       url = Wgit::Url.new(url)
       pool = Thread.pool(@max_threads)
       crawled_pages = []
+      # Crawl the site's HTML web pages looking for links.
       @crawler.crawl_site(url) do |doc|
         # Ensure the given website url is valid.
         raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
@@ -45,14 +53,14 @@ module BrokenLinkFinder
       end
       pool.shutdown
-      [!@broken_links.empty?, crawled_pages]
+      [@broken_links.any?, crawled_pages]
     end
     # Finds broken links within a single page and appends them to the
     # @broken_links Hash. Returns true if at least one broken link was found.
     # Access the broken links with Finder#broken_links.
     def crawl_url(url)
-      clear_broken_links
+      clear_links
       url = Wgit::Url.new(url)
       # Ensure the given page url is valid.
@@ -62,41 +70,70 @@ module BrokenLinkFinder
       # Get all page links and determine which are broken.
       find_broken_links(doc)
-      !@broken_links.empty?
+      @broken_links.any?
     end
-    # Pretty prints the contents of broken_links into a stream e.g. Kernel
+    # Pretty prints the link summary into a stream e.g. Kernel
     # (STDOUT) or a file - anything that responds to :puts.
     # Returns true if there were broken links, otherwise false.
-    def pretty_print_broken_links(stream = Kernel)
+    def pretty_print_link_summary(stream = Kernel)
       raise "stream must respond_to? :puts" unless stream.respond_to? :puts
+      # Broken link summary.
       if @broken_links.empty?
         stream.puts("Good news, there are no broken links!")
-        false
+        stream.puts("")
       else
         stream.puts("Below is a breakdown of the different pages and their \
 broken links...")
         stream.puts("")
         @broken_links.each do |page, links|
-          stream.puts("The following broken links exist in #{page}:")
+          stream.puts("The following broken links exist on #{page}:")
+          links.each do |link|
+            stream.puts(link)
+          end
+          stream.puts("")
+        end
+      end
+      # Ignored link summary.
+      if @ignored_links.any?
+        stream.puts("Below is a breakdown of the non supported links found, \
+you should check these manually:")
+        stream.puts("")
+        @ignored_links.each do |page, links|
+          stream.puts("The following links were ignored on #{page}:")
           links.each do |link|
             stream.puts(link)
           end
           stream.puts("")
         end
-        true
       end
+      @broken_links.any?
     end
     private
-    # Finds which links are broken and appends the details to @broken_links.
+    # Finds which links are unsupported or broken and records the details.
     def find_broken_links(doc)
-      links = doc.internal_full_links + doc.external_links
+      # Process the Document's links before checking if they're broken.
+      links = doc.all_links.
+        reject do |link|
+          if !link.is_relative? and !link.start_with?('http')
+            append_ignored_link(doc.url, link)
+            true
+          end
+        end.
+        uniq
+      # Iterate over the supported links checking if they're broken or not.
       links.each do |link|
-        link_doc = @crawler.crawl_url(link)
+        link_url = link.is_relative? ? doc.url.to_base.concat(link) : link
+        link_doc = @crawler.crawl_url(link_url)
         if @crawler.last_response.is_a?(Net::HTTPNotFound) or
             link_doc.nil? or
             has_broken_anchor(link_doc)
@@ -124,6 +161,16 @@ broken links...")
       end
     end
+    # Append url => [link] to @ignored_links.
+    def append_ignored_link(url, link)
+      @lock.synchronize do
+        unless @ignored_links[url]
+          @ignored_links[url] = []
+        end
+        @ignored_links[url] << link
+      end
+    end
     alias_method :crawl_page, :crawl_url
   end
 end

data/lib/broken_link_finder/version.rb CHANGED

@@ -1,3 +1,3 @@
 module BrokenLinkFinder
-  VERSION = "0.5.0"
+  VERSION = "0.6.0"
 end

data/lib/broken_link_finder/wgit_extensions.rb ADDED

@@ -0,0 +1,21 @@
+require 'wgit'
+# We pull out all of a Document's links, not just the links to other webpages.
+Wgit::Document.define_extension(
+  :all_links,
+  '//*/@href | //*/@src',
+  singleton: false,
+  text_content_only: true,
+) do |links|
+  if links
+    links = links.
+      map do |link|
+        Wgit::Url.new(link)
+      rescue
+        nil
+      end.
+      compact.
+      uniq
+  end
+  links
+end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: broken_link_finder
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.6.0
 platform: ruby
 authors:
 - Michael Telford
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-07-21 00:00:00.000000000 Z
+date: 2019-07-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -100,14 +100,14 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.12
+        version: 0.0.13
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.12
+        version: 0.0.13
 - !ruby/object:Gem::Dependency
   name: thread
   requirement: !ruby/object:Gem::Requirement
@@ -159,6 +159,7 @@ files:
 - lib/broken_link_finder.rb
 - lib/broken_link_finder/finder.rb
 - lib/broken_link_finder/version.rb
+- lib/broken_link_finder/wgit_extensions.rb
 - load.rb
 homepage: https://github.com/michaeltelford/broken-link-finder
 licenses: