broken_link_finder 0.5.0 → 0.6.0
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/README.md +25 -8
- data/broken_link_finder.gemspec +1 -1
- data/exe/broken_link_finder +1 -1
- data/lib/broken_link_finder.rb +1 -0
- data/lib/broken_link_finder/finder.rb +63 -16
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +21 -0
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 89e6476124fe4e40b2efe0646ad6d2708f233a464ae8a075833f03c27669a719
+  data.tar.gz: a85394f3013a1c073afcdd94451bd5044331dce6c9f6988d0c4ccb1c9682783c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6a7e10444cedf91d3dcf77a6e943852b02d902991616e534ade4b6e194d4508ba342fdb6c1d76b83ab586499ba0fc82aa795875c96f2233fee097089112e2ea0
+  data.tar.gz: 75ab90b82b724eed3e837e48dec0d9584723ecc0cd606c9c8d42c87ba5350daac3a35099f7e71967a4e699a9b0817127da56c62646f6d36f49297b77731d8657
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.5.0)
+    broken_link_finder (0.6.0)
       thor (= 0.20.3)
       thread (= 0.2)
-      wgit (= 0.0.
+      wgit (= 0.0.13)
 
 GEM
   remote: https://rubygems.org/
@@ -36,7 +36,7 @@ GEM
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff
-    wgit (0.0.
+    wgit (0.0.13)
       mongo (~> 2.8.0)
       nokogiri (~> 1.10.3)
 
data/README.md
CHANGED
@@ -6,15 +6,22 @@ Simply point it at a website and it will crawl all of its webpages searching for
 
 ## How It Works
 
-Any page element with a `href` or `src` attribute is considered a link. For each link on a given page, any of the following conditions
+Any HTML page element with a `href` or `src` attribute is considered a link. For each link on a given page, any of the following conditions constitutes that the link is broken:
 
-
-
-
+- A response status code of `404 Not Found` is returned.
+- An empty HTML response body is returned.
+- The HTML response body doesn't contain an element ID matching that of the link's anchor e.g. `http://server.com#about` must contain an element with `id="about"` or the link is considered broken.
+- The link redirects more than 5 times consecutively.
+
+**Note**: Not all link types are supported.
+
+In a nutshell, only HTTP(S) based links can be successfully verified by `broken_link_finder`. As a result some links on a page might be (recorded and) ignored. You should verify these links yourself manually. Examples of unsupported link types include `tel:*`, `mailto:*`, `ftp://*` etc.
+
+See the [usage](#Usage) section below on how to check which links have been ignored during a crawl.
 
 ## Made Possible By
 
-
+`broken_link_finder` relies heavily on the `wgit` Ruby gem. See its [repository](https://github.com/michaeltelford/wgit) for more details.
 
 ## Installation
 
@@ -53,9 +60,10 @@ Below is a simple script which crawls a website and outputs it's broken links to
 ```ruby
 require 'broken_link_finder'
 
-finder = BrokenLinkFinder
-finder.crawl_site "http://txti.es"
-finder.
+finder = BrokenLinkFinder.new
+finder.crawl_site "http://txti.es" # Or use Finder#crawl_page for a single webpage.
+finder.pretty_print_link_summary   # Or use Finder#broken_links and Finder#ignored_links
+                                   # for direct access to the link Hashes.
 ```
 
 Then execute the script with:
@@ -78,6 +86,15 @@ https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FB
 The following broken links exist in http://txti.es/how:
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
+
+Below is a breakdown of the non supported (ignored) links found, you should check these manually:
+
+The following links were ignored on http://txti.es:
+tel:+13174562564
+mailto:big.jim@jmail.com
+
+The following links were ignored on http://txti.es/contact:
+ftp://server.com
 ```
 
 ## TODO
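The new `ignored_links` collection mirrors `broken_links`: each is a Hash keyed by page URL with an Array of link strings as the value (see `append_ignored_link` in the finder.rb diff below). A minimal sketch of reading the Hashes directly instead of pretty printing; the site URL is the README's example and the sample output is illustrative:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site "http://txti.es"

# Each Hash maps a crawled page URL => Array of offending links, e.g.
# { "http://txti.es/contact" => ["ftp://server.com"] }
finder.broken_links.each do |page, links|
  puts "#{page} has #{links.size} broken link(s)"
end
finder.ignored_links.each do |page, links|
  puts "#{page} has #{links.size} ignored link(s)"
end
```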
data/broken_link_finder.gemspec
CHANGED
@@ -43,7 +43,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "byebug", "~> 11.0"
   spec.add_development_dependency "webmock", "~> 3.5"
 
-  spec.add_runtime_dependency "wgit", "0.0.
+  spec.add_runtime_dependency "wgit", "0.0.13"
   spec.add_runtime_dependency "thread", "0.2"
   spec.add_runtime_dependency "thor", "0.20.3"
 end
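The gemspec now pins the wgit runtime dependency to 0.0.13, so consumers only need to depend on the top-level gem. A minimal Gemfile entry; the pessimistic version constraint is illustrative rather than taken from the gem's docs:

```ruby
# Gemfile
gem 'broken_link_finder', '~> 0.6.0'
```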
data/exe/broken_link_finder
CHANGED
@@ -11,7 +11,7 @@ class BrokenLinkFinderCLI < Thor
     url = "http://#{url}" unless url.start_with?('http')
     finder = BrokenLinkFinder::Finder.new
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
-    finder.
+    finder.pretty_print_link_summary
   end
 end
 
data/lib/broken_link_finder/finder.rb
CHANGED
@@ -2,22 +2,29 @@ require 'wgit'
 require 'thread/pool'
 
 module BrokenLinkFinder
+  # Alias for BrokenLinkFinder::Finder.new, don't use this if you want to
+  # override the max_threads variable.
+  def self.new
+    Finder.new
+  end
+
   class Finder
     DEFAULT_MAX_THREADS = 30.freeze
 
-    attr_reader :broken_links
+    attr_reader :broken_links, :ignored_links
 
     # Create a new Finder instance.
     def initialize(max_threads: DEFAULT_MAX_THREADS)
       @max_threads = max_threads
       @lock = Mutex.new
       @crawler = Wgit::Crawler.new
-
+      clear_links
     end
 
-    # Clear/empty the
-    def
+    # Clear/empty the link collection Hashes.
+    def clear_links
       @broken_links = {}
+      @ignored_links = {}
     end
 
     # Finds broken links within an entire site and appends them to the
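The new module-level `BrokenLinkFinder.new` alias above returns a `Finder` built with `DEFAULT_MAX_THREADS` (30). As the added comment notes, instantiate the class directly to override the thread pool size. A minimal sketch (the thread count of 5 is illustrative):

```ruby
require 'broken_link_finder'

# Uses DEFAULT_MAX_THREADS (30) via the new module-level alias.
finder = BrokenLinkFinder.new

# Instantiate the class directly to override the thread pool size.
slow_finder = BrokenLinkFinder::Finder.new(max_threads: 5)
```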
@@ -25,11 +32,12 @@ module BrokenLinkFinder
     # at least one broken link was found and an Array of all pages crawled.
     # Access the broken links with Finder#broken_links.
     def crawl_site(url)
-
+      clear_links
       url = Wgit::Url.new(url)
       pool = Thread.pool(@max_threads)
       crawled_pages = []
 
+      # Crawl the site's HTML web pages looking for links.
       @crawler.crawl_site(url) do |doc|
         # Ensure the given website url is valid.
         raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
@@ -45,14 +53,14 @@ module BrokenLinkFinder
       end
 
       pool.shutdown
-      [
+      [@broken_links.any?, crawled_pages]
     end
 
     # Finds broken links within a single page and appends them to the
     # @broken_links array. Returns true if at least one broken link was found.
     # Access the broken links with Finder#broken_links.
     def crawl_url(url)
-
+      clear_links
       url = Wgit::Url.new(url)
 
       # Ensure the given page url is valid.
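As the `crawl_site` hunk above shows, the method now returns a two-element Array: whether any broken links were found, and the pages that were crawled. A minimal sketch of consuming that return value; the URL is the README's example site:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
has_broken_links, crawled_pages = finder.crawl_site("http://txti.es")

puts "Broken links found!" if has_broken_links
puts "Crawled #{crawled_pages.size} page(s)"
```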
@@ -62,41 +70,70 @@ module BrokenLinkFinder
       # Get all page links and determine which are broken.
       find_broken_links(doc)
 
-
+      @broken_links.any?
     end
 
-    # Pretty prints the
+    # Pretty prints the link summary into a stream e.g. Kernel
     # (STDOUT) or a file - anything that respond_to? :puts.
     # Returns true if there were broken links and vice versa.
-    def
+    def pretty_print_link_summary(stream = Kernel)
       raise "stream must respond_to? :puts" unless stream.respond_to? :puts
 
+      # Broken link summary.
       if @broken_links.empty?
         stream.puts("Good news, there are no broken links!")
-
+        stream.puts("")
       else
         stream.puts("Below is a breakdown of the different pages and their \
broken links...")
         stream.puts("")
 
         @broken_links.each do |page, links|
-          stream.puts("The following broken links exist
+          stream.puts("The following broken links exist on #{page}:")
+          links.each do |link|
+            stream.puts(link)
+          end
+          stream.puts("")
+        end
+      end
+
+      # Ignored link summary.
+      if @ignored_links.any?
+        stream.puts("Below is a breakdown of the non supported links found, \
+you should check these manually:")
+        stream.puts("")
+
+        @ignored_links.each do |page, links|
+          stream.puts("The following links were ignored on #{page}:")
           links.each do |link|
             stream.puts(link)
           end
           stream.puts("")
         end
-        true
       end
+
+      @broken_links.any?
     end
 
     private
 
-    # Finds which links are broken and
+    # Finds which links are unsupported or broken and records the details.
     def find_broken_links(doc)
-      links
+      # Process the Document's links before checking if they're broke.
+      links = doc.all_links.
+        reject do |link|
+          if !link.is_relative? and !link.start_with?('http')
+            append_ignored_link(doc.url, link)
+            true
+          end
+        end.
+        uniq
+
+      # Iterate over the supported links checking if they're broken or not.
       links.each do |link|
-
+        link_url = link.is_relative? ? doc.url.to_base.concat(link) : link
+        link_doc = @crawler.crawl_url(link_url)
+
         if @crawler.last_response.is_a?(Net::HTTPNotFound) or
             link_doc.nil? or
             has_broken_anchor(link_doc)
@@ -124,6 +161,16 @@ broken links...")
       end
     end
 
+    # Append url => [link] to @ignored_links.
+    def append_ignored_link(url, link)
+      @lock.synchronize do
+        unless @ignored_links[url]
+          @ignored_links[url] = []
+        end
+        @ignored_links[url] << link
+      end
+    end
+
     alias_method :crawl_page, :crawl_url
   end
 end
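`pretty_print_link_summary` (added in the hunks above) writes to any object that `respond_to? :puts`, defaulting to `Kernel` (STDOUT). A minimal sketch that writes the summary to a file instead; the filename is illustrative:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_page("http://txti.es") # Alias for Finder#crawl_url.

# Any object responding to :puts works as the stream, e.g. an open File.
File.open("report.txt", "w") do |file|
  finder.pretty_print_link_summary(file)
end
```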
data/lib/broken_link_finder/wgit_extensions.rb
ADDED
@@ -0,0 +1,21 @@
+require 'wgit'
+
+# We pull out all of a Document's links, not just the links to other webpages.
+Wgit::Document.define_extension(
+  :all_links,
+  '//*/@href | //*/@src',
+  singleton: false,
+  text_content_only: true,
+) do |links|
+  if links
+    links = links.
+      map do |link|
+        Wgit::Url.new(link)
+      rescue
+        nil
+      end.
+      compact.
+      uniq
+  end
+  links
+end
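The extension above gives every `Wgit::Document` an `all_links` method returning unique `Wgit::Url` objects for each `href`/`src` attribute, which `Finder#find_broken_links` consumes. A minimal sketch of calling it directly; the `Wgit::Document.new(url, html)` constructor used here is an assumption based on the wgit gem's documented API, and the HTML snippet is made up:

```ruby
require 'broken_link_finder' # Assumed to load the wgit extension above.

# Assumed constructor: Wgit::Document.new(url, html) from the wgit gem.
html = '<html><a href="/about">About</a><img src="http://cdn.example.com/pic.png"></html>'
doc  = Wgit::Document.new(Wgit::Url.new("http://example.com"), html)

# Prints each unique href/src value as a Wgit::Url.
doc.all_links.each { |link| puts link }
```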
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: broken_link_finder
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.6.0
 platform: ruby
 authors:
 - Michael Telford
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-07-
+date: 2019-07-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -100,14 +100,14 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.
+        version: 0.0.13
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
      - !ruby/object:Gem::Version
-        version: 0.0.
+        version: 0.0.13
 - !ruby/object:Gem::Dependency
   name: thread
   requirement: !ruby/object:Gem::Requirement
@@ -159,6 +159,7 @@ files:
 - lib/broken_link_finder.rb
 - lib/broken_link_finder/finder.rb
 - lib/broken_link_finder/version.rb
+- lib/broken_link_finder/wgit_extensions.rb
 - load.rb
 homepage: https://github.com/michaeltelford/broken-link-finder
 licenses: