RubyGems - broken_link_finder - Versions diffs - 0.9.4 → 0.12.0 - Mend

broken_link_finder 0.9.4 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +4 -4
data/.ruby-version +1 -1
data/CHANGELOG.md +52 -0
data/Gemfile.lock +51 -38
data/README.md +65 -29
data/benchmark.rb +9 -5
data/bin/console +11 -19
data/bin/setup +1 -1
data/broken_link_finder.gemspec +8 -5
data/exe/broken_link_finder +14 -3
data/lib/broken_link_finder.rb +8 -2
data/lib/broken_link_finder/finder.rb +131 -132
data/lib/broken_link_finder/link_manager.rb +137 -0
data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
data/lib/broken_link_finder/reporter/reporter.rb +76 -0
data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
data/lib/broken_link_finder/version.rb +1 -1
data/lib/broken_link_finder/wgit_extensions.rb +25 -5
data/lib/broken_link_finder/xpath.rb +14 -0
metadata +21 -15
data/lib/broken_link_finder/reporter.rb +0 -116

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: cb0cc981acce272911be9d8a3ed36dd49e0f621eee3e9fd71893020da1600945
-  data.tar.gz: 3b368404cf3b2da83445212c44e43f32ad7d1fc5119c8980aeaa04540ebce2c9
+  metadata.gz: 24ca9c7a6071b07f5ab3132c9c79c4628570c9c3e157b77a27a05cdc0578ac6e
+  data.tar.gz: 6668eb430c8296e1439f56c242e7e08a27733605d724ec1c5cfa638dcfaa8b52
 SHA512:
-  metadata.gz: 92ffd946b60411dba032ac30b8a96820dea262520ab92e1f2d64c48477d4c4ca6e22fe41d221fb421423565f9d61883b48017c1c5af651c1bb71ba96eacf490c
-  data.tar.gz: 17455ab4cf7cb3ab0df9763b98cc844b1c1c07ed702c600cc623e263119a8f071b8a9f55519a021662db64e6bffda91d1c5d439ed8594ae87d830da19acf3529
+  metadata.gz: 1d1cdc47ade4651b8bc2df01212364ba938ee73269bf53e7278519ecd374247291c932abfa73a031973403ed55d360bc9d14b5c60ba312aca4b32837b5064294
+  data.tar.gz: f56308da4b9d7a4a39afd43808f77d2b6f2fbbf00f17502d2d889de504bcc82ee1858fb673333b11693a65ad73a4f5fb65a97b15955443e8268b1e0ab08b4e51

data/.ruby-version CHANGED Viewed

	@@ -1 +1 @@
1	- 2.5.3
1	+ 2.7.0

data/CHANGELOG.md CHANGED Viewed

@@ -9,6 +9,58 @@
 - ...
 ---
+## v0.12.0
+### Added
+- `BrokenLinkFinder::link_xpath` and `link_xpath=` methods so you can customise how links are extracted from each crawled page using the API.
+- An `--xpath` (or just `-x`) command line flag so you can customise how links are extracted when using the command line.
+### Changed/Removed
+- Changed the default way in which links are extracted from a page. Previously any element with a `href` or `src` attribute was extracted and checked; now only those links inside the `<body>` are extracted and checked, ignoring the `<head>` section entirely. You can change this behaviour back with: `BrokenLinkFinder::link_xpath = '//*/@href | //*/@src'` before you perform a crawl. Alternatively, if using the command line, use the `--xpath '//*/@href | //*/@src'` option (quoted so that the shell does not treat `|` as a pipe or expand `*`).
+### Fixed
+- [Scheme relative bug](https://github.com/michaeltelford/broken_link_finder/issues/16) by upgrading to `wgit v0.10.0`.
+---
+## v0.11.1
+### Added
+- ...
+### Changed/Removed
+- Updated wgit gem to version 0.9.0 which contains improvements and bug fixes.
+### Fixed
+- ...
+---
+## v0.11.0
+### Added
+- Additional crawl statistics.
+- Exit code handling to executable. `0` for success, `1` for an error scenario.
+### Changed/Removed
+- Updated the report formats slightly bringing various improvements such as the total number of links crawled etc.
+### Fixed
+- Bug in html report, summary url is now an `<a>` link.
+- Bug in `Finder@broken_link_map` URLs and `Finder#crawl_stats[:url]` URL during redirects.
+- Bug causing an error when crawling unparsable/invalid URLs.
+---
+## v0.10.0
+### Added
+- A `--html` flag to the `crawl` executable command which produces a HTML report (instead of text).
+- Added a 'retry' mechanism for any broken links found. This is essentially a verification step before generating a report.
+- `Finder#crawl_stats` for info such as crawl duration, total links crawled etc.
+### Changed/Removed
+- The API has changed somewhat. See the [docs](https://www.rubydoc.info/gems/broken_link_finder) for the up to date code signatures if you're using `broken_link_finder` outside of its executable.
+### Fixed
+- ...
+---
+## v0.9.5
+### Added
+- ...
+### Changed/Removed
+- Now using optimistic dep versioning.
+- Updated `wgit` to version 0.5.1 containing improvements and bug fixes.
+### Fixed
+- ...
+---
 ## v0.9.4
 ### Added
 - ...

data/Gemfile.lock CHANGED Viewed

@@ -1,53 +1,66 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.9.4)
-      thor (~> 0.20.3)
-      thread (~> 0.2.0)
-      wgit (~> 0.5.0)
+    broken_link_finder (0.12.0)
+      thor (~> 0.20)
+      thread (~> 0.2)
+      wgit (~> 0.10)
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.6.0)
-      public_suffix (>= 2.0.2, < 4.0)
-    bson (4.6.0)
-    byebug (11.0.1)
-    coderay (1.1.2)
-    crack (0.4.3)
-      safe_yaml (~> 1.0.0)
+    addressable (2.7.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    bson (4.12.0)
+    byebug (11.1.3)
+    cliver (0.3.2)
+    coderay (1.1.3)
+    concurrent-ruby (1.1.8)
+    crack (0.4.5)
+      rexml
     ethon (0.12.0)
       ffi (>= 1.3.0)
-    ffi (1.11.1)
-    hashdiff (1.0.0)
-    maxitest (3.4.0)
-      minitest (>= 5.0.0, < 5.13.0)
-    method_source (0.9.2)
-    mini_portile2 (2.4.0)
-    minitest (5.12.2)
-    mongo (2.9.2)
-      bson (>= 4.4.2, < 5.0.0)
-    nokogiri (1.10.5)
-      mini_portile2 (~> 2.4.0)
-    pry (0.12.2)
-      coderay (~> 1.1.0)
-      method_source (~> 0.9.0)
-    public_suffix (3.1.0)
-    rake (10.5.0)
-    safe_yaml (1.0.5)
+    ferrum (0.11)
+      addressable (~> 2.5)
+      cliver (~> 0.3)
+      concurrent-ruby (~> 1.1)
+      websocket-driver (>= 0.6, < 0.8)
+    ffi (1.15.0)
+    hashdiff (1.0.1)
+    maxitest (3.6.0)
+      minitest (>= 5.0.0, < 5.14.0)
+    method_source (1.0.0)
+    mini_portile2 (2.5.0)
+    minitest (5.13.0)
+    mongo (2.14.0)
+      bson (>= 4.8.2, < 5.0.0)
+    nokogiri (1.11.2)
+      mini_portile2 (~> 2.5.0)
+      racc (~> 1.4)
+    pry (0.14.0)
+      coderay (~> 1.1)
+      method_source (~> 1.0)
+    public_suffix (4.0.6)
+    racc (1.5.2)
+    rake (13.0.3)
+    rexml (3.2.4)
     thor (0.20.3)
     thread (0.2.2)
-    typhoeus (1.3.1)
+    typhoeus (1.4.0)
       ethon (>= 0.9.0)
-    webmock (3.7.6)
+    webmock (3.12.2)
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff (>= 0.4.0, < 2.0.0)
-    wgit (0.5.0)
-      addressable (~> 2.6.0)
-      mongo (~> 2.9.0)
-      nokogiri (~> 1.10.3)
-      typhoeus (~> 1.3.1)
+    websocket-driver (0.7.3)
+      websocket-extensions (>= 0.1.0)
+    websocket-extensions (0.1.5)
+    wgit (0.10.0)
+      addressable (~> 2.6)
+      ferrum (~> 0.8)
+      mongo (~> 2.9)
+      nokogiri (~> 1.10)
+      typhoeus (~> 1.3)
 PLATFORMS
   ruby
@@ -58,11 +71,11 @@ DEPENDENCIES
   byebug (~> 11.0)
   maxitest (~> 3.3)
   pry (~> 0.12)
-  rake (~> 10.0)
+  rake (~> 13.0)
   webmock (~> 3.6)
 RUBY VERSION
-   ruby 2.5.3p105
+   ruby 2.7.0p0
 BUNDLED WITH
-   2.0.1
+   2.1.4

data/README.md CHANGED Viewed

@@ -1,14 +1,16 @@
 # Broken Link Finder
-Does what it says on the tin; Finds a website's broken links.
+Does what it says on the tin - finds a website's broken links.
-Simply point it at a website and it will crawl all of its webpages searching for and identifing any broken links. You will then be presented with a concise summary of the broken links found.
+Simply point it at a website and it will crawl all of its webpages searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
-Because `libcurl` is used under the hood, Broken Link Finder is fast!
+Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!
 ## How It Works
-Any HTML page element with a `href` or `src` attribute is considered a link. For each link on a given page, any of the following conditions constitutes that the link is broken:
+Any HTML element within `<body>` with a `href` or `src` attribute is considered a link (this is [configurable](#Link-Extraction) however).
+For each link on a given page, any of the following conditions constitutes that the link is broken:
 - An empty HTML response body is returned.
 - A response status code of `404 Not Found` is returned.
@@ -29,27 +31,27 @@ With that said, the usual array of HTTP URL features are supported including anc
 ## Installation
-Add this line to your application's Gemfile:
+Only MRI Ruby is tested and supported, but `broken_link_finder` may work with other Ruby implementations.
-```ruby
-gem 'broken_link_finder'
-```
+Currently, the required MRI Ruby version is:
-And then execute:
+`~> 2.5` (a.k.a.) `>= 2.5 && < 3`
-    $ bundle
+### Using Bundler
-Or install it yourself as:
+    $ bundle add broken_link_finder
+### Using RubyGems
     $ gem install broken_link_finder
-Finally, verify the installation with:
+### Verify
     $ broken_link_finder version
 ## Usage
-You can check for broken links via the library or executable.
+You can check for broken links via the executable or library.
 ### Executable
@@ -57,7 +59,7 @@ Installing this gem installs the `broken_link_finder` executable into your `$PAT
     $ broken_link_finder crawl http://txti.es
-Adding the `-r` flag would crawl the entire `txti.es` site, not just its index page.
+Adding the `--recursive` flag would crawl the entire `txti.es` site, not just its index page.
 See the [output](#Output) section below for an example of a site with broken links.
@@ -76,7 +78,7 @@ require 'broken_link_finder'
 finder = BrokenLinkFinder.new
 finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
-finder.pretty_print_link_report    # Or use Finder#broken_links and Finder#ignored_links
+finder.report                      # Or use Finder#broken_links and Finder#ignored_links
                                    # for direct access to the link Hashes.
 ```
@@ -91,28 +93,62 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
 If broken links are found then the output will look something like:
 ```text
-Found 6 broken link(s) across 2 page(s):
+Crawled http://txti.es
+7 page(s) containing 32 unique link(s) in 6.82 seconds
+Found 6 unique broken link(s) across 2 page(s):
 The following broken links were found on 'http://txti.es/about':
 http://twitter.com/thebarrytone
+/doesntexist
 http://twitter.com/nwbld
-http://twitter.com/txties
-https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU
+twitter.com/txties
 The following broken links were found on 'http://txti.es/how':
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
-Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
-The following links were ignored on http://txti.es:
+The following links were ignored on 'http://txti.es':
 tel:+13174562564
 mailto:big.jim@jmail.com
-The following links were ignored on http://txti.es/contact:
+The following links were ignored on 'http://txti.es/contact':
 ftp://server.com
 ```
+You can provide the `--html` flag if you'd prefer a HTML based report.
+## Link Extraction
+You can customise the XPath used to extract links from each crawled page. This can be done via the executable or library.
+### Executable
+Add the `--xpath` (or `-x`) flag to the crawl command e.g.
+    $ broken_link_finder crawl http://txti.es -x //img/@src
+### Library
+Set the desired XPath using the accessor methods provided:
+> main.rb
+```ruby
+require 'broken_link_finder'
+# Set your desired xpath before crawling...
+BrokenLinkFinder::link_xpath = '//img/@src'
+# Now crawl as normal and only your custom targeted links will be checked.
+BrokenLinkFinder.new.crawl_page 'http://txti.es'
+# Go back to using the default provided xpath as needed.
+BrokenLinkFinder::link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH
+```
 ## Contributing
 Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
@@ -128,11 +164,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
 To install this gem onto your local machine, run `bundle exec rake install`.
 To release a new gem version:
-- Update the deps in the `*.gemspec` if necessary
-- Update the version number in `version.rb` and add the new version to the `CHANGELOG`
-- Run `bundle install`
-- Run `bundle exec rake test` ensuring all tests pass
-- Run `bundle exec rake compile` ensuring no warnings
-- Run `bundle exec rake install && rbenv rehash`
-- Manually test the executable
-- Run `bundle exec rake release[origin]`
+- Update the deps in the `*.gemspec`, if necessary.
+- Update the version number in `version.rb` and add the new version to the `CHANGELOG`.
+- Run `bundle install`.
+- Run `bundle exec rake test` ensuring all tests pass.
+- Run `bundle exec rake compile` ensuring no warnings.
+- Run `bundle exec rake install && rbenv rehash`.
+- Manually test the executable.
+- Run `bundle exec rake release[origin]`.

data/benchmark.rb CHANGED Viewed

@@ -10,15 +10,19 @@ finder = BrokenLinkFinder::Finder.new
 puts Benchmark.measure { finder.crawl_site url }
 puts "Links crawled: #{finder.total_links_crawled}"
-# http://txti.es page crawl
-# Pre  threading: 17.5 seconds
-# Post threading: 7.5  seconds
+# http://txti.es page crawl with threading
+# Pre:  17.5 seconds
+# Post: 7.5  seconds
-# http://txti.es post threading - page vs site crawl
+# http://txti.es with threading - page vs site crawl
 # Page: 9.526981
 # Site: 9.732416
 # Multi-threading crawl_site now yields the same time as a single page
-# Large site crawl - post all link recording functionality
+# Large site crawl - all link recording functionality
 # Pre:  608 seconds with 7665 links crawled
 # Post: 355 seconds with 1099 links crawled
+# Large site crawl - retry mechanism
+# Pre:  140 seconds
+# Post: 170 seconds

data/bin/console CHANGED Viewed

@@ -5,20 +5,10 @@ require 'bundler/setup'
 require 'pry'
 require 'byebug'
 require 'broken_link_finder'
+require 'logger'
-# Monkey patch and log all HTTP requests made during the console.
-module Typhoeus
-  singleton_class.class_eval do
-    alias_method :orig_get, :get
-  end
-  def self.get(base_url, options = {})
-    puts "[typhoeus] Sending GET: #{base_url}"
-    resp = orig_get(base_url, options)
-    puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
-    resp
-  end
-end
+# Logs all HTTP requests.
+Wgit.logger.level = Logger::DEBUG
 # Call reload to load all recent code changes.
 def reload
@@ -33,12 +23,14 @@ end
 # You can add fixtures and/or initialization code here...
 reload
-url = 'http://txti.es/'
-by_page = Finder.new
-by_link = Finder.new sort: :link
-finder = by_page
+def url; @url ||= 'http://txti.es/'; end
+def by_page; @by_page ||= Finder.new; end
+def by_link; @by_link ||= Finder.new(sort: :link); end
+def finder; @finder ||= by_page; end
 # Start the console.
-puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION}"
+puts
+puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+puts
-binding.pry
+Pry.start

data/bin/setup CHANGED Viewed

@@ -5,4 +5,4 @@ set -vx
 bundle install
-# Do any other automated setup that you need to do here
+# Do any other automated setup that you need to do here...

data/broken_link_finder.gemspec CHANGED Viewed

@@ -15,7 +15,10 @@ Gem::Specification.new do |spec|
   spec.homepage      = 'https://github.com/michaeltelford/broken-link-finder'
   spec.license       = 'MIT'
   spec.metadata      = {
-    'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder'
+    'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder',
+    'changelog_uri' => 'https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md',
+    'bug_tracker_uri' => 'https://github.com/michaeltelford/broken-link-finder/issues',
+    'documentation_uri' => 'https://www.rubydoc.info/gems/broken_link_finder'
   }
   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
@@ -41,10 +44,10 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'byebug', '~> 11.0'
   spec.add_development_dependency 'maxitest', '~> 3.3'
   spec.add_development_dependency 'pry', '~> 0.12'
-  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'rake', '~> 13.0'
   spec.add_development_dependency 'webmock', '~> 3.6'
-  spec.add_runtime_dependency 'thor', '~> 0.20.3'
-  spec.add_runtime_dependency 'thread', '~> 0.2.0'
-  spec.add_runtime_dependency 'wgit', '~> 0.5.0'
+  spec.add_runtime_dependency 'thor', '~> 0.20'
+  spec.add_runtime_dependency 'thread', '~> 0.2'
+  spec.add_runtime_dependency 'wgit', '~> 0.10'
 end