RubyGems - broken_link_finder - Versions diffs - 0.9.4 → 0.12.0 - Mend

broken_link_finder 0.9.4 → 0.12.0

Files changed (21) hide show

checksums.yaml +4 -4
data/.ruby-version +1 -1
data/CHANGELOG.md +52 -0
data/Gemfile.lock +51 -38
data/README.md +65 -29
data/benchmark.rb +9 -5
data/bin/console +11 -19
data/bin/setup +1 -1
data/broken_link_finder.gemspec +8 -5
data/exe/broken_link_finder +14 -3
data/lib/broken_link_finder.rb +8 -2
data/lib/broken_link_finder/finder.rb +131 -132
data/lib/broken_link_finder/link_manager.rb +137 -0
data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
data/lib/broken_link_finder/reporter/reporter.rb +76 -0
data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
data/lib/broken_link_finder/version.rb +1 -1
data/lib/broken_link_finder/wgit_extensions.rb +25 -5
data/lib/broken_link_finder/xpath.rb +14 -0
metadata +21 -15
data/lib/broken_link_finder/reporter.rb +0 -116

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: cb0cc981acce272911be9d8a3ed36dd49e0f621eee3e9fd71893020da1600945
-  data.tar.gz: 3b368404cf3b2da83445212c44e43f32ad7d1fc5119c8980aeaa04540ebce2c9
+  metadata.gz: 24ca9c7a6071b07f5ab3132c9c79c4628570c9c3e157b77a27a05cdc0578ac6e
+  data.tar.gz: 6668eb430c8296e1439f56c242e7e08a27733605d724ec1c5cfa638dcfaa8b52
 SHA512:
-  metadata.gz: 92ffd946b60411dba032ac30b8a96820dea262520ab92e1f2d64c48477d4c4ca6e22fe41d221fb421423565f9d61883b48017c1c5af651c1bb71ba96eacf490c
-  data.tar.gz: 17455ab4cf7cb3ab0df9763b98cc844b1c1c07ed702c600cc623e263119a8f071b8a9f55519a021662db64e6bffda91d1c5d439ed8594ae87d830da19acf3529
+  metadata.gz: 1d1cdc47ade4651b8bc2df01212364ba938ee73269bf53e7278519ecd374247291c932abfa73a031973403ed55d360bc9d14b5c60ba312aca4b32837b5064294
+  data.tar.gz: f56308da4b9d7a4a39afd43808f77d2b6f2fbbf00f17502d2d889de504bcc82ee1858fb673333b11693a65ad73a4f5fb65a97b15955443e8268b1e0ab08b4e51

data/.ruby-version CHANGED Viewed

	@@ -1 +1 @@
1	- 2.5.3
1	+ 2.7.0

data/CHANGELOG.md CHANGED Viewed

@@ -9,6 +9,58 @@
 - ...
 ---
+## v0.12.0
+### Added
+- `BrokenLinkFinder::link_xpath` and `link_xpath=` methods so you can customise how links are extracted from each crawled page using the API.
+- An `--xpath` (or just `-x`) command line flag so you can customise how links are extracted when using the command line.
+### Changed/Removed
+- Changed the default way in which links are extracted from a page. Previously any element with a `href` or `src` attribute was extracted and checked; now only those links inside the `<body>` are extracted and checked, ignoring the `<head>` section entirely. You can change this behaviour back with: `BrokenLinkFinder::link_xpath = '//*/@href | //*/@src'` before you perform a crawl. Alternatively, if using the command line, use the `--xpath '//*/@href | //*/@src'` option (quote the XPath so the shell doesn't interpret `|` as a pipe).
+### Fixed
+- [Scheme relative bug](https://github.com/michaeltelford/broken_link_finder/issues/16) by upgrading to `wgit v0.10.0`.
+---
+## v0.11.1
+### Added
+- ...
+### Changed/Removed
+- Updated wgit gem to version 0.9.0 which contains improvements and bug fixes.
+### Fixed
+- ...
+---
+## v0.11.0
+### Added
+- Additional crawl statistics.
+- Exit code handling to executable. `0` for success, `1` for an error scenario.
+### Changed/Removed
+- Updated the report formats slightly bringing various improvements such as the total number of links crawled etc.
+### Fixed
+- Bug in html report, summary url is now an `<a>` link.
+- Bug in `Finder@broken_link_map` URLs and `Finder#crawl_stats[:url]` URL during redirects.
+- Bug causing an error on crawling unparsable/invalid URLs.
+---
+## v0.10.0
+### Added
+- A `--html` flag to the `crawl` executable command which produces a HTML report (instead of text).
+- Added a 'retry' mechanism for any broken links found. This is essentially a verification step before generating a report.
+- `Finder#crawl_stats` for info such as crawl duration, total links crawled etc.
+### Changed/Removed
+- The API has changed somewhat. See the [docs](https://www.rubydoc.info/gems/broken_link_finder) for the up to date code signatures if you're using `broken_link_finder` outside of its executable.
+### Fixed
+- ...
+---
+## v0.9.5
+### Added
+- ...
+### Changed/Removed
+- Now using optimistic dep versioning.
+- Updated `wgit` to version 0.5.1 containing improvements and bug fixes.
+### Fixed
+- ...
+---
 ## v0.9.4
 ### Added
 - ...

data/Gemfile.lock CHANGED Viewed

@@ -1,53 +1,66 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.9.4)
-      thor (~> 0.20.3)
-      thread (~> 0.2.0)
-      wgit (~> 0.5.0)
+    broken_link_finder (0.12.0)
+      thor (~> 0.20)
+      thread (~> 0.2)
+      wgit (~> 0.10)
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.6.0)
-      public_suffix (>= 2.0.2, < 4.0)
-    bson (4.6.0)
-    byebug (11.0.1)
-    coderay (1.1.2)
-    crack (0.4.3)
-      safe_yaml (~> 1.0.0)
+    addressable (2.7.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    bson (4.12.0)
+    byebug (11.1.3)
+    cliver (0.3.2)
+    coderay (1.1.3)
+    concurrent-ruby (1.1.8)
+    crack (0.4.5)
+      rexml
     ethon (0.12.0)
       ffi (>= 1.3.0)
-    ffi (1.11.1)
-    hashdiff (1.0.0)
-    maxitest (3.4.0)
-      minitest (>= 5.0.0, < 5.13.0)
-    method_source (0.9.2)
-    mini_portile2 (2.4.0)
-    minitest (5.12.2)
-    mongo (2.9.2)
-      bson (>= 4.4.2, < 5.0.0)
-    nokogiri (1.10.5)
-      mini_portile2 (~> 2.4.0)
-    pry (0.12.2)
-      coderay (~> 1.1.0)
-      method_source (~> 0.9.0)
-    public_suffix (3.1.0)
-    rake (10.5.0)
-    safe_yaml (1.0.5)
+    ferrum (0.11)
+      addressable (~> 2.5)
+      cliver (~> 0.3)
+      concurrent-ruby (~> 1.1)
+      websocket-driver (>= 0.6, < 0.8)
+    ffi (1.15.0)
+    hashdiff (1.0.1)
+    maxitest (3.6.0)
+      minitest (>= 5.0.0, < 5.14.0)
+    method_source (1.0.0)
+    mini_portile2 (2.5.0)
+    minitest (5.13.0)
+    mongo (2.14.0)
+      bson (>= 4.8.2, < 5.0.0)
+    nokogiri (1.11.2)
+      mini_portile2 (~> 2.5.0)
+      racc (~> 1.4)
+    pry (0.14.0)
+      coderay (~> 1.1)
+      method_source (~> 1.0)
+    public_suffix (4.0.6)
+    racc (1.5.2)
+    rake (13.0.3)
+    rexml (3.2.4)
     thor (0.20.3)
     thread (0.2.2)
-    typhoeus (1.3.1)
+    typhoeus (1.4.0)
       ethon (>= 0.9.0)
-    webmock (3.7.6)
+    webmock (3.12.2)
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff (>= 0.4.0, < 2.0.0)
-    wgit (0.5.0)
-      addressable (~> 2.6.0)
-      mongo (~> 2.9.0)
-      nokogiri (~> 1.10.3)
-      typhoeus (~> 1.3.1)
+    websocket-driver (0.7.3)
+      websocket-extensions (>= 0.1.0)
+    websocket-extensions (0.1.5)
+    wgit (0.10.0)
+      addressable (~> 2.6)
+      ferrum (~> 0.8)
+      mongo (~> 2.9)
+      nokogiri (~> 1.10)
+      typhoeus (~> 1.3)
 PLATFORMS
   ruby
@@ -58,11 +71,11 @@ DEPENDENCIES
   byebug (~> 11.0)
   maxitest (~> 3.3)
   pry (~> 0.12)
-  rake (~> 10.0)
+  rake (~> 13.0)
   webmock (~> 3.6)
 RUBY VERSION
-   ruby 2.5.3p105
+   ruby 2.7.0p0
 BUNDLED WITH
-   2.0.1
+   2.1.4

data/README.md CHANGED Viewed

@@ -1,14 +1,16 @@
 # Broken Link Finder
-Does what it says on the tin; Finds a website's broken links.
+Does what it says on the tin - finds a website's broken links.
-Simply point it at a website and it will crawl all of its webpages searching for and identifing any broken links. You will then be presented with a concise summary of the broken links found.
+Simply point it at a website and it will crawl all of its webpages searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
-Because `libcurl` is used under the hood, Broken Link Finder is fast!
+Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!
 ## How It Works
-Any HTML page element with a `href` or `src` attribute is considered a link. For each link on a given page, any of the following conditions constitutes that the link is broken:
+Any HTML element within `<body>` with a `href` or `src` attribute is considered a link (this is [configurable](#Link-Extraction) however).
+For each link on a given page, any of the following conditions constitutes that the link is broken:
 - An empty HTML response body is returned.
 - A response status code of `404 Not Found` is returned.
@@ -29,27 +31,27 @@ With that said, the usual array of HTTP URL features are supported including anc
 ## Installation
-Add this line to your application's Gemfile:
+Only MRI Ruby is tested and supported, but `broken_link_finder` may work with other Ruby implementations.
-```ruby
-gem 'broken_link_finder'
-```
+Currently, the required MRI Ruby version is:
-And then execute:
+`~> 2.5` (a.k.a.) `>= 2.5 && < 3`
-    $ bundle
+### Using Bundler
-Or install it yourself as:
+    $ bundle add broken_link_finder
+### Using RubyGems
     $ gem install broken_link_finder
-Finally, verify the installation with:
+### Verify
     $ broken_link_finder version
 ## Usage
-You can check for broken links via the library or executable.
+You can check for broken links via the executable or library.
 ### Executable
@@ -57,7 +59,7 @@ Installing this gem installs the `broken_link_finder` executable into your `$PAT
     $ broken_link_finder crawl http://txti.es
-Adding the `-r` flag would crawl the entire `txti.es` site, not just its index page.
+Adding the `--recursive` flag would crawl the entire `txti.es` site, not just its index page.
 See the [output](#Output) section below for an example of a site with broken links.
@@ -76,7 +78,7 @@ require 'broken_link_finder'
 finder = BrokenLinkFinder.new
 finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
-finder.pretty_print_link_report    # Or use Finder#broken_links and Finder#ignored_links
+finder.report                      # Or use Finder#broken_links and Finder#ignored_links
                                    # for direct access to the link Hashes.
 ```
@@ -91,28 +93,62 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
 If broken links are found then the output will look something like:
 ```text
-Found 6 broken link(s) across 2 page(s):
+Crawled http://txti.es
+7 page(s) containing 32 unique link(s) in 6.82 seconds
+Found 6 unique broken link(s) across 2 page(s):
 The following broken links were found on 'http://txti.es/about':
 http://twitter.com/thebarrytone
+/doesntexist
 http://twitter.com/nwbld
-http://twitter.com/txties
-https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU
+twitter.com/txties
 The following broken links were found on 'http://txti.es/how':
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
-Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
-The following links were ignored on http://txti.es:
+The following links were ignored on 'http://txti.es':
 tel:+13174562564
 mailto:big.jim@jmail.com
-The following links were ignored on http://txti.es/contact:
+The following links were ignored on 'http://txti.es/contact':
 ftp://server.com
 ```
+You can provide the `--html` flag if you'd prefer a HTML based report.
+## Link Extraction
+You can customise the XPath used to extract links from each crawled page. This can be done via the executable or library.
+### Executable
+Add the `--xpath` (or `-x`) flag to the crawl command e.g.
+    $ broken_link_finder crawl http://txti.es -x //img/@src
+### Library
+Set the desired XPath using the accessor methods provided:
+> main.rb
+```ruby
+require 'broken_link_finder'
+# Set your desired xpath before crawling...
+BrokenLinkFinder::link_xpath = '//img/@src'
+# Now crawl as normal and only your custom targeted links will be checked.
+BrokenLinkFinder.new.crawl_page 'http://txti.es'
+# Go back to using the default provided xpath as needed.
+BrokenLinkFinder::link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH
+```
 ## Contributing
 Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
@@ -128,11 +164,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
 To install this gem onto your local machine, run `bundle exec rake install`.
 To release a new gem version:
-- Update the deps in the `*.gemspec` if necessary
-- Update the version number in `version.rb` and add the new version to the `CHANGELOG`
-- Run `bundle install`
-- Run `bundle exec rake test` ensuring all tests pass
-- Run `bundle exec rake compile` ensuring no warnings
-- Run `bundle exec rake install && rbenv rehash`
-- Manually test the executable
-- Run `bundle exec rake release[origin]`
+- Update the deps in the `*.gemspec`, if necessary.
+- Update the version number in `version.rb` and add the new version to the `CHANGELOG`.
+- Run `bundle install`.
+- Run `bundle exec rake test` ensuring all tests pass.
+- Run `bundle exec rake compile` ensuring no warnings.
+- Run `bundle exec rake install && rbenv rehash`.
+- Manually test the executable.
+- Run `bundle exec rake release[origin]`.

data/benchmark.rb CHANGED Viewed

@@ -10,15 +10,19 @@ finder = BrokenLinkFinder::Finder.new
 puts Benchmark.measure { finder.crawl_site url }
 puts "Links crawled: #{finder.total_links_crawled}"
-# http://txti.es page crawl
-# Pre  threading: 17.5 seconds
-# Post threading: 7.5  seconds
+# http://txti.es page crawl with threading
+# Pre:  17.5 seconds
+# Post: 7.5  seconds
-# http://txti.es post threading - page vs site crawl
+# http://txti.es with threading - page vs site crawl
 # Page: 9.526981
 # Site: 9.732416
 # Multi-threading crawl_site now yields the same time as a single page
-# Large site crawl - post all link recording functionality
+# Large site crawl - all link recording functionality
 # Pre:  608 seconds with 7665 links crawled
 # Post: 355 seconds with 1099 links crawled
+# Large site crawl - retry mechanism
+# Pre:  140 seconds
+# Post: 170 seconds

data/bin/console CHANGED Viewed

@@ -5,20 +5,10 @@ require 'bundler/setup'
 require 'pry'
 require 'byebug'
 require 'broken_link_finder'
+require 'logger'
-# Monkey patch and log all HTTP requests made during the console.
-module Typhoeus
-  singleton_class.class_eval do
-    alias_method :orig_get, :get
-  end
-  def self.get(base_url, options = {})
-    puts "[typhoeus] Sending GET: #{base_url}"
-    resp = orig_get(base_url, options)
-    puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
-    resp
-  end
-end
+# Logs all HTTP requests.
+Wgit.logger.level = Logger::DEBUG
 # Call reload to load all recent code changes.
 def reload
@@ -33,12 +23,14 @@ end
 # You can add fixtures and/or initialization code here...
 reload
-url = 'http://txti.es/'
-by_page = Finder.new
-by_link = Finder.new sort: :link
-finder = by_page
+def url; @url ||= 'http://txti.es/'; end
+def by_page; @by_page ||= Finder.new; end
+def by_link; @by_link ||= Finder.new(sort: :link); end
+def finder; @finder ||= by_page; end
 # Start the console.
-puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION}"
+puts
+puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+puts
-binding.pry
+Pry.start

data/bin/setup CHANGED Viewed

@@ -5,4 +5,4 @@ set -vx
 bundle install
-# Do any other automated setup that you need to do here
+# Do any other automated setup that you need to do here...

data/broken_link_finder.gemspec CHANGED Viewed

@@ -15,7 +15,10 @@ Gem::Specification.new do |spec|
   spec.homepage      = 'https://github.com/michaeltelford/broken-link-finder'
   spec.license       = 'MIT'
   spec.metadata      = {
-    'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder'
+    'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder',
+    'changelog_uri' => 'https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md',
+    'bug_tracker_uri' => 'https://github.com/michaeltelford/broken-link-finder/issues',
+    'documentation_uri' => 'https://www.rubydoc.info/gems/broken_link_finder'
   }
   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
@@ -41,10 +44,10 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'byebug', '~> 11.0'
   spec.add_development_dependency 'maxitest', '~> 3.3'
   spec.add_development_dependency 'pry', '~> 0.12'
-  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'rake', '~> 13.0'
   spec.add_development_dependency 'webmock', '~> 3.6'
-  spec.add_runtime_dependency 'thor', '~> 0.20.3'
-  spec.add_runtime_dependency 'thread', '~> 0.2.0'
-  spec.add_runtime_dependency 'wgit', '~> 0.5.0'
+  spec.add_runtime_dependency 'thor', '~> 0.20'
+  spec.add_runtime_dependency 'thread', '~> 0.2'
+  spec.add_runtime_dependency 'wgit', '~> 0.10'
 end