broken_link_finder 0.9.3 → 0.11.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 0e03d27fce04a04ff97aa40ea895b01272926915367712a865787cda0efc3a0a
- data.tar.gz: 487abab06c5664fb4024e58c7e1ba43209d076100d5d7da275db2f534aee4c4b
+ metadata.gz: 42e88495f7e7742db433223408b4a380c1d48e98a5a43e6da5303d3e7b024454
+ data.tar.gz: eae7fc953f0d8aa1bb1f9d5b53183cd68a15a9f83ab341f51023744b2d148063
  SHA512:
- metadata.gz: 2b90b76061107fcde9b79e6b737710ee067be255182bdd7726ba2343aa64f0357b9901217f78cd0e3928244610cb9ad37007db8d316bbbef2a50355213d5764b
- data.tar.gz: 5d727113d883f94e263933c726ea0eb0a41da343844c044d8220e042b877ac84b5c806e2f2d5570dd0dd9eb09a56f4603676b9194bc62201a01cb9fba7b14e23
+ metadata.gz: 4496db994bfba83deeb14a1b870f43e2cfd2afa94f30b6596ee610f23103b55ae0d84a6443a3204b02ed8875c0daf0d8e9c565aaebd21173d5c4353509dac3c8
+ data.tar.gz: 2d70ee94d7128e6e212bc385e1045fd465c121f58b9a0d036d392ae1cbb5cd9ef5ea47e29eda85b6f17a0b0f5547902ca818967b3ffb4ad87c7d0b271da5323a
@@ -1 +1 @@
- 2.5.3
+ 2.7.0
@@ -9,6 +9,57 @@
  - ...
  ---

+ ## v0.11.1
+ ### Added
+ - ...
+ ### Changed/Removed
+ - Updated the wgit gem to version 0.9.0, which contains improvements and bug fixes.
+ ### Fixed
+ - ...
+ ---
+
+ ## v0.11.0
+ ### Added
+ - Additional crawl statistics.
+ - Exit code handling to the executable: `0` for success, `1` for an error scenario.
+ ### Changed/Removed
+ - Updated the report formats slightly, bringing various improvements such as the total number of links crawled.
+ ### Fixed
+ - Bug in the HTML report; the summary URL is now an `<a>` link.
+ - Bug in `Finder@broken_link_map` URLs and the `Finder#crawl_stats[:url]` URL during redirects.
+ - Bug causing an error when crawling unparsable/invalid URLs.
+ ---
+
+ ## v0.10.0
+ ### Added
+ - A `--html` flag to the `crawl` executable command which produces an HTML report (instead of text).
+ - A 'retry' mechanism for any broken links found. This is essentially a verification step before generating a report.
+ - `Finder#crawl_stats` for info such as crawl duration, total links crawled, etc.
+ ### Changed/Removed
+ - The API has changed somewhat. See the [docs](https://www.rubydoc.info/gems/broken_link_finder) for the up-to-date code signatures if you're using `broken_link_finder` outside of its executable.
+ ### Fixed
+ - ...
+ ---
+
+ ## v0.9.5
+ ### Added
+ - ...
+ ### Changed/Removed
+ - Now using optimistic dependency versioning.
+ - Updated `wgit` to version 0.5.1, containing improvements and bug fixes.
+ ### Fixed
+ - ...
+ ---
+
+ ## v0.9.4
+ ### Added
+ - ...
+ ### Changed/Removed
+ - Updated the `wgit` gem to version 0.5.0, which contains improvements and bug fixes.
+ ### Fixed
+ - ...
+ ---
+
  ## v0.9.3
  ### Added
  - ...
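
Taken together, the v0.10.0 and v0.11.x entries change how the gem is driven from Ruby. Below is a minimal sketch of the resulting API, based on the method signatures shown in the `finder.rb` diff further down; the target URL is illustrative only.

```ruby
require 'broken_link_finder'

# sort: and max_threads: are the two Finder options; both have defaults.
finder = BrokenLinkFinder.new(sort: :page, max_threads: 50)
finder.crawl_site('http://txti.es') # Or finder.crawl_page(url) for a single page.

# Text report to STDOUT by default; type: :html produces the HTML report added in v0.10.0.
finder.report(type: :text, broken_verbose: true, ignored_verbose: false)

# Crawl statistics (duration, total links crawled, etc.) were added in v0.10.0.
puts finder.crawl_stats.inspect
```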
@@ -1,53 +1,64 @@
  PATH
  remote: .
  specs:
- broken_link_finder (0.9.3)
- thor (~> 0.20.3)
- thread (~> 0.2.0)
- wgit (~> 0.4.1)
+ broken_link_finder (0.11.1)
+ thor (~> 0.20)
+ thread (~> 0.2)
+ wgit (~> 0.9)

  GEM
  remote: https://rubygems.org/
  specs:
- addressable (2.6.0)
- public_suffix (>= 2.0.2, < 4.0)
- bson (4.5.0)
- byebug (11.0.1)
- coderay (1.1.2)
+ addressable (2.7.0)
+ public_suffix (>= 2.0.2, < 5.0)
+ bson (4.10.0)
+ byebug (11.1.3)
+ cliver (0.3.2)
+ coderay (1.1.3)
+ concurrent-ruby (1.1.6)
  crack (0.4.3)
  safe_yaml (~> 1.0.0)
  ethon (0.12.0)
  ffi (>= 1.3.0)
- ffi (1.11.1)
- hashdiff (1.0.0)
- maxitest (3.4.0)
- minitest (>= 5.0.0, < 5.13.0)
- method_source (0.9.2)
+ ferrum (0.9)
+ addressable (~> 2.5)
+ cliver (~> 0.3)
+ concurrent-ruby (~> 1.1)
+ websocket-driver (>= 0.6, < 0.8)
+ ffi (1.13.1)
+ hashdiff (1.0.1)
+ maxitest (3.6.0)
+ minitest (>= 5.0.0, < 5.14.0)
+ method_source (1.0.0)
  mini_portile2 (2.4.0)
- minitest (5.12.2)
- mongo (2.9.2)
- bson (>= 4.4.2, < 5.0.0)
- nokogiri (1.10.4)
+ minitest (5.13.0)
+ mongo (2.13.0)
+ bson (>= 4.8.2, < 5.0.0)
+ nokogiri (1.10.10)
  mini_portile2 (~> 2.4.0)
- pry (0.12.2)
- coderay (~> 1.1.0)
- method_source (~> 0.9.0)
- public_suffix (3.1.0)
- rake (10.5.0)
+ pry (0.13.1)
+ coderay (~> 1.1)
+ method_source (~> 1.0)
+ public_suffix (4.0.5)
+ rake (13.0.1)
  safe_yaml (1.0.5)
  thor (0.20.3)
  thread (0.2.2)
- typhoeus (1.3.1)
+ typhoeus (1.4.0)
  ethon (>= 0.9.0)
- webmock (3.7.6)
+ webmock (3.8.3)
  addressable (>= 2.3.6)
  crack (>= 0.3.2)
  hashdiff (>= 0.4.0, < 2.0.0)
- wgit (0.4.1)
- addressable (~> 2.6.0)
- mongo (~> 2.9.0)
- nokogiri (~> 1.10.3)
- typhoeus (~> 1.3.1)
+ websocket-driver (0.7.3)
+ websocket-extensions (>= 0.1.0)
+ websocket-extensions (0.1.5)
+ wgit (0.9.0)
+ addressable (~> 2.6)
+ ferrum (~> 0.8)
+ mongo (~> 2.9)
+ nokogiri (~> 1.10)
+ typhoeus (~> 1.3)

  PLATFORMS
  ruby
@@ -58,11 +69,11 @@ DEPENDENCIES
  byebug (~> 11.0)
  maxitest (~> 3.3)
  pry (~> 0.12)
- rake (~> 10.0)
+ rake (~> 13.0)
  webmock (~> 3.6)

  RUBY VERSION
- ruby 2.5.3p105
+ ruby 2.7.0p0

  BUNDLED WITH
- 2.0.1
+ 2.1.4
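
The lock file reflects the move to optimistic dependency versioning noted in the v0.9.5 changelog entry above. A hypothetical consumer `Gemfile` entry pinning this release line would look like:

```ruby
# Hypothetical Gemfile entry; any 0.11.x release of the gem would satisfy it.
gem 'broken_link_finder', '~> 0.11'
```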
data/README.md CHANGED
@@ -1,8 +1,10 @@
  # Broken Link Finder

- Does what it says on the tin; Finds a website's broken links.
+ Does what it says on the tin - finds a website's broken links.

- Simply point it at a website and it will crawl all of its webpages searching for and identifing any broken links. You will then be presented with a nice concise summary of the broken links found.
+ Simply point it at a website and it will crawl all of its webpages, searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
+
+ Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!

  ## How It Works

@@ -10,7 +12,7 @@ Any HTML page element with a `href` or `src` attribute is considered a link. For
  - An empty HTML response body is returned.
  - A response status code of `404 Not Found` is returned.
- - The HTML response body doesn't contain an element ID matching that of the link's anchor e.g. `http://server.com#about` must contain an element with `id="about"` or the link is considered broken.
+ - The HTML response body doesn't contain an element ID matching that of the link's fragment e.g. `http://server.com#about` must contain an element with `id="about"` or the link is considered broken.
  - The link redirects more than 5 times consecutively.

  **Note**: Not all link types are supported.
@@ -55,7 +57,7 @@ Installing this gem installs the `broken_link_finder` executable into your `$PATH
      $ broken_link_finder crawl http://txti.es

- Adding the `-r` flag would crawl the entire `txti.es` site, not just its index page.
+ Adding the `--recursive` flag would crawl the entire `txti.es` site, not just its index page.

  See the [output](#Output) section below for an example of a site with broken links.

@@ -73,9 +75,9 @@ Below is a simple script which crawls a website and outputs its broken links to
  require 'broken_link_finder'

  finder = BrokenLinkFinder.new
- finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
- finder.pretty_print_link_report # Or use Finder#broken_links and Finder#ignored_links
- # for direct access to the link Hashes.
+ finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
+ finder.report # Or use Finder#broken_links and Finder#ignored_links
+ # for direct access to the link Hashes.
  ```

  Then execute the script with:
@@ -89,28 +91,33 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
  If broken links are found then the output will look something like:

  ```text
- Found 6 broken link(s) across 2 page(s):
+ Crawled http://txti.es
+ 7 page(s) containing 32 unique link(s) in 6.82 seconds
+
+ Found 6 unique broken link(s) across 2 page(s):

  The following broken links were found on 'http://txti.es/about':
  http://twitter.com/thebarrytone
+ /doesntexist
  http://twitter.com/nwbld
- http://twitter.com/txties
- https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU
+ twitter.com/txties

  The following broken links were found on 'http://txti.es/how':
  http://en.wikipedia.org/wiki/Markdown
  http://imgur.com

- Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+ Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:

- The following links were ignored on http://txti.es:
+ The following links were ignored on 'http://txti.es':
  tel:+13174562564
  mailto:big.jim@jmail.com

- The following links were ignored on http://txti.es/contact:
+ The following links were ignored on 'http://txti.es/contact':
  ftp://server.com
  ```

+ You can provide the `--html` flag if you'd prefer an HTML-based report.
+
  ## Contributing

  Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
@@ -126,9 +133,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
  To install this gem onto your local machine, run `bundle exec rake install`.

  To release a new gem version:
- - Update the version number in `version.rb` and add the new version to the `CHANGELOG`
- - Run `bundle install`
- - Run `bundle exec rake test` ensuring all tests pass
- - Run `bundle exec rake compile` ensuring no warnings
- - Run `bundle exec rake install && rbenv rehash` and manually test the executable
- - Run `bundle exec rake release[origin]`
+ - Update the deps in the `*.gemspec`, if necessary.
+ - Update the version number in `version.rb` and add the new version to the `CHANGELOG`.
+ - Run `bundle install`.
+ - Run `bundle exec rake test` ensuring all tests pass.
+ - Run `bundle exec rake compile` ensuring no warnings.
+ - Run `bundle exec rake install && rbenv rehash`.
+ - Manually test the executable.
+ - Run `bundle exec rake release[origin]`.
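
The README now documents both the `--recursive` and `--html` flags. A small, hypothetical wrapper script showing them together follows; the site URL and output filename are illustrative, and since the report is written to STDOUT it is redirected to a file here.

```ruby
# Hypothetical wrapper around the executable; assumes it is installed on $PATH.
site   = 'http://txti.es'
output = 'report.html'

# --recursive crawls the whole site; --html switches the report format.
system("broken_link_finder crawl #{site} --recursive --html > #{output}")
```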
@@ -10,15 +10,19 @@ finder = BrokenLinkFinder::Finder.new
  puts Benchmark.measure { finder.crawl_site url }
  puts "Links crawled: #{finder.total_links_crawled}"

- # http://txti.es page crawl
- # Pre threading: 17.5 seconds
- # Post threading: 7.5 seconds
+ # http://txti.es page crawl with threading
+ # Pre: 17.5 seconds
+ # Post: 7.5 seconds

- # http://txti.es post threading - page vs site crawl
+ # http://txti.es with threading - page vs site crawl
  # Page: 9.526981
  # Site: 9.732416
  # Multi-threading crawl_site now yields the same time as a single page

- # Large site crawl - post all link recording functionality
+ # Large site crawl - all link recording functionality
  # Pre: 608 seconds with 7665 links crawled
  # Post: 355 seconds with 1099 links crawled
+
+ # Large site crawl - retry mechanism
+ # Pre: 140 seconds
+ # Post: 170 seconds
@@ -5,20 +5,10 @@ require 'bundler/setup'
  require 'pry'
  require 'byebug'
  require 'broken_link_finder'
+ require 'logger'

- # Monkey patch and log all HTTP requests made during the console.
- module Typhoeus
- singleton_class.class_eval do
- alias_method :orig_get, :get
- end
-
- def self.get(base_url, options = {})
- puts "[typhoeus] Sending GET: #{base_url}"
- resp = orig_get(base_url, options)
- puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
- resp
- end
- end
+ # Logs all HTTP requests.
+ Wgit.logger.level = Logger::DEBUG

  # Call reload to load all recent code changes.
  def reload
@@ -33,12 +23,14 @@ end
  # You can add fixtures and/or initialization code here...
  reload

- url = 'http://txti.es/'
- by_page = Finder.new
- by_link = Finder.new sort: :link
- finder = by_page
+ def url; @url ||= 'http://txti.es/'; end
+ def by_page; @by_page ||= Finder.new; end
+ def by_link; @by_link ||= Finder.new(sort: :link); end
+ def finder; @finder ||= by_page; end

  # Start the console.
- puts "\nbroken_link_finder v#{BrokenLinkFinder::VERSION}"
+ puts
+ puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+ puts

- binding.pry
+ Pry.start
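
The console now enables request logging through Wgit's own logger rather than monkey patching Typhoeus. The same debug output can be switched on in any script using the gem, as sketched below.

```ruby
require 'logger'
require 'broken_link_finder' # Loads wgit as a dependency.

# Log every HTTP request made by the underlying crawler, as bin/console now does.
Wgit.logger.level = Logger::DEBUG
```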
data/bin/setup CHANGED
@@ -5,4 +5,4 @@ set -vx
  bundle install

- # Do any other automated setup that you need to do here
+ # Do any other automated setup that you need to do here...
@@ -15,7 +15,10 @@ Gem::Specification.new do |spec|
  spec.homepage = 'https://github.com/michaeltelford/broken-link-finder'
  spec.license = 'MIT'
  spec.metadata = {
- 'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder'
+ 'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder',
+ 'changelog_uri' => 'https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md',
+ 'bug_tracker_uri' => 'https://github.com/michaeltelford/broken-link-finder/issues',
+ 'documentation_uri' => 'https://www.rubydoc.info/gems/broken_link_finder'
  }

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
@@ -41,10 +44,10 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency 'byebug', '~> 11.0'
  spec.add_development_dependency 'maxitest', '~> 3.3'
  spec.add_development_dependency 'pry', '~> 0.12'
- spec.add_development_dependency 'rake', '~> 10.0'
+ spec.add_development_dependency 'rake', '~> 13.0'
  spec.add_development_dependency 'webmock', '~> 3.6'

- spec.add_runtime_dependency 'thor', '~> 0.20.3'
- spec.add_runtime_dependency 'thread', '~> 0.2.0'
- spec.add_runtime_dependency 'wgit', '~> 0.4.1'
+ spec.add_runtime_dependency 'thor', '~> 0.20'
+ spec.add_runtime_dependency 'thread', '~> 0.2'
+ spec.add_runtime_dependency 'wgit', '~> 0.9'
  end
@@ -9,12 +9,14 @@ class BrokenLinkFinderCLI < Thor
  desc 'crawl [URL]', 'Find broken links at the URL'
  option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
  option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+ option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
  option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
  option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
  option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
  def crawl(url)
  url = "http://#{url}" unless url.start_with?('http')

+ report_type = options[:html] ? :html : :text
  sort_by = options[:sort_by_link] ? :link : :page
  max_threads = options[:threads]
  broken_verbose = !options[:concise]
@@ -22,17 +24,24 @@ class BrokenLinkFinderCLI < Thor

  finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
  options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
- finder.pretty_print_link_report(
- broken_verbose: broken_verbose,
+ finder.report(
+ type: report_type,
+ broken_verbose: broken_verbose,
  ignored_verbose: ignored_verbose
  )
- rescue Exception => e
+
+ exit 0
+ rescue StandardError => e
  puts "An error has occurred: #{e.message}"
+
+ exit 1
  end

  desc 'version', 'Display the currently installed version'
  def version
  puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+ exit 0
  end
  end

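
With the new exit code handling, the executable can gate a script or CI step. Below is a hypothetical Ruby example using the documented codes (`0` for success, `1` for an error scenario); note that, per the code above, finding broken links still counts as a successful run.

```ruby
# Hypothetical: run the executable and fail loudly if it exited with code 1.
ok = system('broken_link_finder', 'crawl', 'http://txti.es', '--recursive')
abort 'broken_link_finder reported an error' unless ok
```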
@@ -2,8 +2,13 @@
  require 'wgit'
  require 'wgit/core_ext'
+ require 'thread/pool'
+ require 'set'

  require_relative './broken_link_finder/wgit_extensions'
  require_relative './broken_link_finder/version'
- require_relative './broken_link_finder/reporter'
+ require_relative './broken_link_finder/link_manager'
+ require_relative './broken_link_finder/reporter/reporter'
+ require_relative './broken_link_finder/reporter/text_reporter'
+ require_relative './broken_link_finder/reporter/html_reporter'
  require_relative './broken_link_finder/finder'
@@ -1,234 +1,227 @@
  # frozen_string_literal: true

- require_relative 'reporter'
- require 'thread/pool'
- require 'set'
-
  module BrokenLinkFinder
- DEFAULT_MAX_THREADS = 100
+ DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
+ SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.

  # Alias for BrokenLinkFinder::Finder.new.
  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
  Finder.new(sort: sort, max_threads: max_threads)
  end

+ # Class responsible for finding broken links on a page or site.
  class Finder
- attr_reader :sort, :broken_links, :ignored_links, :total_links_crawled, :max_threads
+ # The collection key - either :page or :link.
+ attr_reader :sort
+
+ # The max number of threads created during #crawl_site - one thread per page.
+ attr_reader :max_threads

- # Creates a new Finder instance.
- def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
+ # Returns a new Finder instance.
+ def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
  raise "Sort by either :page or :link, not #{sort}" \
  unless %i[page link].include?(sort)

  @sort = sort
  @max_threads = max_threads
- @lock = Mutex.new
  @crawler = Wgit::Crawler.new
+ @manager = BrokenLinkFinder::LinkManager.new(@sort)
+ end
+
+ # Returns the current broken links.
+ def broken_links
+ @manager.broken_links
+ end

- clear_links
+ # Returns the current ignored links.
+ def ignored_links
+ @manager.ignored_links
  end

- # Clear/empty the link collection Hashes.
- def clear_links
- @broken_links = {}
- @ignored_links = {}
- @total_links_crawled = 0
- @all_broken_links = Set.new
- @all_intact_links = Set.new
+ # Returns the current crawl stats.
+ def crawl_stats
+ @manager.crawl_stats
  end

- # Finds broken links within a single page and appends them to the
- # @broken_links array. Returns true if at least one broken link was found.
+ # Finds broken links within a single page and records them.
+ # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
  def crawl_url(url)
- clear_links
+ @manager.empty

- url = url.to_url
- doc = @crawler.crawl(url)
+ start = Time.now
+ url = url.to_url
+
+ # We dup the url to avoid recording any redirects.
+ doc = @crawler.crawl(url.dup)

  # Ensure the given page url is valid.
  raise "Invalid or broken URL: #{url}" unless doc

  # Get all page links and determine which are broken.
  find_broken_links(doc)
+ retry_broken_links

- sort_links
- set_total_links_crawled
+ @manager.sort
+ @manager.tally(url: url, pages_crawled: [url], start: start)

- @broken_links.any?
+ broken_links.any?
  end

- # Finds broken links within an entire site and appends them to the
- # @broken_links array. Returns a tuple containing a Boolean of true if
- # at least one broken link was found and an Array of all pages crawled.
+ # Finds broken links within an entire site and records them.
+ # Returns true if at least one broken link was found.
  # Access the broken links afterwards with Finder#broken_links.
- def crawl_site(url)
- clear_links
+ def crawl_site(url, allow_paths: nil, disallow_paths: nil)
+ @manager.empty

- url = url.to_url
- pool = Thread.pool(@max_threads)
- crawled_pages = []
+ start = Time.now
+ url = url.to_url
+ pool = Thread.pool(@max_threads)
+ crawled = Set.new

  # Crawl the site's HTML web pages looking for links.
- externals = @crawler.crawl_site(url) do |doc|
- crawled_pages << doc.url
+ # We dup the url to avoid recording any redirects.
+ paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+ externals = @crawler.crawl_site(url.dup, **paths) do |doc|
+ crawled << doc.url
  next unless doc

  # Start a thread for each page, checking for broken links.
  pool.process { find_broken_links(doc) }
  end

+ # Wait for all threads to finish, even if url was invalid.
+ pool.shutdown
+
  # Ensure the given website url is valid.
  raise "Invalid or broken URL: #{url}" unless externals

- # Wait for all threads to finish.
- pool.shutdown
+ retry_broken_links

- sort_links
- set_total_links_crawled
+ @manager.sort
+ @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)

- [@broken_links.any?, crawled_pages.uniq]
+ broken_links.any?
+ ensure
+ pool.shutdown if defined?(pool)
  end

- # Pretty prints the link report into a stream e.g. STDOUT or a file,
+ # Outputs the link report into a stream e.g. STDOUT or a file,
  # anything that respond_to? :puts. Defaults to STDOUT.
- # Returns true if there were broken links and vice versa.
- def pretty_print_link_report(
- stream = STDOUT,
- broken_verbose: true,
- ignored_verbose: false
- )
- reporter = BrokenLinkFinder::Reporter.new(
- stream, @sort, @broken_links, @ignored_links
- )
- reporter.pretty_print_link_report(
- broken_verbose: broken_verbose,
- ignored_verbose: ignored_verbose
- )
-
- @broken_links.any?
+ def report(stream = STDOUT, type: :text,
+ broken_verbose: true, ignored_verbose: false)
+ klass = case type
+ when :text
+ BrokenLinkFinder::TextReporter
+ when :html
+ BrokenLinkFinder::HTMLReporter
+ else
+ raise "The type: must be :text or :html, not: :#{type}"
+ end
+
+ reporter = klass.new(stream, @sort,
+ broken_links, ignored_links,
+ @manager.broken_link_map, crawl_stats)
+ reporter.call(broken_verbose: broken_verbose,
+ ignored_verbose: ignored_verbose)
  end

  private

  # Finds which links are unsupported or broken and records the details.
- def find_broken_links(doc)
- links = get_supported_links(doc)
+ def find_broken_links(page)
+ record_unparsable_links(page) # Record them as broken.
+
+ links = get_supported_links(page)

  # Iterate over the supported links checking if they're broken or not.
  links.each do |link|
- # Check if the link has already been processed previously.
- next if @all_intact_links.include?(link)
+ # Skip if the link has been encountered previously.
+ next if @manager.all_intact_links.include?(link)

- if @all_broken_links.include?(link)
- append_broken_link(doc.url, link)
+ if @manager.all_broken_links.include?(link)
+ # The link has already been proven broken so simply record it.
+ @manager.append_broken_link(page, link, map: false)
  next
  end

- # The link hasn't been processed before so we crawl it.
- link_doc = crawl_link(doc, link)
+ # The link hasn't been encountered before so we crawl it.
+ link_doc = crawl_link(page, link)

- # Determine if the crawled link is broken or not.
- if link_doc.nil? ||
- @crawler.last_response.code == 404 ||
- has_broken_anchor(link_doc)
- append_broken_link(doc.url, link)
+ # Determine if the crawled link is broken or not and record it.
+ if link_broken?(link_doc)
+ @manager.append_broken_link(page, link)
  else
- @lock.synchronize { @all_intact_links << link }
+ @manager.append_intact_link(link)
  end
  end

  nil
  end

- # Report and reject any non supported links. Any link that is absolute and
- # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
- def get_supported_links(doc)
- doc.all_links
- .reject do |link|
- if link.is_absolute? && !link.start_with?('http')
- append_ignored_link(doc.url, link)
- true
- end
- end
- end
+ # Implements a retry mechanism for each of the broken links found.
+ # Removes any broken links found to be working OK.
+ def retry_broken_links
+ sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.

- # Makes the link absolute and crawls it, returning its Wgit::Document.
- def crawl_link(doc, link)
- link = get_absolute_link(doc, link)
- @crawler.crawl(link)
- end
+ @manager.broken_link_map.select! do |link, href|
+ # Don't retry unparsable links (which are Strings).
+ next(true) unless href.is_a?(Wgit::Url)

- # Returns the link in absolute form so it can be crawled.
- def get_absolute_link(doc, link)
- link.is_relative? ? doc.base_url(link: link).concat(link) : link
- end
+ doc = @crawler.crawl(href.dup)

- # Returns true if the link is/contains a broken anchor.
- def has_broken_anchor(doc)
- raise 'link document is nil' unless doc
-
- anchor = doc.url.anchor
- return false if anchor.nil? || (anchor == '#')
-
- anchor = anchor[1..-1] if anchor.start_with?('#')
- doc.xpath("//*[@id='#{anchor}']").empty?
+ if link_broken?(doc)
+ true
+ else
+ @manager.remove_broken_link(link)
+ false
+ end
+ end
  end

- # Append key => [value] to @broken_links.
- def append_broken_link(url, link)
- key, value = get_key_value(url, link)
-
- @lock.synchronize do
- @broken_links[key] = [] unless @broken_links[key]
- @broken_links[key] << value
-
- @all_broken_links << link
+ # Record each unparsable link as a broken link.
+ def record_unparsable_links(doc)
+ doc.unparsable_links.each do |link|
+ # We map the link ourselves because link is a String, not a Wgit::Url.
+ @manager.append_broken_link(doc, link, map: false)
+ @manager.broken_link_map[link] = link
  end
  end

- # Append key => [value] to @ignored_links.
- def append_ignored_link(url, link)
- key, value = get_key_value(url, link)
-
- @lock.synchronize do
- @ignored_links[key] = [] unless @ignored_links[key]
- @ignored_links[key] << value
+ # Report and reject any non supported links. Any link that is absolute and
+ # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
+ def get_supported_links(doc)
+ doc.all_links.reject do |link|
+ if link.is_absolute? && !link.start_with?('http')
+ @manager.append_ignored_link(doc.url, link)
+ true
+ end
  end
  end

- # Returns the correct key value depending on the @sort type.
- # @sort == :page ? [url, link] : [link, url]
- def get_key_value(url, link)
- case @sort
- when :page
- [url, link]
- when :link
- [link, url]
- else
- raise "Unsupported sort type: #{sort}"
- end
+ # Make the link absolute and crawl it, returning its Wgit::Document.
+ def crawl_link(doc, link)
+ link = link.make_absolute(doc)
+ @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
  end

- # Sort keys and values alphabetically.
- def sort_links
- @broken_links.values.map(&:uniq!)
- @ignored_links.values.map(&:uniq!)
+ # Return if the crawled link is broken or not.
+ def link_broken?(doc)
+ doc.nil? || @crawler.last_response.not_found? || has_broken_anchor(doc)
+ end

- @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
- @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+ # Returns true if the link is/contains a broken anchor/fragment.
+ # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
+ def has_broken_anchor(doc)
+ raise 'The link document is nil' unless doc

- @broken_links.each { |_k, v| v.sort! }
- @ignored_links.each { |_k, v| v.sort! }
- end
+ fragment = doc.url.fragment
+ return false if fragment.nil? || fragment.empty?

- # Sets and returns the total number of links crawled.
- def set_total_links_crawled
- @total_links_crawled = @all_broken_links.size + @all_intact_links.size
+ doc.xpath("//*[@id='#{fragment}']").empty?
  end

- alias crawl_page crawl_url
- alias crawl_r crawl_site
- alias pretty_print_link_summary pretty_print_link_report
+ alias crawl_page crawl_url
+ alias crawl_r crawl_site
  end
  end
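
To close, here is a hedged end-to-end sketch of the refactored `Finder` API shown above; the URL, path filter and filename are illustrative only.

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder::Finder.new(sort: :link, max_threads: 30)

# allow_paths:/disallow_paths: are forwarded to Wgit::Crawler#crawl_site.
finder.crawl_site('http://txti.es', disallow_paths: ['about'])

# report accepts any stream responding to :puts, so a File works as well as STDOUT.
File.open('report.html', 'w') { |file| finder.report(file, type: :html) }

puts finder.broken_links.keys # Keyed by link because sort: :link was given.
puts finder.crawl_stats[:url]
```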