broken_link_finder 0.9.3 → 0.11.1
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +51 -0
- data/Gemfile.lock +44 -33
- data/README.md +28 -19
- data/benchmark.rb +9 -5
- data/bin/console +11 -19
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +8 -5
- data/exe/broken_link_finder +12 -3
- data/lib/broken_link_finder.rb +6 -1
- data/lib/broken_link_finder/finder.rb +134 -141
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
- data/lib/broken_link_finder/reporter/reporter.rb +76 -0
- data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +25 -5
- metadata +18 -13
- data/lib/broken_link_finder/reporter.rb +0 -116
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 42e88495f7e7742db433223408b4a380c1d48e98a5a43e6da5303d3e7b024454
+  data.tar.gz: eae7fc953f0d8aa1bb1f9d5b53183cd68a15a9f83ab341f51023744b2d148063
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4496db994bfba83deeb14a1b870f43e2cfd2afa94f30b6596ee610f23103b55ae0d84a6443a3204b02ed8875c0daf0d8e9c565aaebd21173d5c4353509dac3c8
+  data.tar.gz: 2d70ee94d7128e6e212bc385e1045fd465c121f58b9a0d036d392ae1cbb5cd9ef5ea47e29eda85b6f17a0b0f5547902ca818967b3ffb4ad87c7d0b271da5323a
data/.ruby-version
CHANGED
@@ -1 +1 @@
-2.
+2.7.0
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,57 @@
 - ...
 ---
 
+## v0.11.1
+### Added
+- ...
+### Changed/Removed
+- Updated the `wgit` gem to version 0.9.0, which contains improvements and bug fixes.
+### Fixed
+- ...
+---
+
+## v0.11.0
+### Added
+- Additional crawl statistics.
+- Exit code handling to executable. `0` for success, `1` for an error scenario.
+### Changed/Removed
+- Updated the report formats slightly, bringing various improvements such as the total number of links crawled etc.
+### Fixed
+- Bug in HTML report; the summary URL is now an `<a>` link.
+- Bug in `Finder@broken_link_map` URLs and `Finder#crawl_stats[:url]` URL during redirects.
+- Bug causing an error when crawling unparsable/invalid URLs.
+---
+
+## v0.10.0
+### Added
+- A `--html` flag to the `crawl` executable command which produces an HTML report (instead of text).
+- A 'retry' mechanism for any broken links found. This is essentially a verification step before generating a report.
+- `Finder#crawl_stats` for info such as crawl duration, total links crawled etc.
+### Changed/Removed
+- The API has changed somewhat. See the [docs](https://www.rubydoc.info/gems/broken_link_finder) for the up-to-date code signatures if you're using `broken_link_finder` outside of its executable.
+### Fixed
+- ...
+---
+
+## v0.9.5
+### Added
+- ...
+### Changed/Removed
+- Now using optimistic dep versioning.
+- Updated `wgit` to version 0.5.1, containing improvements and bug fixes.
+### Fixed
+- ...
+---
+
+## v0.9.4
+### Added
+- ...
+### Changed/Removed
+- Updated the `wgit` gem to version 0.5.0, which contains improvements and bug fixes.
+### Fixed
+- ...
+---
+
 ## v0.9.3
 ### Added
 - ...
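As a quick illustration of the `Finder#crawl_stats` addition noted under v0.10.0, here is a minimal sketch. Only `crawl_stats[:url]` is named in this changelog, so the rest of the Hash is inspected rather than assumed:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site 'http://txti.es'

# :url is the only key referenced in the notes above; dump the whole
# Hash to see the other recorded statistics (duration, totals etc.).
puts finder.crawl_stats[:url]
p    finder.crawl_stats
```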
data/Gemfile.lock
CHANGED
@@ -1,53 +1,64 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.
-      thor (~> 0.20
-      thread (~> 0.2
-      wgit (~> 0.
+    broken_link_finder (0.11.1)
+      thor (~> 0.20)
+      thread (~> 0.2)
+      wgit (~> 0.9)
 
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.
-      public_suffix (>= 2.0.2, <
-    bson (4.
-    byebug (11.
-
+    addressable (2.7.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    bson (4.10.0)
+    byebug (11.1.3)
+    cliver (0.3.2)
+    coderay (1.1.3)
+    concurrent-ruby (1.1.6)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
     ethon (0.12.0)
       ffi (>= 1.3.0)
-
-
-
-
-
+    ferrum (0.9)
+      addressable (~> 2.5)
+      cliver (~> 0.3)
+      concurrent-ruby (~> 1.1)
+      websocket-driver (>= 0.6, < 0.8)
+    ffi (1.13.1)
+    hashdiff (1.0.1)
+    maxitest (3.6.0)
+      minitest (>= 5.0.0, < 5.14.0)
+    method_source (1.0.0)
     mini_portile2 (2.4.0)
-    minitest (5.
-    mongo (2.
-      bson (>= 4.
-    nokogiri (1.10.
+    minitest (5.13.0)
+    mongo (2.13.0)
+      bson (>= 4.8.2, < 5.0.0)
+    nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
-    pry (0.
-      coderay (~> 1.1
-      method_source (~>
-    public_suffix (
-    rake (
+    pry (0.13.1)
+      coderay (~> 1.1)
+      method_source (~> 1.0)
+    public_suffix (4.0.5)
+    rake (13.0.1)
     safe_yaml (1.0.5)
     thor (0.20.3)
     thread (0.2.2)
-    typhoeus (1.
+    typhoeus (1.4.0)
       ethon (>= 0.9.0)
-    webmock (3.
+    webmock (3.8.3)
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff (>= 0.4.0, < 2.0.0)
-
-
-
-
-
+    websocket-driver (0.7.3)
+      websocket-extensions (>= 0.1.0)
+    websocket-extensions (0.1.5)
+    wgit (0.9.0)
+      addressable (~> 2.6)
+      ferrum (~> 0.8)
+      mongo (~> 2.9)
+      nokogiri (~> 1.10)
+      typhoeus (~> 1.3)
 
 PLATFORMS
   ruby
@@ -58,11 +69,11 @@ DEPENDENCIES
   byebug (~> 11.0)
   maxitest (~> 3.3)
   pry (~> 0.12)
-  rake (~>
+  rake (~> 13.0)
   webmock (~> 3.6)
 
 RUBY VERSION
-   ruby 2.
+   ruby 2.7.0p0
 
 BUNDLED WITH
-   2.
+   2.1.4
data/README.md
CHANGED
@@ -1,8 +1,10 @@
 # Broken Link Finder
 
-Does what it says on the tin
+Does what it says on the tin - finds a website's broken links.
 
-Simply point it at a website and it will crawl all of its webpages searching for and identifing
+Simply point it at a website and it will crawl all of its webpages, searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
+
+Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!
 
 ## How It Works
 
@@ -10,7 +12,7 @@ Any HTML page element with a `href` or `src` attribute is considered a link. For
 
 - An empty HTML response body is returned.
 - A response status code of `404 Not Found` is returned.
-- The HTML response body doesn't contain an element ID matching that of the link's
+- The HTML response body doesn't contain an element ID matching that of the link's fragment e.g. `http://server.com#about` must contain an element with `id="about"`, or the link is considered broken.
 - The link redirects more than 5 times consecutively.
 
 **Note**: Not all link types are supported.
@@ -55,7 +57,7 @@ Installing this gem installs the `broken_link_finder` executable into your `$PAT
 
     $ broken_link_finder crawl http://txti.es
 
-Adding the
+Adding the `--recursive` flag would crawl the entire `txti.es` site, not just its index page.
 
 See the [output](#Output) section below for an example of a site with broken links.
 
@@ -73,9 +75,9 @@ Below is a simple script which crawls a website and outputs its broken links to
 require 'broken_link_finder'
 
 finder = BrokenLinkFinder.new
-finder.crawl_site 'http://txti.es'
-finder.
-
+finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
+finder.report # Or use Finder#broken_links and Finder#ignored_links
+              # for direct access to the link Hashes.
 ```
 
 Then execute the script with:
@@ -89,28 +91,33 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
 If broken links are found then the output will look something like:
 
 ```text
-
+Crawled http://txti.es
+7 page(s) containing 32 unique link(s) in 6.82 seconds
+
+Found 6 unique broken link(s) across 2 page(s):
 
 The following broken links were found on 'http://txti.es/about':
 http://twitter.com/thebarrytone
+/doesntexist
 http://twitter.com/nwbld
-
-https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU
+twitter.com/txties
 
 The following broken links were found on 'http://txti.es/how':
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
 
-Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
 
-The following links were ignored on http://txti.es:
+The following links were ignored on 'http://txti.es':
 tel:+13174562564
 mailto:big.jim@jmail.com
 
-The following links were ignored on http://txti.es/contact:
+The following links were ignored on 'http://txti.es/contact':
 ftp://server.com
 ```
 
+You can provide the `--html` flag if you'd prefer an HTML-based report.
+
 ## Contributing
 
 Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
 
@@ -126,9 +133,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
 
 To install this gem onto your local machine, run `bundle exec rake install`.
 
 To release a new gem version:
-- Update the
-
-- Run `bundle
-- Run `bundle exec rake
-- Run `bundle exec rake
-- Run `bundle exec rake
+- Update the deps in the `*.gemspec`, if necessary.
+- Update the version number in `version.rb` and add the new version to the `CHANGELOG`.
+- Run `bundle install`.
+- Run `bundle exec rake test`, ensuring all tests pass.
+- Run `bundle exec rake compile`, ensuring no warnings.
+- Run `bundle exec rake install && rbenv rehash`.
+- Manually test the executable.
+- Run `bundle exec rake release[origin]`.
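One README point worth spelling out: `Finder#report` writes to any stream that responds to `#puts` (see the finder.rb diff below), so the HTML report can go straight to a file rather than STDOUT. A minimal sketch; the filename is illustrative:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site 'http://txti.es'

# A File object responds to #puts, so it can serve as the report stream.
File.open('report.html', 'w') do |file|
  finder.report(file, type: :html)
end
```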
data/benchmark.rb
CHANGED
@@ -10,15 +10,19 @@ finder = BrokenLinkFinder::Finder.new
 puts Benchmark.measure { finder.crawl_site url }
 puts "Links crawled: #{finder.total_links_crawled}"
 
-# http://txti.es page crawl
-# Pre
-# Post
+# http://txti.es page crawl with threading
+# Pre: 17.5 seconds
+# Post: 7.5 seconds
 
-# http://txti.es
+# http://txti.es with threading - page vs site crawl
 # Page: 9.526981
 # Site: 9.732416
 # Multi-threading crawl_site now yields the same time as a single page
 
-# Large site crawl -
+# Large site crawl - all link recording functionality
 # Pre: 608 seconds with 7665 links crawled
 # Post: 355 seconds with 1099 links crawled
+
+# Large site crawl - retry mechanism
+# Pre: 140 seconds
+# Post: 170 seconds
data/bin/console
CHANGED
@@ -5,20 +5,10 @@ require 'bundler/setup'
 require 'pry'
 require 'byebug'
 require 'broken_link_finder'
+require 'logger'
 
-#
-
-  singleton_class.class_eval do
-    alias_method :orig_get, :get
-  end
-
-  def self.get(base_url, options = {})
-    puts "[typhoeus] Sending GET: #{base_url}"
-    resp = orig_get(base_url, options)
-    puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
-    resp
-  end
-end
+# Logs all HTTP requests.
+Wgit.logger.level = Logger::DEBUG
 
 # Call reload to load all recent code changes.
 def reload
@@ -33,12 +23,14 @@ end
 # You can add fixtures and/or initialization code here...
 reload
 
-url
-by_page
-by_link
-finder
+def url; @url ||= 'http://txti.es/'; end
+def by_page; @by_page ||= Finder.new; end
+def by_link; @by_link ||= Finder.new(sort: :link); end
+def finder; @finder ||= by_page; end
 
 # Start the console.
-puts
+puts
+puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+puts
 
-
+Pry.start
data/bin/setup
CHANGED
data/broken_link_finder.gemspec
CHANGED
@@ -15,7 +15,10 @@ Gem::Specification.new do |spec|
   spec.homepage = 'https://github.com/michaeltelford/broken-link-finder'
   spec.license = 'MIT'
   spec.metadata = {
-    'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder'
+    'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder',
+    'changelog_uri' => 'https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md',
+    'bug_tracker_uri' => 'https://github.com/michaeltelford/broken-link-finder/issues',
+    'documentation_uri' => 'https://www.rubydoc.info/gems/broken_link_finder'
   }
 
   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
@@ -41,10 +44,10 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'byebug', '~> 11.0'
   spec.add_development_dependency 'maxitest', '~> 3.3'
   spec.add_development_dependency 'pry', '~> 0.12'
-  spec.add_development_dependency 'rake', '~>
+  spec.add_development_dependency 'rake', '~> 13.0'
   spec.add_development_dependency 'webmock', '~> 3.6'
 
-  spec.add_runtime_dependency 'thor', '~> 0.20
-  spec.add_runtime_dependency 'thread', '~> 0.2
-  spec.add_runtime_dependency 'wgit', '~> 0.
+  spec.add_runtime_dependency 'thor', '~> 0.20'
+  spec.add_runtime_dependency 'thread', '~> 0.2'
+  spec.add_runtime_dependency 'wgit', '~> 0.9'
 end
data/exe/broken_link_finder
CHANGED
@@ -9,12 +9,14 @@ class BrokenLinkFinderCLI < Thor
   desc 'crawl [URL]', 'Find broken links at the URL'
   option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
   option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+  option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
   option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
   option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
   option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
   def crawl(url)
     url = "http://#{url}" unless url.start_with?('http')
 
+    report_type = options[:html] ? :html : :text
     sort_by = options[:sort_by_link] ? :link : :page
     max_threads = options[:threads]
     broken_verbose = !options[:concise]
@@ -22,17 +24,24 @@ class BrokenLinkFinderCLI < Thor
 
     finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
-    finder.
-
+    finder.report(
+      type: report_type,
+      broken_verbose: broken_verbose,
       ignored_verbose: ignored_verbose
     )
-
+
+    exit 0
+  rescue StandardError => e
     puts "An error has occurred: #{e.message}"
+
+    exit 1
   end
 
   desc 'version', 'Display the currently installed version'
   def version
     puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+    exit 0
   end
 end
 
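Because the executable now exits with `0` on success and `1` on error (the `exit` calls added above), it scripts cleanly. A hypothetical Ruby wrapper, shown only to illustrate the exit codes:

```ruby
# Kernel#system returns true only when the command exits with status 0.
ok = system('broken_link_finder crawl --recursive http://txti.es')

if ok
  puts 'Crawl completed successfully.'
else
  # Exit status 1 signals an error scenario (not merely broken links found).
  warn "Crawl failed (exit status #{$?&.exitstatus})."
end
```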
data/lib/broken_link_finder.rb
CHANGED
@@ -2,8 +2,13 @@
 
 require 'wgit'
 require 'wgit/core_ext'
+require 'thread/pool'
+require 'set'
 
 require_relative './broken_link_finder/wgit_extensions'
 require_relative './broken_link_finder/version'
-require_relative './broken_link_finder/
+require_relative './broken_link_finder/link_manager'
+require_relative './broken_link_finder/reporter/reporter'
+require_relative './broken_link_finder/reporter/text_reporter'
+require_relative './broken_link_finder/reporter/html_reporter'
 require_relative './broken_link_finder/finder'
data/lib/broken_link_finder/finder.rb
CHANGED
@@ -1,234 +1,227 @@
 # frozen_string_literal: true
 
-require_relative 'reporter'
-require 'thread/pool'
-require 'set'
-
 module BrokenLinkFinder
-  DEFAULT_MAX_THREADS = 100
+  DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
+  SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.
 
   # Alias for BrokenLinkFinder::Finder.new.
   def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
     Finder.new(sort: sort, max_threads: max_threads)
   end
 
+  # Class responsible for finding broken links on a page or site.
   class Finder
-
+    # The collection key - either :page or :link.
+    attr_reader :sort
+
+    # The max number of threads created during #crawl_site - one thread per page.
+    attr_reader :max_threads
 
-    #
-    def initialize(sort: :page, max_threads:
+    # Returns a new Finder instance.
+    def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
      raise "Sort by either :page or :link, not #{sort}" \
      unless %i[page link].include?(sort)
 
      @sort = sort
      @max_threads = max_threads
-      @lock = Mutex.new
      @crawler = Wgit::Crawler.new
+      @manager = BrokenLinkFinder::LinkManager.new(@sort)
+    end
+
+    # Returns the current broken links.
+    def broken_links
+      @manager.broken_links
+    end
 
-
+    # Returns the current ignored links.
+    def ignored_links
+      @manager.ignored_links
    end
 
-    #
-    def
-      @
-      @ignored_links = {}
-      @total_links_crawled = 0
-      @all_broken_links = Set.new
-      @all_intact_links = Set.new
+    # Returns the current crawl stats.
+    def crawl_stats
+      @manager.crawl_stats
    end
 
-    # Finds broken links within a single page and
-    #
+    # Finds broken links within a single page and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
    def crawl_url(url)
-
+      @manager.empty
 
-
-
+      start = Time.now
+      url = url.to_url
+
+      # We dup the url to avoid recording any redirects.
+      doc = @crawler.crawl(url.dup)
 
      # Ensure the given page url is valid.
      raise "Invalid or broken URL: #{url}" unless doc
 
      # Get all page links and determine which are broken.
      find_broken_links(doc)
+      retry_broken_links
 
-
-
+      @manager.sort
+      @manager.tally(url: url, pages_crawled: [url], start: start)
 
-
+      broken_links.any?
    end
 
-    # Finds broken links within an entire site and
-    #
-    # at least one broken link was found and an Array of all pages crawled.
+    # Finds broken links within an entire site and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
-    def crawl_site(url)
-
+    def crawl_site(url, allow_paths: nil, disallow_paths: nil)
+      @manager.empty
 
-
-
-
+      start = Time.now
+      url = url.to_url
+      pool = Thread.pool(@max_threads)
+      crawled = Set.new
 
      # Crawl the site's HTML web pages looking for links.
-
-
+      # We dup the url to avoid recording any redirects.
+      paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      externals = @crawler.crawl_site(url.dup, **paths) do |doc|
+        crawled << doc.url
        next unless doc
 
        # Start a thread for each page, checking for broken links.
        pool.process { find_broken_links(doc) }
      end
 
+      # Wait for all threads to finish, even if url was invalid.
+      pool.shutdown
+
      # Ensure the given website url is valid.
      raise "Invalid or broken URL: #{url}" unless externals
 
-
-      pool.shutdown
+      retry_broken_links
 
-
-
+      @manager.sort
+      @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)
 
-
+      broken_links.any?
+    ensure
+      pool.shutdown if defined?(pool)
    end
 
-    #
+    # Outputs the link report into a stream e.g. STDOUT or a file,
     # anything that respond_to? :puts. Defaults to STDOUT.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def report(stream = STDOUT, type: :text,
+               broken_verbose: true, ignored_verbose: false)
+      klass = case type
+              when :text
+                BrokenLinkFinder::TextReporter
+              when :html
+                BrokenLinkFinder::HTMLReporter
+              else
+                raise "The type: must be :text or :html, not: :#{type}"
+              end
+
+      reporter = klass.new(stream, @sort,
+                           broken_links, ignored_links,
+                           @manager.broken_link_map, crawl_stats)
+      reporter.call(broken_verbose: broken_verbose,
+                    ignored_verbose: ignored_verbose)
    end
 
    private
 
    # Finds which links are unsupported or broken and records the details.
-    def find_broken_links(
-
+    def find_broken_links(page)
+      record_unparsable_links(page) # Record them as broken.
+
+      links = get_supported_links(page)
 
      # Iterate over the supported links checking if they're broken or not.
      links.each do |link|
-        #
-        next if @all_intact_links.include?(link)
+        # Skip if the link has been encountered previously.
+        next if @manager.all_intact_links.include?(link)
 
-        if @all_broken_links.include?(link)
-
+        if @manager.all_broken_links.include?(link)
+          # The link has already been proven broken so simply record it.
+          @manager.append_broken_link(page, link, map: false)
          next
        end
 
-        # The link hasn't been
-        link_doc = crawl_link(
+        # The link hasn't been encountered before so we crawl it.
+        link_doc = crawl_link(page, link)
 
-        # Determine if the crawled link is broken or not.
-        if
-
-          has_broken_anchor(link_doc)
-          append_broken_link(doc.url, link)
+        # Determine if the crawled link is broken or not and record it.
+        if link_broken?(link_doc)
+          @manager.append_broken_link(page, link)
        else
-          @
+          @manager.append_intact_link(link)
        end
      end
 
      nil
    end
 
-    #
-    #
-    def
-
-      .reject do |link|
-        if link.is_absolute? && !link.start_with?('http')
-          append_ignored_link(doc.url, link)
-          true
-        end
-      end
-    end
+    # Implements a retry mechanism for each of the broken links found.
+    # Removes any broken links found to be working OK.
+    def retry_broken_links
+      sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
 
-
-
-
-      @crawler.crawl(link)
-    end
+      @manager.broken_link_map.select! do |link, href|
+        # Don't retry unparsable links (which are Strings).
+        next(true) unless href.is_a?(Wgit::Url)
 
-
-    def get_absolute_link(doc, link)
-      link.is_relative? ? doc.base_url(link: link).concat(link) : link
-    end
+        doc = @crawler.crawl(href.dup)
 
-
-
-
-
-
-
-
-      anchor = anchor[1..-1] if anchor.start_with?('#')
-      doc.xpath("//*[@id='#{anchor}']").empty?
+        if link_broken?(doc)
+          true
+        else
+          @manager.remove_broken_link(link)
+          false
+        end
+      end
    end
 
-    #
-    def
-
-
-
-      @
-      @broken_links[key] << value
-
-      @all_broken_links << link
+    # Record each unparsable link as a broken link.
+    def record_unparsable_links(doc)
+      doc.unparsable_links.each do |link|
+        # We map the link ourselves because link is a String, not a Wgit::Url.
+        @manager.append_broken_link(doc, link, map: false)
+        @manager.broken_link_map[link] = link
      end
    end
 
-    #
-
-
-
-
-
-
+    # Report and reject any non supported links. Any link that is absolute and
+    # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
+    def get_supported_links(doc)
+      doc.all_links.reject do |link|
+        if link.is_absolute? && !link.start_with?('http')
+          @manager.append_ignored_link(doc.url, link)
+          true
+        end
      end
    end
 
-    #
-
-
-
-      when :page
-        [url, link]
-      when :link
-        [link, url]
-      else
-        raise "Unsupported sort type: #{sort}"
-      end
+    # Make the link absolute and crawl it, returning its Wgit::Document.
+    def crawl_link(doc, link)
+      link = link.make_absolute(doc)
+      @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
    end
 
-    #
-    def
-      @
-
+    # Return if the crawled link is broken or not.
+    def link_broken?(doc)
+      doc.nil? || @crawler.last_response.not_found? || has_broken_anchor(doc)
+    end
 
-
-
+    # Returns true if the link is/contains a broken anchor/fragment.
+    # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
+    def has_broken_anchor(doc)
+      raise 'The link document is nil' unless doc
 
-
-
-    end
+      fragment = doc.url.fragment
+      return false if fragment.nil? || fragment.empty?
 
-
-    def set_total_links_crawled
-      @total_links_crawled = @all_broken_links.size + @all_intact_links.size
+      doc.xpath("//*[@id='#{fragment}']").empty?
    end
 
-    alias crawl_page
-    alias crawl_r
-    alias pretty_print_link_summary pretty_print_link_report
+    alias crawl_page crawl_url
+    alias crawl_r    crawl_site
  end
end
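Pulling the finder.rb changes together, here is a minimal sketch of the reworked public API as it appears in this diff. The path value passed to the new `disallow_paths:` keyword (forwarded to `Wgit::Crawler#crawl_site`) is a made-up example:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder::Finder.new(sort: :link, max_threads: 50)

# crawl_site now accepts allow_paths:/disallow_paths: filters and
# returns true if at least one broken link was found.
found = finder.crawl_site('http://txti.es', disallow_paths: 'admin')

# Reports are now pluggable: :text (the default) or :html.
finder.report(STDOUT, type: :text, broken_verbose: true)

puts "Broken links found? #{found}"
```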