broken_link_finder 0.9.3 → 0.11.1
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +51 -0
- data/Gemfile.lock +44 -33
- data/README.md +28 -19
- data/benchmark.rb +9 -5
- data/bin/console +11 -19
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +8 -5
- data/exe/broken_link_finder +12 -3
- data/lib/broken_link_finder.rb +6 -1
- data/lib/broken_link_finder/finder.rb +134 -141
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +137 -0
- data/lib/broken_link_finder/reporter/reporter.rb +76 -0
- data/lib/broken_link_finder/reporter/text_reporter.rb +88 -0
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +25 -5
- metadata +18 -13
- data/lib/broken_link_finder/reporter.rb +0 -116
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 42e88495f7e7742db433223408b4a380c1d48e98a5a43e6da5303d3e7b024454
+  data.tar.gz: eae7fc953f0d8aa1bb1f9d5b53183cd68a15a9f83ab341f51023744b2d148063
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4496db994bfba83deeb14a1b870f43e2cfd2afa94f30b6596ee610f23103b55ae0d84a6443a3204b02ed8875c0daf0d8e9c565aaebd21173d5c4353509dac3c8
+  data.tar.gz: 2d70ee94d7128e6e212bc385e1045fd465c121f58b9a0d036d392ae1cbb5cd9ef5ea47e29eda85b6f17a0b0f5547902ca818967b3ffb4ad87c7d0b271da5323a
data/.ruby-version
CHANGED
@@ -1 +1 @@
-2.
+2.7.0
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,57 @@
 - ...
 ---
 
+## v0.11.1
+### Added
+- ...
+### Changed/Removed
+- Updated the wgit gem to version 0.9.0, which contains improvements and bug fixes.
+### Fixed
+- ...
+---
+
+## v0.11.0
+### Added
+- Additional crawl statistics.
+- Exit code handling to the executable: `0` for success, `1` for an error scenario.
+### Changed/Removed
+- Updated the report formats slightly, bringing various improvements such as the total number of links crawled etc.
+### Fixed
+- Bug in the HTML report: the summary URL is now an `<a>` link.
+- Bug in `Finder@broken_link_map` URLs and the `Finder#crawl_stats[:url]` URL during redirects.
+- Bug causing an error when crawling unparsable/invalid URLs.
+---
+
+## v0.10.0
+### Added
+- A `--html` flag for the `crawl` executable command, which produces an HTML report (instead of text).
+- A 'retry' mechanism for any broken links found. This is essentially a verification step before generating a report.
+- `Finder#crawl_stats` for info such as crawl duration, total links crawled etc.
+### Changed/Removed
+- The API has changed somewhat. See the [docs](https://www.rubydoc.info/gems/broken_link_finder) for the up-to-date code signatures if you're using `broken_link_finder` outside of its executable.
+### Fixed
+- ...
+---
+
+## v0.9.5
+### Added
+- ...
+### Changed/Removed
+- Now using optimistic dependency versioning.
+- Updated `wgit` to version 0.5.1, containing improvements and bug fixes.
+### Fixed
+- ...
+---
+
+## v0.9.4
+### Added
+- ...
+### Changed/Removed
+- Updated the `wgit` gem to version 0.5.0, which contains improvements and bug fixes.
+### Fixed
+- ...
+---
+
 ## v0.9.3
 ### Added
 - ...
data/Gemfile.lock
CHANGED
@@ -1,53 +1,64 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.
-      thor (~> 0.20
-      thread (~> 0.2
-      wgit (~> 0.
+    broken_link_finder (0.11.1)
+      thor (~> 0.20)
+      thread (~> 0.2)
+      wgit (~> 0.9)
 
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.
-      public_suffix (>= 2.0.2, <
-    bson (4.
-    byebug (11.
-
+    addressable (2.7.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    bson (4.10.0)
+    byebug (11.1.3)
+    cliver (0.3.2)
+    coderay (1.1.3)
+    concurrent-ruby (1.1.6)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
     ethon (0.12.0)
       ffi (>= 1.3.0)
-
-
-
-
-
+    ferrum (0.9)
+      addressable (~> 2.5)
+      cliver (~> 0.3)
+      concurrent-ruby (~> 1.1)
+      websocket-driver (>= 0.6, < 0.8)
+    ffi (1.13.1)
+    hashdiff (1.0.1)
+    maxitest (3.6.0)
+      minitest (>= 5.0.0, < 5.14.0)
+    method_source (1.0.0)
     mini_portile2 (2.4.0)
-    minitest (5.
-    mongo (2.
-      bson (>= 4.
-    nokogiri (1.10.
+    minitest (5.13.0)
+    mongo (2.13.0)
+      bson (>= 4.8.2, < 5.0.0)
+    nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
-    pry (0.
-      coderay (~> 1.1
-      method_source (~>
-    public_suffix (
-    rake (
+    pry (0.13.1)
+      coderay (~> 1.1)
+      method_source (~> 1.0)
+    public_suffix (4.0.5)
+    rake (13.0.1)
     safe_yaml (1.0.5)
     thor (0.20.3)
     thread (0.2.2)
-    typhoeus (1.
+    typhoeus (1.4.0)
       ethon (>= 0.9.0)
-    webmock (3.
+    webmock (3.8.3)
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff (>= 0.4.0, < 2.0.0)
-
-
-
-
-
+    websocket-driver (0.7.3)
+      websocket-extensions (>= 0.1.0)
+    websocket-extensions (0.1.5)
+    wgit (0.9.0)
+      addressable (~> 2.6)
+      ferrum (~> 0.8)
+      mongo (~> 2.9)
+      nokogiri (~> 1.10)
+      typhoeus (~> 1.3)
 
 PLATFORMS
   ruby
 
@@ -58,11 +69,11 @@ DEPENDENCIES
   byebug (~> 11.0)
   maxitest (~> 3.3)
   pry (~> 0.12)
-  rake (~>
+  rake (~> 13.0)
   webmock (~> 3.6)
 
 RUBY VERSION
-   ruby 2.
+   ruby 2.7.0p0
 
 BUNDLED WITH
-   2.
+   2.1.4
data/README.md
CHANGED
@@ -1,8 +1,10 @@
 # Broken Link Finder
 
-Does what it says on the tin
+Does what it says on the tin - finds a website's broken links.
 
-Simply point it at a website and it will crawl all of its webpages searching for and identifing
+Simply point it at a website and it will crawl all of its webpages, searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
+
+Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!
 
 ## How It Works
 
@@ -10,7 +12,7 @@ Any HTML page element with a `href` or `src` attribute is considered a link. For
 
 - An empty HTML response body is returned.
 - A response status code of `404 Not Found` is returned.
-- The HTML response body doesn't contain an element ID matching that of the link's
+- The HTML response body doesn't contain an element ID matching that of the link's fragment, e.g. `http://server.com#about` must contain an element with `id="about"`, or the link is considered broken.
 - The link redirects more than 5 times consecutively.
 
 **Note**: Not all link types are supported.
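The fragment rule above mirrors what `Finder#has_broken_anchor` does internally (see the `finder.rb` diff further down). Below is a minimal sketch of the same check using `wgit` directly; the URL is illustrative:

```ruby
require 'wgit'

# An illustrative link whose fragment must resolve to an element ID.
url = Wgit::Url.new('http://server.com/about#top')
doc = Wgit::Crawler.new.crawl(url)

# Broken unless the crawl succeeded and an element with id="top" exists.
broken = doc.nil? || doc.xpath("//*[@id='#{url.fragment}']").empty?
puts broken ? 'broken anchor' : 'anchor OK'
```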
@@ -55,7 +57,7 @@ Installing this gem installs the `broken_link_finder` executable into your `$PAT
 
     $ broken_link_finder crawl http://txti.es
 
-Adding the
+Adding the `--recursive` flag would crawl the entire `txti.es` site, not just its index page.
 
 See the [output](#Output) section below for an example of a site with broken links.
 
@@ -73,9 +75,9 @@ Below is a simple script which crawls a website and outputs its broken links to
 require 'broken_link_finder'
 
 finder = BrokenLinkFinder.new
-finder.crawl_site 'http://txti.es'
-finder.
-
+finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
+finder.report # Or use Finder#broken_links and Finder#ignored_links
+              # for direct access to the link Hashes.
 ```
 
 Then execute the script with:
@@ -89,28 +91,33 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
 If broken links are found then the output will look something like:
 
 ```text
-
+Crawled http://txti.es
+7 page(s) containing 32 unique link(s) in 6.82 seconds
+
+Found 6 unique broken link(s) across 2 page(s):
 
 The following broken links were found on 'http://txti.es/about':
 http://twitter.com/thebarrytone
+/doesntexist
 http://twitter.com/nwbld
-
-https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU
+twitter.com/txties
 
 The following broken links were found on 'http://txti.es/how':
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
 
-Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
 
-The following links were ignored on http://txti.es:
+The following links were ignored on 'http://txti.es':
 tel:+13174562564
 mailto:big.jim@jmail.com
 
-The following links were ignored on http://txti.es/contact:
+The following links were ignored on 'http://txti.es/contact':
 ftp://server.com
 ```
 
+You can provide the `--html` flag if you'd prefer an HTML-based report.
+
 ## Contributing
 
 Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
@@ -126,9 +133,11 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
 
 To install this gem onto your local machine, run `bundle exec rake install`.
 
 To release a new gem version:
-- Update the
-- 
-- Run `bundle
-- Run `bundle exec rake
-- Run `bundle exec rake
-- Run `bundle exec rake
+- Update the deps in the `*.gemspec`, if necessary.
+- Update the version number in `version.rb` and add the new version to the `CHANGELOG`.
+- Run `bundle install`.
+- Run `bundle exec rake test`, ensuring all tests pass.
+- Run `bundle exec rake compile`, ensuring no warnings.
+- Run `bundle exec rake install && rbenv rehash`.
+- Manually test the executable.
+- Run `bundle exec rake release[origin]`.
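As the updated README notes, the new `--html` flag swaps the text report for an HTML one; combined with shell redirection this yields a standalone report file (the filename is illustrative):

    $ broken_link_finder crawl --recursive --html http://txti.es > report.html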
data/benchmark.rb
CHANGED
@@ -10,15 +10,19 @@ finder = BrokenLinkFinder::Finder.new
 
 puts Benchmark.measure { finder.crawl_site url }
 puts "Links crawled: #{finder.total_links_crawled}"
 
-# http://txti.es page crawl
-# Pre
-# Post
+# http://txti.es page crawl with threading
+# Pre: 17.5 seconds
+# Post: 7.5 seconds
 
-# http://txti.es
+# http://txti.es with threading - page vs site crawl
 # Page: 9.526981
 # Site: 9.732416
 # Multi-threading crawl_site now yields the same time as a single page
 
-# Large site crawl -
+# Large site crawl - all link recording functionality
 # Pre: 608 seconds with 7665 links crawled
 # Post: 355 seconds with 1099 links crawled
+
+# Large site crawl - retry mechanism
+# Pre: 140 seconds
+# Post: 170 seconds
data/bin/console
CHANGED
@@ -5,20 +5,10 @@ require 'bundler/setup'
 require 'pry'
 require 'byebug'
 require 'broken_link_finder'
+require 'logger'
 
-#
-
-  singleton_class.class_eval do
-    alias_method :orig_get, :get
-  end
-
-  def self.get(base_url, options = {})
-    puts "[typhoeus] Sending GET: #{base_url}"
-    resp = orig_get(base_url, options)
-    puts "[typhoeus] Status: #{resp.code} (#{resp.body.length} bytes in #{resp.total_time} seconds)"
-    resp
-  end
-end
+# Logs all HTTP requests.
+Wgit.logger.level = Logger::DEBUG
 
 # Call reload to load all recent code changes.
 def reload
 
@@ -33,12 +23,14 @@ end
 # You can add fixtures and/or initialization code here...
 reload
 
-url
-by_page
-by_link
-finder
+def url;     @url     ||= 'http://txti.es/'; end
+def by_page; @by_page ||= Finder.new; end
+def by_link; @by_link ||= Finder.new(sort: :link); end
+def finder;  @finder  ||= by_page; end
 
 # Start the console.
-puts
+puts
+puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+puts
 
-
+Pry.start
data/bin/setup
CHANGED
data/broken_link_finder.gemspec
CHANGED
@@ -15,7 +15,10 @@ Gem::Specification.new do |spec|
   spec.homepage = 'https://github.com/michaeltelford/broken-link-finder'
   spec.license = 'MIT'
   spec.metadata = {
-    'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder'
+    'source_code_uri' => 'https://github.com/michaeltelford/broken-link-finder',
+    'changelog_uri' => 'https://github.com/michaeltelford/broken-link-finder/blob/master/CHANGELOG.md',
+    'bug_tracker_uri' => 'https://github.com/michaeltelford/broken-link-finder/issues',
+    'documentation_uri' => 'https://www.rubydoc.info/gems/broken_link_finder'
   }
 
   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
 
@@ -41,10 +44,10 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'byebug', '~> 11.0'
   spec.add_development_dependency 'maxitest', '~> 3.3'
   spec.add_development_dependency 'pry', '~> 0.12'
-  spec.add_development_dependency 'rake', '~>
+  spec.add_development_dependency 'rake', '~> 13.0'
   spec.add_development_dependency 'webmock', '~> 3.6'
 
-  spec.add_runtime_dependency 'thor', '~> 0.20
-  spec.add_runtime_dependency 'thread', '~> 0.2
-  spec.add_runtime_dependency 'wgit', '~> 0.
+  spec.add_runtime_dependency 'thor', '~> 0.20'
+  spec.add_runtime_dependency 'thread', '~> 0.2'
+  spec.add_runtime_dependency 'wgit', '~> 0.9'
 end
data/exe/broken_link_finder
CHANGED
@@ -9,12 +9,14 @@ class BrokenLinkFinderCLI < Thor
   desc 'crawl [URL]', 'Find broken links at the URL'
   option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
   option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+  option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
   option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
   option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
   option :concise, type: :boolean, aliases: [:c], default: false, desc: 'Display only a summary of broken links.'
   def crawl(url)
     url = "http://#{url}" unless url.start_with?('http')
 
+    report_type = options[:html] ? :html : :text
     sort_by = options[:sort_by_link] ? :link : :page
     max_threads = options[:threads]
     broken_verbose = !options[:concise]
 
@@ -22,17 +24,24 @@ class BrokenLinkFinderCLI < Thor
 
     finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
-    finder.
-
+    finder.report(
+      type: report_type,
+      broken_verbose: broken_verbose,
       ignored_verbose: ignored_verbose
     )
-
+
+    exit 0
+  rescue StandardError => e
     puts "An error has occurred: #{e.message}"
+
+    exit 1
   end
 
   desc 'version', 'Display the currently installed version'
   def version
     puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+    exit 0
   end
 end
 
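The new `exit` handling makes the executable script-friendly; a sketch of checking the status from a shell:

    $ broken_link_finder crawl http://txti.es
    $ echo $?    # => 0 on success, 1 if an error occurred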
data/lib/broken_link_finder.rb
CHANGED
@@ -2,8 +2,13 @@
 
 require 'wgit'
 require 'wgit/core_ext'
+require 'thread/pool'
+require 'set'
 
 require_relative './broken_link_finder/wgit_extensions'
 require_relative './broken_link_finder/version'
-require_relative './broken_link_finder/
+require_relative './broken_link_finder/link_manager'
+require_relative './broken_link_finder/reporter/reporter'
+require_relative './broken_link_finder/reporter/text_reporter'
+require_relative './broken_link_finder/reporter/html_reporter'
 require_relative './broken_link_finder/finder'
|
@@ -1,234 +1,227 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative 'reporter'
|
4
|
-
require 'thread/pool'
|
5
|
-
require 'set'
|
6
|
-
|
7
3
|
module BrokenLinkFinder
|
8
|
-
DEFAULT_MAX_THREADS = 100
|
4
|
+
DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
|
5
|
+
SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.
|
9
6
|
|
10
7
|
# Alias for BrokenLinkFinder::Finder.new.
|
11
8
|
def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
12
9
|
Finder.new(sort: sort, max_threads: max_threads)
|
13
10
|
end
|
14
11
|
|
12
|
+
# Class responsible for finding broken links on a page or site.
|
15
13
|
class Finder
|
16
|
-
|
14
|
+
# The collection key - either :page or :link.
|
15
|
+
attr_reader :sort
|
16
|
+
|
17
|
+
# The max number of threads created during #crawl_site - one thread per page.
|
18
|
+
attr_reader :max_threads
|
17
19
|
|
18
|
-
#
|
19
|
-
def initialize(sort: :page, max_threads:
|
20
|
+
# Returns a new Finder instance.
|
21
|
+
def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
20
22
|
raise "Sort by either :page or :link, not #{sort}" \
|
21
23
|
unless %i[page link].include?(sort)
|
22
24
|
|
23
25
|
@sort = sort
|
24
26
|
@max_threads = max_threads
|
25
|
-
@lock = Mutex.new
|
26
27
|
@crawler = Wgit::Crawler.new
|
28
|
+
@manager = BrokenLinkFinder::LinkManager.new(@sort)
|
29
|
+
end
|
30
|
+
|
31
|
+
# Returns the current broken links.
|
32
|
+
def broken_links
|
33
|
+
@manager.broken_links
|
34
|
+
end
|
27
35
|
|
28
|
-
|
36
|
+
# Returns the current ignored links.
|
37
|
+
def ignored_links
|
38
|
+
@manager.ignored_links
|
29
39
|
end
|
30
40
|
|
31
|
-
#
|
32
|
-
def
|
33
|
-
@
|
34
|
-
@ignored_links = {}
|
35
|
-
@total_links_crawled = 0
|
36
|
-
@all_broken_links = Set.new
|
37
|
-
@all_intact_links = Set.new
|
41
|
+
# Returns the current crawl stats.
|
42
|
+
def crawl_stats
|
43
|
+
@manager.crawl_stats
|
38
44
|
end
|
39
45
|
|
40
|
-
# Finds broken links within a single page and
|
41
|
-
#
|
46
|
+
# Finds broken links within a single page and records them.
|
47
|
+
# Returns true if at least one broken link was found.
|
42
48
|
# Access the broken links afterwards with Finder#broken_links.
|
43
49
|
def crawl_url(url)
|
44
|
-
|
50
|
+
@manager.empty
|
45
51
|
|
46
|
-
|
47
|
-
|
52
|
+
start = Time.now
|
53
|
+
url = url.to_url
|
54
|
+
|
55
|
+
# We dup the url to avoid recording any redirects.
|
56
|
+
doc = @crawler.crawl(url.dup)
|
48
57
|
|
49
58
|
# Ensure the given page url is valid.
|
50
59
|
raise "Invalid or broken URL: #{url}" unless doc
|
51
60
|
|
52
61
|
# Get all page links and determine which are broken.
|
53
62
|
find_broken_links(doc)
|
63
|
+
retry_broken_links
|
54
64
|
|
55
|
-
|
56
|
-
|
65
|
+
@manager.sort
|
66
|
+
@manager.tally(url: url, pages_crawled: [url], start: start)
|
57
67
|
|
58
|
-
|
68
|
+
broken_links.any?
|
59
69
|
end
|
60
70
|
|
61
|
-
# Finds broken links within an entire site and
|
62
|
-
#
|
63
|
-
# at least one broken link was found and an Array of all pages crawled.
|
71
|
+
# Finds broken links within an entire site and records them.
|
72
|
+
# Returns true if at least one broken link was found.
|
64
73
|
# Access the broken links afterwards with Finder#broken_links.
|
65
|
-
def crawl_site(url)
|
66
|
-
|
74
|
+
def crawl_site(url, allow_paths: nil, disallow_paths: nil)
|
75
|
+
@manager.empty
|
67
76
|
|
68
|
-
|
69
|
-
|
70
|
-
|
77
|
+
start = Time.now
|
78
|
+
url = url.to_url
|
79
|
+
pool = Thread.pool(@max_threads)
|
80
|
+
crawled = Set.new
|
71
81
|
|
72
82
|
# Crawl the site's HTML web pages looking for links.
|
73
|
-
|
74
|
-
|
83
|
+
# We dup the url to avoid recording any redirects.
|
84
|
+
paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
|
85
|
+
externals = @crawler.crawl_site(url.dup, **paths) do |doc|
|
86
|
+
crawled << doc.url
|
75
87
|
next unless doc
|
76
88
|
|
77
89
|
# Start a thread for each page, checking for broken links.
|
78
90
|
pool.process { find_broken_links(doc) }
|
79
91
|
end
|
80
92
|
|
93
|
+
# Wait for all threads to finish, even if url was invalid.
|
94
|
+
pool.shutdown
|
95
|
+
|
81
96
|
# Ensure the given website url is valid.
|
82
97
|
raise "Invalid or broken URL: #{url}" unless externals
|
83
98
|
|
84
|
-
|
85
|
-
pool.shutdown
|
99
|
+
retry_broken_links
|
86
100
|
|
87
|
-
|
88
|
-
|
101
|
+
@manager.sort
|
102
|
+
@manager.tally(url: url, pages_crawled: crawled.to_a, start: start)
|
89
103
|
|
90
|
-
|
104
|
+
broken_links.any?
|
105
|
+
ensure
|
106
|
+
pool.shutdown if defined?(pool)
|
91
107
|
end
|
92
108
|
|
93
|
-
#
|
109
|
+
# Outputs the link report into a stream e.g. STDOUT or a file,
|
94
110
|
# anything that respond_to? :puts. Defaults to STDOUT.
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
111
|
+
def report(stream = STDOUT, type: :text,
|
112
|
+
broken_verbose: true, ignored_verbose: false)
|
113
|
+
klass = case type
|
114
|
+
when :text
|
115
|
+
BrokenLinkFinder::TextReporter
|
116
|
+
when :html
|
117
|
+
BrokenLinkFinder::HTMLReporter
|
118
|
+
else
|
119
|
+
raise "The type: must be :text or :html, not: :#{type}"
|
120
|
+
end
|
121
|
+
|
122
|
+
reporter = klass.new(stream, @sort,
|
123
|
+
broken_links, ignored_links,
|
124
|
+
@manager.broken_link_map, crawl_stats)
|
125
|
+
reporter.call(broken_verbose: broken_verbose,
|
126
|
+
ignored_verbose: ignored_verbose)
|
110
127
|
end
|
111
128
|
|
112
129
|
private
|
113
130
|
|
114
131
|
# Finds which links are unsupported or broken and records the details.
|
115
|
-
def find_broken_links(
|
116
|
-
|
132
|
+
def find_broken_links(page)
|
133
|
+
record_unparsable_links(page) # Record them as broken.
|
134
|
+
|
135
|
+
links = get_supported_links(page)
|
117
136
|
|
118
137
|
# Iterate over the supported links checking if they're broken or not.
|
119
138
|
links.each do |link|
|
120
|
-
#
|
121
|
-
next if @all_intact_links.include?(link)
|
139
|
+
# Skip if the link has been encountered previously.
|
140
|
+
next if @manager.all_intact_links.include?(link)
|
122
141
|
|
123
|
-
if @all_broken_links.include?(link)
|
124
|
-
|
142
|
+
if @manager.all_broken_links.include?(link)
|
143
|
+
# The link has already been proven broken so simply record it.
|
144
|
+
@manager.append_broken_link(page, link, map: false)
|
125
145
|
next
|
126
146
|
end
|
127
147
|
|
128
|
-
# The link hasn't been
|
129
|
-
link_doc = crawl_link(
|
148
|
+
# The link hasn't been encountered before so we crawl it.
|
149
|
+
link_doc = crawl_link(page, link)
|
130
150
|
|
131
|
-
# Determine if the crawled link is broken or not.
|
132
|
-
if
|
133
|
-
|
134
|
-
has_broken_anchor(link_doc)
|
135
|
-
append_broken_link(doc.url, link)
|
151
|
+
# Determine if the crawled link is broken or not and record it.
|
152
|
+
if link_broken?(link_doc)
|
153
|
+
@manager.append_broken_link(page, link)
|
136
154
|
else
|
137
|
-
@
|
155
|
+
@manager.append_intact_link(link)
|
138
156
|
end
|
139
157
|
end
|
140
158
|
|
141
159
|
nil
|
142
160
|
end
|
143
161
|
|
144
|
-
#
|
145
|
-
#
|
146
|
-
def
|
147
|
-
|
148
|
-
.reject do |link|
|
149
|
-
if link.is_absolute? && !link.start_with?('http')
|
150
|
-
append_ignored_link(doc.url, link)
|
151
|
-
true
|
152
|
-
end
|
153
|
-
end
|
154
|
-
end
|
162
|
+
# Implements a retry mechanism for each of the broken links found.
|
163
|
+
# Removes any broken links found to be working OK.
|
164
|
+
def retry_broken_links
|
165
|
+
sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
|
155
166
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
@crawler.crawl(link)
|
160
|
-
end
|
167
|
+
@manager.broken_link_map.select! do |link, href|
|
168
|
+
# Don't retry unparsable links (which are Strings).
|
169
|
+
next(true) unless href.is_a?(Wgit::Url)
|
161
170
|
|
162
|
-
|
163
|
-
def get_absolute_link(doc, link)
|
164
|
-
link.is_relative? ? doc.base_url(link: link).concat(link) : link
|
165
|
-
end
|
171
|
+
doc = @crawler.crawl(href.dup)
|
166
172
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
anchor = anchor[1..-1] if anchor.start_with?('#')
|
175
|
-
doc.xpath("//*[@id='#{anchor}']").empty?
|
173
|
+
if link_broken?(doc)
|
174
|
+
true
|
175
|
+
else
|
176
|
+
@manager.remove_broken_link(link)
|
177
|
+
false
|
178
|
+
end
|
179
|
+
end
|
176
180
|
end
|
177
181
|
|
178
|
-
#
|
179
|
-
def
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
@
|
184
|
-
@broken_links[key] << value
|
185
|
-
|
186
|
-
@all_broken_links << link
|
182
|
+
# Record each unparsable link as a broken link.
|
183
|
+
def record_unparsable_links(doc)
|
184
|
+
doc.unparsable_links.each do |link|
|
185
|
+
# We map the link ourselves because link is a String, not a Wgit::Url.
|
186
|
+
@manager.append_broken_link(doc, link, map: false)
|
187
|
+
@manager.broken_link_map[link] = link
|
187
188
|
end
|
188
189
|
end
|
189
190
|
|
190
|
-
#
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
191
|
+
# Report and reject any non supported links. Any link that is absolute and
|
192
|
+
# doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
|
193
|
+
def get_supported_links(doc)
|
194
|
+
doc.all_links.reject do |link|
|
195
|
+
if link.is_absolute? && !link.start_with?('http')
|
196
|
+
@manager.append_ignored_link(doc.url, link)
|
197
|
+
true
|
198
|
+
end
|
197
199
|
end
|
198
200
|
end
|
199
201
|
|
200
|
-
#
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
when :page
|
205
|
-
[url, link]
|
206
|
-
when :link
|
207
|
-
[link, url]
|
208
|
-
else
|
209
|
-
raise "Unsupported sort type: #{sort}"
|
210
|
-
end
|
202
|
+
# Make the link absolute and crawl it, returning its Wgit::Document.
|
203
|
+
def crawl_link(doc, link)
|
204
|
+
link = link.make_absolute(doc)
|
205
|
+
@crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
|
211
206
|
end
|
212
207
|
|
213
|
-
#
|
214
|
-
def
|
215
|
-
@
|
216
|
-
|
208
|
+
# Return if the crawled link is broken or not.
|
209
|
+
def link_broken?(doc)
|
210
|
+
doc.nil? || @crawler.last_response.not_found? || has_broken_anchor(doc)
|
211
|
+
end
|
217
212
|
|
218
|
-
|
219
|
-
|
213
|
+
# Returns true if the link is/contains a broken anchor/fragment.
|
214
|
+
# E.g. /about#top should contain a HTML element with an @id of 'top' etc.
|
215
|
+
def has_broken_anchor(doc)
|
216
|
+
raise 'The link document is nil' unless doc
|
220
217
|
|
221
|
-
|
222
|
-
|
223
|
-
end
|
218
|
+
fragment = doc.url.fragment
|
219
|
+
return false if fragment.nil? || fragment.empty?
|
224
220
|
|
225
|
-
|
226
|
-
def set_total_links_crawled
|
227
|
-
@total_links_crawled = @all_broken_links.size + @all_intact_links.size
|
221
|
+
doc.xpath("//*[@id='#{fragment}']").empty?
|
228
222
|
end
|
229
223
|
|
230
|
-
alias crawl_page
|
231
|
-
alias crawl_r
|
232
|
-
alias pretty_print_link_summary pretty_print_link_report
|
224
|
+
alias crawl_page crawl_url
|
225
|
+
alias crawl_r crawl_site
|
233
226
|
end
|
234
227
|
end
|
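Pulling the `finder.rb` changes together, here is a sketch of the reworked public API as exposed by this diff. The path-filter values and the `crawl_stats` keys flagged in the comments are assumptions for illustration:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new(sort: :page, max_threads: 100) # The defaults.

# Crawl an entire site; allow/disallow path filters are optional (values illustrative).
finder.crawl_site('http://txti.es', allow_paths: nil, disallow_paths: nil)

# Write the new HTML report to a file - any stream responding to :puts works.
File.open('report.html', 'w') do |file|
  finder.report(file, type: :html, broken_verbose: true, ignored_verbose: false)
end

finder.crawl_stats  # => Hash of stats e.g. :url etc. (other keys assumed).
finder.broken_links # => Hash of broken links, keyed by page or link (per sort:).
```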