broken_link_finder 0.10.0 → 0.12.1
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +40 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +48 -35
- data/README.md +48 -16
- data/bin/console +8 -6
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +3 -3
- data/exe/broken_link_finder +9 -1
- data/lib/broken_link_finder/finder.rb +98 -135
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +9 -6
- data/lib/broken_link_finder/reporter/reporter.rb +2 -3
- data/lib/broken_link_finder/reporter/text_reporter.rb +7 -5
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +25 -5
- data/lib/broken_link_finder/xpath.rb +14 -0
- data/lib/broken_link_finder.rb +3 -1
- metadata +16 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 88b1e96f1de644a1a3c06ba7cc0ee1b53f75a3de6686b343e55028e8fa69da9f
+  data.tar.gz: e399ca05a4b0b9b2c0644b2846fa9dc6be6acd664e1bdc58758eb9ca7a5543cd
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 57a1604358b0297b66604d1fc5a60a9d1bda05aa9bd5f6b91135ddc2aec4a6eb703c00ef4d905ac156170b190bf500481ce56cf6319f07e8b57447cca4c6a210
+  data.tar.gz: f4b88e66c9c4fcd2bcbca2fe882abdede7c531e1d5e752a2ac986e39cf51d87714852dcb6e7e8e4870b623d54b468cc8f3ec88c253e7182c1fe89c0af91366a4
data/.ruby-version
CHANGED
@@ -1 +1 @@
-
+3.0.2
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,46 @@
 - ...
 ---
 
+## v0.12.1
+### Added
+- Support for Ruby 3.
+### Changed/Removed
+- Removed support for Ruby 2.5 (as it's too old).
+### Fixed
+- ...
+---
+
+## v0.12.0
+### Added
+- `BrokenLinkFinder::link_xpath` and `link_xpath=` methods so you can customise how links are extracted from each crawled page when using the API.
+- An `--xpath` (or just `-x`) command line flag so you can customise how links are extracted when using the command line.
+### Changed/Removed
+- Changed the default way in which links are extracted from a page. Previously, any element with a `href` or `src` attribute was extracted and checked; now only links inside the `<body>` are extracted and checked, ignoring the `<head>` section entirely. You can restore the old behaviour with `BrokenLinkFinder::link_xpath = '//*/@href | //*/@src'` before you perform a crawl. Alternatively, if using the command line, pass the `--xpath '//*/@href | //*/@src'` option.
+### Fixed
+- [Scheme relative bug](https://github.com/michaeltelford/broken_link_finder/issues/16) by upgrading to `wgit v0.10.0`.
+---
+
+## v0.11.1
+### Added
+- ...
+### Changed/Removed
+- Updated the wgit gem to version 0.9.0, which contains improvements and bug fixes.
+### Fixed
+- ...
+---
+
+## v0.11.0
+### Added
+- Additional crawl statistics.
+- Exit code handling in the executable: `0` for success, `1` for an error scenario.
+### Changed/Removed
+- Updated the report formats slightly, bringing various improvements such as the total number of links crawled.
+### Fixed
+- Bug in the HTML report: the summary URL is now an `<a>` link.
+- Bug in `Finder@broken_link_map` URLs and `Finder#crawl_stats[:url]` URL during redirects.
+- Bug causing an error when crawling unparsable/invalid URLs.
+---
+
 
 ## v0.10.0
 ### Added
 - A `--html` flag to the `crawl` executable command which produces an HTML report (instead of text).
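To illustrate the v0.12.0 extraction change above, here is a minimal sketch of restoring the old behaviour via the API (the target URL is illustrative):

```ruby
require 'broken_link_finder'

# Restore the pre-v0.12.0 behaviour of checking every element with a
# href/src attribute, including those inside <head>.
BrokenLinkFinder::link_xpath = '//*/@href | //*/@src'

BrokenLinkFinder.new.crawl_page 'http://txti.es'
```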
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,50 +1,63 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.
+    broken_link_finder (0.12.1)
       thor (~> 0.20)
       thread (~> 0.2)
-      wgit (~> 0.
+      wgit (~> 0.10)
 
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.
-      public_suffix (>= 2.0.2, <
-    bson (4.
-    byebug (11.
+    addressable (2.8.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    bson (4.12.1)
+    byebug (11.1.3)
+    cliver (0.3.2)
+    coderay (1.1.3)
+    concurrent-ruby (1.1.9)
+    crack (0.4.5)
+      rexml
+    ethon (0.15.0)
+      ffi (>= 1.15.0)
+    ferrum (0.11)
+      addressable (~> 2.5)
+      cliver (~> 0.3)
+      concurrent-ruby (~> 1.1)
+      websocket-driver (>= 0.6, < 0.8)
+    ffi (1.15.4)
+    hashdiff (1.0.1)
+    maxitest (3.7.0)
+      minitest (>= 5.0.0, < 5.15.0)
+    method_source (1.0.0)
+    mini_portile2 (2.6.1)
+    minitest (5.14.4)
+    mongo (2.17.0)
+      bson (>= 4.8.2, < 5.0.0)
+    nokogiri (1.12.5)
+      mini_portile2 (~> 2.6.1)
+      racc (~> 1.4)
+    pry (0.14.1)
+      coderay (~> 1.1)
+      method_source (~> 1.0)
+    public_suffix (4.0.6)
+    racc (1.6.0)
+    rake (13.0.6)
+    rexml (3.2.5)
     thor (0.20.3)
     thread (0.2.2)
-    typhoeus (1.
+    typhoeus (1.4.0)
       ethon (>= 0.9.0)
-    webmock (3.
-      addressable (>= 2.
+    webmock (3.14.0)
+      addressable (>= 2.8.0)
       crack (>= 0.3.2)
       hashdiff (>= 0.4.0, < 2.0.0)
+    websocket-driver (0.7.5)
+      websocket-extensions (>= 0.1.0)
+    websocket-extensions (0.1.5)
+    wgit (0.10.2)
       addressable (~> 2.6)
+      ferrum (~> 0.8)
       mongo (~> 2.9)
       nokogiri (~> 1.10)
       typhoeus (~> 1.3)
@@ -58,11 +71,11 @@ DEPENDENCIES
   byebug (~> 11.0)
   maxitest (~> 3.3)
   pry (~> 0.12)
-  rake (~>
+  rake (~> 13.0)
   webmock (~> 3.6)
 
 RUBY VERSION
-   ruby
+   ruby 3.0.2p107
 
 BUNDLED WITH
-   2.
+   2.2.22
data/README.md
CHANGED
@@ -1,14 +1,16 @@
 # Broken Link Finder
 
-Does what it says on the tin
+Does what it says on the tin - finds a website's broken links.
 
-Simply point it at a website and it will crawl all of its webpages searching for and identifing
+Simply point it at a website and it will crawl all of its webpages, searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
 
-
+Broken Link Finder is multi-threaded and uses `libcurl` under the hood, so it's fast!
 
 ## How It Works
 
-Any HTML
+Any HTML element within `<body>` with a `href` or `src` attribute is considered a link (this is [configurable](#Link-Extraction) however).
+
+For each link on a given page, any of the following conditions means the link is considered broken:
 
 - An empty HTML response body is returned.
 - A response status code of `404 Not Found` is returned.
@@ -29,27 +31,27 @@ With that said, the usual array of HTTP URL features are supported including anc
 
 ## Installation
 
-
+Only MRI Ruby is tested and supported, but `broken_link_finder` may work with other Ruby implementations.
 
-
-gem 'broken_link_finder'
-```
+Currently, the required MRI Ruby version is:
 
-
+`ruby '>= 2.6', '< 4'`
 
-
+### Using Bundler
 
-
+    $ bundle add broken_link_finder
+
+### Using RubyGems
 
     $ gem install broken_link_finder
 
-
+### Verify
 
     $ broken_link_finder version
 
 ## Usage
 
-You can check for broken links via the
+You can check for broken links via the executable or library.
 
 ### Executable
 
@@ -91,9 +93,10 @@ See the full source code documentation [here](https://www.rubydoc.info/gems/brok
 If broken links are found then the output will look something like:
 
 ```text
-Crawled http://txti.es
+Crawled http://txti.es
+7 page(s) containing 32 unique link(s) in 6.82 seconds
 
-Found 6 broken link(s) across 2 page(s):
+Found 6 unique broken link(s) across 2 page(s):
 
 The following broken links were found on 'http://txti.es/about':
 http://twitter.com/thebarrytone
@@ -105,7 +108,7 @@ The following broken links were found on 'http://txti.es/how':
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
 
-Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
+Ignored 3 unique unsupported link(s) across 2 page(s), which you should check manually:
 
 The following links were ignored on 'http://txti.es':
 tel:+13174562564
@@ -117,6 +120,35 @@ ftp://server.com
 
 You can provide the `--html` flag if you'd prefer an HTML based report.
 
+## Link Extraction
+
+You can customise the XPath used to extract links from each crawled page. This can be done via the executable or the library.
+
+### Executable
+
+Add the `--xpath` (or `-x`) flag to the crawl command e.g.
+
+    $ broken_link_finder crawl http://txti.es -x //img/@src
+
+### Library
+
+Set the desired XPath using the accessor methods provided:
+
+> main.rb
+
+```ruby
+require 'broken_link_finder'
+
+# Set your desired xpath before crawling...
+BrokenLinkFinder::link_xpath = '//img/@src'
+
+# Now crawl as normal and only your custom targeted links will be checked.
+BrokenLinkFinder.new.crawl_page 'http://txti.es'
+
+# Go back to using the default provided xpath as needed.
+BrokenLinkFinder::link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH
+```
 
 ## Contributing
 
 Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
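Since a report can be written to anything that responds to `:puts`, a minimal sketch of saving the HTML report to a file (the file name is illustrative) looks like:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site 'http://txti.es'

# Any stream-like object responding to :puts works, not just STDOUT.
File.open('report.html', 'w') do |file|
  finder.report(file, type: :html)
end
```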
data/bin/console
CHANGED
@@ -23,12 +23,14 @@ end
 # You can add fixtures and/or initialization code here...
 reload
 
-url
-by_page
-by_link
-finder
+def url; @url ||= 'http://txti.es/'; end
+def by_page; @by_page ||= Finder.new; end
+def by_link; @by_link ||= Finder.new(sort: :link); end
+def finder; @finder ||= by_page; end
 
 # Start the console.
-puts
+puts
+puts "broken_link_finder v#{BrokenLinkFinder::VERSION} (#{Wgit.version_str})"
+puts
 
+Pry.start
data/bin/setup
CHANGED
data/broken_link_finder.gemspec
CHANGED
@@ -38,16 +38,16 @@ Gem::Specification.new do |spec|
   spec.require_paths = ['lib']
   spec.post_install_message = "Added the executable 'broken_link_finder' to $PATH"
 
-  spec.required_ruby_version = '
+  spec.required_ruby_version = '>= 2.6', '< 4'
 
   spec.add_development_dependency 'bundler', '~> 2.0'
   spec.add_development_dependency 'byebug', '~> 11.0'
   spec.add_development_dependency 'maxitest', '~> 3.3'
   spec.add_development_dependency 'pry', '~> 0.12'
-  spec.add_development_dependency 'rake', '~>
+  spec.add_development_dependency 'rake', '~> 13.0'
   spec.add_development_dependency 'webmock', '~> 3.6'
 
   spec.add_runtime_dependency 'thor', '~> 0.20'
   spec.add_runtime_dependency 'thread', '~> 0.2'
-  spec.add_runtime_dependency 'wgit', '~> 0.
+  spec.add_runtime_dependency 'wgit', '~> 0.10'
 end
data/exe/broken_link_finder
CHANGED
@@ -9,6 +9,7 @@ class BrokenLinkFinderCLI < Thor
   desc 'crawl [URL]', 'Find broken links at the URL'
   option :recursive, type: :boolean, aliases: [:r], default: false, desc: 'Crawl the entire site.'
   option :threads, type: :numeric, aliases: [:t], default: BrokenLinkFinder::DEFAULT_MAX_THREADS, desc: 'Max number of threads to use when crawling recursively; 1 thread per web page.'
+  option :xpath, type: :string, aliases: [:x], default: BrokenLinkFinder::DEFAULT_LINK_XPATH
   option :html, type: :boolean, aliases: [:h], default: false, desc: 'Produce a HTML report (instead of text)'
   option :sort_by_link, type: :boolean, aliases: [:l], default: false, desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
   option :verbose, type: :boolean, aliases: [:v], default: false, desc: 'Display all ignored links.'
@@ -22,6 +23,7 @@ class BrokenLinkFinderCLI < Thor
     broken_verbose = !options[:concise]
     ignored_verbose = options[:verbose]
 
+    BrokenLinkFinder.link_xpath = options[:xpath]
     finder = BrokenLinkFinder::Finder.new(sort: sort_by, max_threads: max_threads)
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
     finder.report(
@@ -29,13 +31,19 @@ class BrokenLinkFinderCLI < Thor
       broken_verbose: broken_verbose,
       ignored_verbose: ignored_verbose
     )
+
+    exit 0
+  rescue StandardError => e
     puts "An error has occurred: #{e.message}"
+
+    exit 1
   end
 
   desc 'version', 'Display the currently installed version'
   def version
     puts "broken_link_finder v#{BrokenLinkFinder::VERSION}"
+
+    exit 0
   end
 end
 
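Putting the new `--xpath` flag and the exit codes together, a hypothetical invocation looks like:

```text
$ broken_link_finder crawl http://txti.es -r -x '//img/@src' --html > report.html
$ echo $?   # => 0 if the crawl succeeded, 1 if an error occurred
```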
data/lib/broken_link_finder/finder.rb
CHANGED
@@ -1,48 +1,59 @@
 # frozen_string_literal: true
 
 module BrokenLinkFinder
-  DEFAULT_MAX_THREADS = 100
+  DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
+  SERVER_WAIT_TIME = 0.5    # Used by Finder#retry_broken_links.
 
   # Alias for BrokenLinkFinder::Finder.new.
   def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
     Finder.new(sort: sort, max_threads: max_threads)
   end
 
+  # Class responsible for finding broken links on a page or site.
   class Finder
-
+    # The collection key - either :page or :link.
+    attr_reader :sort
 
-    #
-
+    # The max number of threads created during #crawl_site - one thread per page.
+    attr_reader :max_threads
+
+    # Returns a new Finder instance.
+    def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
       raise "Sort by either :page or :link, not #{sort}" \
       unless %i[page link].include?(sort)
 
       @sort        = sort
       @max_threads = max_threads
-      @lock        = Mutex.new
       @crawler     = Wgit::Crawler.new
+      @manager     = BrokenLinkFinder::LinkManager.new(@sort)
+    end
+
+    # Returns the current broken links.
+    def broken_links
+      @manager.broken_links
+    end
 
-
+    # Returns the current ignored links.
+    def ignored_links
+      @manager.ignored_links
     end
 
-    #
-    def
-      @
-      @ignored_links = {}
-      @all_broken_links = Set.new # Used to prevent crawling a link twice.
-      @all_intact_links = Set.new # "
-      @broken_link_map = {} # Maps a link to its absolute form.
-      @crawl_stats = {} # Records crawl stats e.g. duration etc.
+    # Returns the current crawl stats.
+    def crawl_stats
+      @manager.crawl_stats
     end
 
-    # Finds broken links within a single page and
-    #
+    # Finds broken links within a single page and records them.
+    # Returns true if at least one broken link was found.
    # Access the broken links afterwards with Finder#broken_links.
    def crawl_url(url)
-
+      @manager.empty
 
       start = Time.now
       url   = url.to_url
-
+
+      # We dup the url to avoid recording any redirects.
+      doc = @crawler.crawl(url.dup)
 
       # Ensure the given page url is valid.
       raise "Invalid or broken URL: #{url}" unless doc
@@ -51,18 +62,17 @@ module BrokenLinkFinder
       find_broken_links(doc)
       retry_broken_links
 
-
-
+      @manager.sort
+      @manager.tally(url: url, pages_crawled: [url], start: start)
 
-
+      broken_links.any?
     end
 
-    # Finds broken links within an entire site and
-    #
-    # at least one broken link was found and an Array of all pages crawled.
+    # Finds broken links within an entire site and records them.
+    # Returns true if at least one broken link was found.
     # Access the broken links afterwards with Finder#broken_links.
-    def crawl_site(url)
-
+    def crawl_site(url, allow_paths: nil, disallow_paths: nil)
+      @manager.empty
 
       start = Time.now
       url   = url.to_url
@@ -70,7 +80,9 @@ module BrokenLinkFinder
       crawled = Set.new
 
       # Crawl the site's HTML web pages looking for links.
-
+      # We dup the url to avoid recording any redirects.
+      paths     = { allow_paths: allow_paths, disallow_paths: disallow_paths }
+      externals = @crawler.crawl_site(url.dup, **paths) do |doc|
         crawled << doc.url
         next unless doc
 
@@ -78,35 +90,39 @@ module BrokenLinkFinder
         pool.process { find_broken_links(doc) }
       end
 
+      # Wait for all threads to finish, even if url was invalid.
+      pool.shutdown
+
       # Ensure the given website url is valid.
       raise "Invalid or broken URL: #{url}" unless externals
 
-      # Wait for all threads to finish.
-      pool.shutdown
       retry_broken_links
 
-
-
+      @manager.sort
+      @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)
 
-
+      broken_links.any?
+    ensure
+      pool.shutdown if defined?(pool)
     end
 
-    #
+    # Outputs the link report into a stream e.g. STDOUT or a file,
     # anything that respond_to? :puts. Defaults to STDOUT.
-    def report(stream = STDOUT,
-
+    def report(stream = STDOUT, type: :text,
+               broken_verbose: true, ignored_verbose: false)
       klass = case type
               when :text
                 BrokenLinkFinder::TextReporter
               when :html
                 BrokenLinkFinder::HTMLReporter
               else
-                raise "type: must be :text or :html, not: :#{type}"
+                raise "The type: must be :text or :html, not: :#{type}"
              end
 
-      reporter = klass.new(stream, @sort,
-
-
+      reporter = klass.new(stream, @sort,
+                           broken_links, ignored_links,
+                           @manager.broken_link_map, crawl_stats)
+
       reporter.call(broken_verbose: broken_verbose,
                     ignored_verbose: ignored_verbose)
     end
@@ -114,26 +130,29 @@ module BrokenLinkFinder
 
     # Finds which links are unsupported or broken and records the details.
     def find_broken_links(page)
+      record_unparsable_links(page) # Record them as broken.
+
       links = get_supported_links(page)
 
       # Iterate over the supported links checking if they're broken or not.
       links.each do |link|
-        # Skip if the link has been
-        next if @all_intact_links.include?(link)
+        # Skip if the link has been encountered previously.
+        next if @manager.all_intact_links.include?(link)
 
-        if @all_broken_links.include?(link)
-
+        if @manager.all_broken_links.include?(link)
+          # The link has already been proven broken so simply record it.
+          @manager.append_broken_link(page, link, map: false)
           next
         end
 
-        # The link hasn't been
+        # The link hasn't been encountered before so we crawl it.
         link_doc = crawl_link(page, link)
 
-        # Determine if the crawled link is broken or not.
+        # Determine if the crawled link is broken or not and record it.
         if link_broken?(link_doc)
-          append_broken_link(page
+          @manager.append_broken_link(page, link)
         else
-          @
+          @manager.append_intact_link(link)
         end
       end
 
@@ -143,30 +162,47 @@ module BrokenLinkFinder
     # Implements a retry mechanism for each of the broken links found.
     # Removes any broken links found to be working OK.
     def retry_broken_links
-      sleep(
+      sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
+
+      @manager.broken_link_map.select! do |link, href|
+        # Don't retry unparsable links (which are Strings).
+        next(true) unless href.is_a?(Wgit::Url)
+
+        doc = @crawler.crawl(href.dup)
 
-
-
-
+        if link_broken?(doc)
+          true
+        else
+          @manager.remove_broken_link(link)
+          false
+        end
+      end
+    end
+
+    # Record each unparsable link as a broken link.
+    def record_unparsable_links(doc)
+      doc.unparsable_links.each do |link|
+        # We map the link ourselves because link is a String, not a Wgit::Url.
+        @manager.append_broken_link(doc, link, map: false)
+        @manager.broken_link_map[link] = link
       end
     end
 
     # Report and reject any non supported links. Any link that is absolute and
     # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
     def get_supported_links(doc)
-      doc.all_links
-
-
-
-
-
-      end
+      doc.all_links.reject do |link|
+        if link.is_absolute? && !link.start_with?('http')
+          @manager.append_ignored_link(doc.url, link)
+          true
+        end
+      end
     end
 
     # Make the link absolute and crawl it, returning its Wgit::Document.
     def crawl_link(doc, link)
-      link = link.
-      @crawler.crawl(link)
+      link = link.make_absolute(doc)
+      @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
     end
 
     # Return if the crawled link is broken or not.
@@ -175,8 +211,9 @@ module BrokenLinkFinder
     end
 
     # Returns true if the link is/contains a broken anchor/fragment.
+    # E.g. /about#top should contain an HTML element with an @id of 'top' etc.
     def has_broken_anchor(doc)
-      raise 'link document is nil' unless doc
+      raise 'The link document is nil' unless doc
 
       fragment = doc.url.fragment
       return false if fragment.nil? || fragment.empty?
@@ -184,80 +221,6 @@ module BrokenLinkFinder
       doc.xpath("//*[@id='#{fragment}']").empty?
     end
 
-    # Append key => [value] to @broken_links.
-    # If doc: is provided then the link will be recorded in absolute form.
-    def append_broken_link(url, link, doc: nil)
-      key, value = get_key_value(url, link)
-
-      @lock.synchronize do
-        @broken_links[key] = [] unless @broken_links[key]
-        @broken_links[key] << value
-
-        @all_broken_links << link
-
-        @broken_link_map[link] = link.prefix_base(doc) if doc
-      end
-    end
-
-    # Remove the broken_link from the necessary collections.
-    def remove_broken_link(link)
-      @lock.synchronize do
-        if @sort == :page
-          @broken_links.each { |_k, links| links.delete(link) }
-          @broken_links.delete_if { |_k, links| links.empty? }
-        else
-          @broken_links.delete(link)
-        end
-
-        @all_broken_links.delete(link)
-        @all_intact_links << link
-      end
-    end
-
-    # Append key => [value] to @ignored_links.
-    def append_ignored_link(url, link)
-      key, value = get_key_value(url, link)
-
-      @lock.synchronize do
-        @ignored_links[key] = [] unless @ignored_links[key]
-        @ignored_links[key] << value
-      end
-    end
-
-    # Returns the correct key value depending on the @sort type.
-    # @sort == :page ? [url, link] : [link, url]
-    def get_key_value(url, link)
-      case @sort
-      when :page
-        [url, link]
-      when :link
-        [link, url]
-      else
-        raise "Unsupported sort type: #{sort}"
-      end
-    end
-
-    # Sort keys and values alphabetically.
-    def sort_links
-      @broken_links.values.map(&:uniq!)
-      @ignored_links.values.map(&:uniq!)
-
-      @broken_links = @broken_links.sort_by { |k, _v| k }.to_h
-      @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
-
-      @broken_links.each { |_k, v| v.sort! }
-      @ignored_links.each { |_k, v| v.sort! }
-    end
-
-    # Sets and returns the total number of links crawled.
-    def set_crawl_stats(url:, pages_crawled:, start:)
-      @crawl_stats[:url] = url
-      @crawl_stats[:pages_crawled] = pages_crawled
-      @crawl_stats[:num_pages] = pages_crawled.size
-      @crawl_stats[:num_links] = @all_broken_links.size + @all_intact_links.size
-      @crawl_stats[:duration] = Time.now - start
-    end
-
     alias crawl_page crawl_url
     alias crawl_r crawl_site
   end
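A minimal sketch of the reworked `Finder` API (the URL and path filters are illustrative):

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder::Finder.new(sort: :page, max_threads: 50)

# crawl_site now accepts optional path filters, passed through to the crawler.
finder.crawl_site 'http://txti.es', allow_paths: ['about', 'how']

finder.broken_links # => { page_url => [broken_links] } because sort: :page
finder.crawl_stats  # => { url:, num_pages:, num_links:, duration:, ... }

finder.report(STDOUT, type: :text) # Reports now read from the LinkManager.
```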
data/lib/broken_link_finder/link_manager.rb
ADDED
@@ -0,0 +1,137 @@
+# frozen_string_literal: true
+
+module BrokenLinkFinder
+  # Class responsible for handling the link collection logic.
+  class LinkManager
+    # Used for mapping pages to broken links.
+    attr_reader :broken_links
+
+    # Used for mapping pages to ignored links.
+    attr_reader :ignored_links
+
+    # Used to record crawl statistics e.g. duration etc.
+    attr_reader :crawl_stats
+
+    # Used to map a link (as is) to its absolute (crawlable) form.
+    attr_reader :broken_link_map
+
+    # Used to prevent crawling a broken link twice.
+    attr_reader :all_broken_links
+
+    # Used to prevent crawling an intact link twice.
+    attr_reader :all_intact_links
+
+    # Used for building crawl statistics.
+    attr_reader :all_ignored_links
+
+    # Returns a new LinkManager instance with empty link collections.
+    def initialize(sort)
+      raise "Sort by either :page or :link, not #{sort}" \
+      unless %i[page link].include?(sort)
+
+      @sort = sort
+      @lock = Mutex.new
+
+      empty # Initialises the link collections.
+    end
+
+    # Initialise/empty the link collection objects.
+    def empty
+      @broken_links      = {}
+      @ignored_links     = {}
+      @crawl_stats       = {}
+      @broken_link_map   = {}
+      @all_broken_links  = Set.new
+      @all_intact_links  = Set.new
+      @all_ignored_links = Set.new
+    end
+
+    # Append key => [value] to the broken link collections.
+    # If map: true, then the link will also be recorded in @broken_link_map.
+    def append_broken_link(doc, link, map: true)
+      key, value = get_key_value(doc.url, link)
+
+      @lock.synchronize do
+        @broken_links[key] = [] unless @broken_links[key]
+        @broken_links[key] << value
+
+        @all_broken_links << link
+
+        @broken_link_map[link] = link.make_absolute(doc) if map
+      end
+    end
+
+    # Remove the broken link from the necessary collections.
+    def remove_broken_link(link)
+      @lock.synchronize do
+        if @sort == :page
+          @broken_links.each { |_k, links| links.delete(link) }
+          @broken_links.delete_if { |_k, links| links.empty? }
+        else
+          @broken_links.delete(link)
+        end
+
+        @all_broken_links.delete(link)
+        @all_intact_links << link
+      end
+    end
+
+    # Append key => [value] to the ignored link collections.
+    def append_ignored_link(url, link)
+      key, value = get_key_value(url, link)
+
+      @lock.synchronize do
+        @ignored_links[key] = [] unless @ignored_links[key]
+        @ignored_links[key] << value
+
+        @all_ignored_links << link
+      end
+    end
+
+    # Append link to @all_intact_links.
+    def append_intact_link(link)
+      @lock.synchronize { @all_intact_links << link }
+    end
+
+    # Sorts the link collection's keys and values alphabetically.
+    def sort
+      @broken_links.values.map(&:uniq!)
+      @ignored_links.values.map(&:uniq!)
+
+      @broken_links  = @broken_links.sort_by { |k, _v| k }.to_h
+      @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
+
+      @broken_links.each { |_k, v| v.sort! }
+      @ignored_links.each { |_k, v| v.sort! }
+    end
+
+    # Tallies up various statistics about the crawl and its links.
+    def tally(url:, pages_crawled:, start:)
+      @crawl_stats[:url]           = url
+      @crawl_stats[:pages_crawled] = pages_crawled
+      @crawl_stats[:num_pages]     = pages_crawled.size
+      @crawl_stats[:num_links]     = (
+        @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
+      )
+      @crawl_stats[:num_broken_links]  = @all_broken_links.size
+      @crawl_stats[:num_intact_links]  = @all_intact_links.size
+      @crawl_stats[:num_ignored_links] = @all_ignored_links.size
+      @crawl_stats[:duration]          = Time.now - start
+    end
+
+    private
+
+    # Returns the correct key value depending on the @sort type.
+    # @sort == :page ? [url, link] : [link, url]
+    def get_key_value(url, link)
+      case @sort
+      when :page
+        [url, link]
+      when :link
+        [link, url]
+      else
+        raise "Unsupported sort type: #{sort}"
+      end
+    end
+  end
+end
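To illustrate how `get_key_value` inverts the mapping between the two sort modes, here is a sketch where a simple struct stands in for a real `Wgit::Document` (hence `map: false`, which skips `@broken_link_map`); the URLs are illustrative:

```ruby
require 'broken_link_finder'

page = Struct.new(:url).new('http://txti.es/about'.to_url)
link = 'http://twitter.com/thebarrytone'.to_url

by_page = BrokenLinkFinder::LinkManager.new(:page)
by_link = BrokenLinkFinder::LinkManager.new(:link)

by_page.append_broken_link(page, link, map: false)
by_link.append_broken_link(page, link, map: false)

by_page.broken_links # => { the page URL => [the link] }
by_link.broken_links # => { the link => [the page URL] }
```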
data/lib/broken_link_finder/reporter/html_reporter.rb
CHANGED
@@ -1,8 +1,9 @@
 # frozen_string_literal: true
 
 module BrokenLinkFinder
+  # Class responsible for reporting in an HTML format.
   class HTMLReporter < Reporter
-    #
+    # Returns a new HTMLReporter instance.
     # stream is any Object that responds to :puts and :print.
     def initialize(stream, sort,
                    broken_links, ignored_links,
@@ -28,9 +29,11 @@ module BrokenLinkFinder
     # Report a summary of the overall crawl.
     def report_crawl_summary
       puts format(
-        '<p class="crawl_summary">Crawled %s (%s
+        '<p class="crawl_summary">Crawled <a href="%s">%s</a><br />%s page(s) containing %s unique link(s) in %s seconds</p>',
+        @crawl_stats[:url],
         @crawl_stats[:url],
         @crawl_stats[:num_pages],
+        @crawl_stats[:num_links],
         @crawl_stats[:duration]&.truncate(2)
       )
     end
@@ -43,7 +46,7 @@ module BrokenLinkFinder
       puts_summary 'Good news, there are no broken links!', type: :broken
     else
       num_pages, num_links = get_hash_stats(@broken_links)
-      puts_summary "Found #{num_links} broken link(s) across #{num_pages} page(s):", type: :broken
+      puts_summary "Found #{num_links} unique broken link(s) across #{num_pages} page(s):", type: :broken
 
       @broken_links.each do |key, values|
         puts_group(key, type: :broken) # Puts the opening <p> element.
@@ -70,7 +73,7 @@ module BrokenLinkFinder
 
     if @ignored_links.any?
       num_pages, num_links = get_hash_stats(@ignored_links)
-      puts_summary "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
+      puts_summary "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:", type: :ignored
 
       @ignored_links.each do |key, values|
         puts_group(key, type: :ignored) # Puts the opening <p> element.
@@ -125,8 +128,8 @@ module BrokenLinkFinder
     end
 
     def build_url(link)
-
-
+      href = @broken_link_map[link]
+      href || link
     end
 
     alias_method :report, :call
data/lib/broken_link_finder/reporter/reporter.rb
CHANGED
@@ -6,7 +6,7 @@ module BrokenLinkFinder
     # The amount of pages/links to display when verbose is false.
     NUM_VALUES = 3
 
-    #
+    # Returns a new Reporter instance.
     # stream is any Object that responds to :puts and :print.
     def initialize(stream, sort,
                    broken_links, ignored_links,
@@ -42,8 +42,7 @@ module BrokenLinkFinder
     # Use like: `num_pages, num_links = get_hash_stats(links)`.
     def get_hash_stats(hash)
       num_keys = hash.keys.length
-
-      num_values = sort_by_page? ? values.length : values.uniq.length
+      num_values = hash.values.flatten.uniq.length
 
       sort_by_page? ?
         [num_keys, num_values] :
data/lib/broken_link_finder/reporter/text_reporter.rb
CHANGED
@@ -1,8 +1,9 @@
 # frozen_string_literal: true
 
 module BrokenLinkFinder
+  # Class responsible for reporting in a text format.
   class TextReporter < Reporter
-    #
+    # Returns a new TextReporter instance.
     # stream is any Object that responds to :puts and :print.
     def initialize(stream, sort,
                    broken_links, ignored_links,
@@ -23,10 +24,11 @@ module BrokenLinkFinder
 
     # Report a summary of the overall crawl.
     def report_crawl_summary
+      puts "Crawled #{@crawl_stats[:url]}"
       putsn format(
-        '
-        @crawl_stats[:url],
+        '%s page(s) containing %s unique link(s) in %s seconds',
         @crawl_stats[:num_pages],
+        @crawl_stats[:num_links],
         @crawl_stats[:duration]&.truncate(2)
       )
     end
@@ -37,7 +39,7 @@ module BrokenLinkFinder
       puts 'Good news, there are no broken links!'
     else
       num_pages, num_links = get_hash_stats(@broken_links)
-      puts "Found #{num_links} broken link(s) across #{num_pages} page(s):"
+      puts "Found #{num_links} unique broken link(s) across #{num_pages} page(s):"
 
       @broken_links.each do |key, values|
         msg = sort_by_page? ?
@@ -61,7 +63,7 @@ module BrokenLinkFinder
     def report_ignored_links(verbose: false)
       if @ignored_links.any?
         num_pages, num_links = get_hash_stats(@ignored_links)
-        nputs "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:"
+        nputs "Ignored #{num_links} unique unsupported link(s) across #{num_pages} page(s), which you should check manually:"
 
         @ignored_links.each do |key, values|
           msg = sort_by_page? ?
data/lib/broken_link_finder/wgit_extensions.rb
CHANGED
@@ -1,11 +1,31 @@
 # frozen_string_literal: true
 
-#
-
+# Define a method on each doc for recording unparsable links.
+# Unparsable links are recorded as broken links by Finder.
+class Wgit::Document
+  def unparsable_links
+    @unparsable_links ||= []
+  end
+end
+
+# Returns a Wgit::Url or nil (if link is unparsable).
+# A proc is preferable to a function to avoid polluting the global namespace.
+parse_link = lambda do |doc, link|
+  Wgit::Url.new(link)
+rescue StandardError
+  doc.unparsable_links << link
+  nil
+end
+
+# Define a custom extractor for all page links we're interested in checking.
+Wgit::Document.define_extractor(
   :all_links,
-
+  lambda { BrokenLinkFinder::link_xpath },
   singleton: false,
   text_content_only: true
-) do |links|
-  links
+) do |links, doc|
+  links
+    .uniq
+    .map { |link| parse_link.call(doc, link) }
+    .compact
 end
data/lib/broken_link_finder/xpath.rb
ADDED
@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+
+module BrokenLinkFinder
+  # Extract all the Document's <body> links e.g. <a>, <img>, <script> etc.
+  DEFAULT_LINK_XPATH = '/html/body//*/@href | /html/body//*/@src'
+
+  @link_xpath = DEFAULT_LINK_XPATH
+
+  class << self
+    # The xpath used to extract links from a crawled page.
+    # Can be overridden as required.
+    attr_accessor :link_xpath
+  end
+end
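Because the extractor in `wgit_extensions.rb` above reads `BrokenLinkFinder::link_xpath` through a lambda, the xpath is evaluated at crawl time and can be swapped between crawls. A sketch with an illustrative, narrower xpath:

```ruby
require 'broken_link_finder'

# Only check <a> links inside <body> for this crawl.
BrokenLinkFinder.link_xpath = '/html/body//a/@href'
BrokenLinkFinder.new.crawl_page 'http://txti.es'

# Revert to the gem's default for subsequent crawls.
BrokenLinkFinder.link_xpath = BrokenLinkFinder::DEFAULT_LINK_XPATH
```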
data/lib/broken_link_finder.rb
CHANGED
@@ -5,8 +5,10 @@ require 'wgit/core_ext'
 require 'thread/pool'
 require 'set'
 
-require_relative './broken_link_finder/wgit_extensions'
 require_relative './broken_link_finder/version'
+require_relative './broken_link_finder/xpath'
+require_relative './broken_link_finder/wgit_extensions'
+require_relative './broken_link_finder/link_manager'
 require_relative './broken_link_finder/reporter/reporter'
 require_relative './broken_link_finder/reporter/text_reporter'
 require_relative './broken_link_finder/reporter/html_reporter'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: broken_link_finder
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.12.1
 platform: ruby
 authors:
 - Michael Telford
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2021-11-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -72,14 +72,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '13.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '13.0'
 - !ruby/object:Gem::Dependency
   name: webmock
   requirement: !ruby/object:Gem::Requirement
@@ -128,14 +128,14 @@ dependencies:
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.10'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.10'
 description: Finds a website's broken links using the 'wgit' gem and reports back
   to you with a summary.
 email: michael.telford@live.com
@@ -159,11 +159,13 @@ files:
 - exe/broken_link_finder
 - lib/broken_link_finder.rb
 - lib/broken_link_finder/finder.rb
+- lib/broken_link_finder/link_manager.rb
 - lib/broken_link_finder/reporter/html_reporter.rb
 - lib/broken_link_finder/reporter/reporter.rb
 - lib/broken_link_finder/reporter/text_reporter.rb
 - lib/broken_link_finder/version.rb
 - lib/broken_link_finder/wgit_extensions.rb
+- lib/broken_link_finder/xpath.rb
 - load.rb
 homepage: https://github.com/michaeltelford/broken-link-finder
 licenses:
@@ -180,17 +182,20 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - "
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '2.6'
+  - - "<"
     - !ruby/object:Gem::Version
-      version: '
+      version: '4'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.2.22
+signing_key:
 specification_version: 4
 summary: Finds a website's broken links and reports back to you with a summary.
 test_files: []