broken_link_finder 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +0 -2
- data/CHANGELOG.md +43 -0
- data/Gemfile.lock +10 -2
- data/README.md +28 -20
- data/benchmark.rb +18 -0
- data/bin/console +19 -1
- data/broken_link_finder.gemspec +2 -0
- data/exe/broken_link_finder +14 -3
- data/lib/broken_link_finder.rb +1 -0
- data/lib/broken_link_finder/finder.rb +83 -73
- data/lib/broken_link_finder/reporter.rb +113 -0
- data/lib/broken_link_finder/version.rb +1 -1
- metadata +33 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: be656fb33a6363c5da6bb1cd05e52f8bc9b1f7223825bd7dc4ada8af2bdea1d2
|
4
|
+
data.tar.gz: 2b4db95eaf086c10ac6f7528ec63c426ead3507f9adc4ecf0fd177c828d487f2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 59bdd686ff0cce9359e51415011ba0521404834790ba7eec304d29d085bbdfafb3c1c0f8fd63bd235d41c0aed61278a2211f05fdb0f3d9c15d31a53f1b18b877
|
7
|
+
data.tar.gz: 27986886e3fa6ab4123027ff3067633787470c00401001942f7b28f4ba87fcfe38a11513caa2ade8dc5bfed0efb8a9db4af9749130c8960ac7ccb84a169fc154
|
data/.gitignore
CHANGED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# Broken Link Finder Change Log
|
2
|
+
|
3
|
+
## v0.0.0 (TEMPLATE - DO NOT EDIT)
|
4
|
+
### Added
|
5
|
+
- ...
|
6
|
+
### Changed/Removed
|
7
|
+
- ...
|
8
|
+
### Fixed
|
9
|
+
- ...
|
10
|
+
---
|
11
|
+
|
12
|
+
## v0.7.0
|
13
|
+
### Added
|
14
|
+
- Added the `--verbose` flag to the executable for displaying all ignored links.
|
15
|
+
- Added the `--concise` flag to the executable for displaying the broken links in summary form.
|
16
|
+
- Added the `--sort-by-link` flag to the executable for displaying the broken links found and the pages containing that link (as opposed to sorting by page by default).
|
17
|
+
### Changed/Removed
|
18
|
+
- Changed the **default** sorting (format) for ignored links to be summarised (much more concise) reducing noise in the reports.
|
19
|
+
- Updated the `README.md` to reflect the new changes.
|
20
|
+
### Fixed
|
21
|
+
- Bug where the broken/ignored links weren't being ordered consistently between runs. Now, all links are reported alphabetically. This will change existing report formats.
|
22
|
+
- Bug where an anchor of `#` was being returned as broken when it shouldn't.
|
23
|
+
---
|
24
|
+
|
25
|
+
## v0.6.0
|
26
|
+
### Added
|
27
|
+
- Support for ignored links e.g. mailto's, tel's etc. The README has been updated.
|
28
|
+
### Changed/Removed
|
29
|
+
- Only HTML files now have their links verified, JS files for example, do not have their contents checked. This also boosts crawl speed.
|
30
|
+
- Links are now reported exactly as they appear in the HTML (for easier location after reading the reports).
|
31
|
+
### Fixed
|
32
|
+
- Links with anchors aren't regarded as separate pages during a crawl anymore, thus removing duplicate reports.
|
33
|
+
---
|
34
|
+
|
35
|
+
## v0.5.0
|
36
|
+
### Added
|
37
|
+
- Anchor support is now included meaning the response HTML must include an element with an ID matching that of the anchor in the link's URL; otherwise, it's regarded as broken. Previously, there was no anchor support.
|
38
|
+
- The README now includes a How It Works section detailing what constitutes a broken link. See this for more information.
|
39
|
+
### Changed/Removed
|
40
|
+
- Any element with a href or src attribute is now regarded as a link. Before it was just `<a>` elements.
|
41
|
+
### Fixed
|
42
|
+
- ...
|
43
|
+
---
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.
|
4
|
+
broken_link_finder (0.7.0)
|
5
5
|
thor (= 0.20.3)
|
6
6
|
thread (= 0.2)
|
7
7
|
wgit (= 0.0.13)
|
@@ -17,17 +17,23 @@ GEM
|
|
17
17
|
crack (0.4.3)
|
18
18
|
safe_yaml (~> 1.0.0)
|
19
19
|
hashdiff (0.4.0)
|
20
|
+
httplog (1.3.2)
|
21
|
+
rack (>= 1.0)
|
22
|
+
rainbow (>= 2.0.0)
|
23
|
+
memory_profiler (0.9.14)
|
20
24
|
method_source (0.9.2)
|
21
25
|
mini_portile2 (2.4.0)
|
22
26
|
minitest (5.11.3)
|
23
27
|
mongo (2.8.0)
|
24
28
|
bson (>= 4.4.2, < 5.0.0)
|
25
|
-
nokogiri (1.10.
|
29
|
+
nokogiri (1.10.4)
|
26
30
|
mini_portile2 (~> 2.4.0)
|
27
31
|
pry (0.12.2)
|
28
32
|
coderay (~> 1.1.0)
|
29
33
|
method_source (~> 0.9.0)
|
30
34
|
public_suffix (3.1.0)
|
35
|
+
rack (2.0.7)
|
36
|
+
rainbow (3.0.0)
|
31
37
|
rake (10.5.0)
|
32
38
|
safe_yaml (1.0.5)
|
33
39
|
thor (0.20.3)
|
@@ -47,6 +53,8 @@ DEPENDENCIES
|
|
47
53
|
broken_link_finder!
|
48
54
|
bundler (~> 2.0)
|
49
55
|
byebug (~> 11.0)
|
56
|
+
httplog (~> 1.3)
|
57
|
+
memory_profiler (~> 0.9)
|
50
58
|
minitest (~> 5.0)
|
51
59
|
pry (~> 0.12)
|
52
60
|
rake (~> 10.0)
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Broken Link Finder
|
2
2
|
|
3
|
-
Does what it says on the tin
|
3
|
+
Does what it says on the tin; Finds a website's broken links.
|
4
4
|
|
5
5
|
Simply point it at a website and it will crawl all of its webpages searching for and identifying any broken links. You will then be presented with a nice concise summary of the broken links found.
|
6
6
|
|
@@ -41,19 +41,25 @@ Or install it yourself as:
|
|
41
41
|
|
42
42
|
## Usage
|
43
43
|
|
44
|
+
You can check for broken links via the library or executable.
|
45
|
+
|
44
46
|
### Executable
|
45
47
|
|
46
48
|
Installing this gem installs the `broken_link_finder` executable into your `$PATH`. The executable allows you to find broken links from your command line. For example:
|
47
49
|
|
48
50
|
$ broken_link_finder crawl http://txti.es
|
49
51
|
|
50
|
-
Adding the `-r`
|
52
|
+
Adding the `-r` flag would crawl the entire `txti.es` site, not just its index page.
|
51
53
|
|
52
54
|
See the [output](#Output) section below for an example of a site with broken links.
|
53
55
|
|
56
|
+
You can peruse all of the available executable flags with:
|
57
|
+
|
58
|
+
$ broken_link_finder help crawl
|
59
|
+
|
54
60
|
### Library
|
55
61
|
|
56
|
-
Below is a simple script which crawls a website and outputs
|
62
|
+
Below is a simple script which crawls a website and outputs its broken links to `STDOUT`:
|
57
63
|
|
58
64
|
> main.rb
|
59
65
|
|
@@ -61,8 +67,8 @@ Below is a simple script which crawls a website and outputs it's broken links to
|
|
61
67
|
require 'broken_link_finder'
|
62
68
|
|
63
69
|
finder = BrokenLinkFinder.new
|
64
|
-
finder.crawl_site
|
65
|
-
finder.
|
70
|
+
finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
|
71
|
+
finder.pretty_print_link_report # Or use Finder#broken_links and Finder#ignored_links
|
66
72
|
# for direct access to the link Hashes.
|
67
73
|
```
|
68
74
|
|
@@ -70,24 +76,26 @@ Then execute the script with:
|
|
70
76
|
|
71
77
|
$ ruby main.rb
|
72
78
|
|
79
|
+
See the full source code documentation [here](https://www.rubydoc.info/gems/broken_link_finder).
|
80
|
+
|
73
81
|
## Output
|
74
82
|
|
75
83
|
If broken links are found then the output will look something like:
|
76
84
|
|
77
85
|
```text
|
78
|
-
|
86
|
+
Found 6 broken link(s) across 2 page(s):
|
79
87
|
|
80
|
-
The following broken links
|
88
|
+
The following broken links were found on 'http://txti.es/about':
|
81
89
|
http://twitter.com/thebarrytone
|
82
90
|
http://twitter.com/nwbld
|
83
91
|
http://twitter.com/txties
|
84
92
|
https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU
|
85
93
|
|
86
|
-
The following broken links
|
94
|
+
The following broken links were found on 'http://txti.es/how':
|
87
95
|
http://en.wikipedia.org/wiki/Markdown
|
88
96
|
http://imgur.com
|
89
97
|
|
90
|
-
|
98
|
+
Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
|
91
99
|
|
92
100
|
The following links were ignored on http://txti.es:
|
93
101
|
tel:+13174562564
|
@@ -97,20 +105,20 @@ The following links were ignored on http://txti.es/contact:
|
|
97
105
|
ftp://server.com
|
98
106
|
```
|
99
107
|
|
100
|
-
## TODO
|
101
|
-
|
102
|
-
- Add logger functionality (especially useful in the console during development).
|
103
|
-
|
104
|
-
## Development
|
105
|
-
|
106
|
-
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bundle exec rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
107
|
-
|
108
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release[origin]`, which will create a git tag for the version, push git commits and tags, and push the `*.gem` file to [rubygems.org](https://rubygems.org).
|
109
|
-
|
110
108
|
## Contributing
|
111
109
|
|
112
|
-
Bug reports and
|
110
|
+
Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
|
113
111
|
|
114
112
|
## License
|
115
113
|
|
116
114
|
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
115
|
+
|
116
|
+
## Development
|
117
|
+
|
118
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bundle exec rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
119
|
+
|
120
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new gem version:
|
121
|
+
- Update the version number in `version.rb`
|
122
|
+
- Run `bundle install`
|
123
|
+
- Run `bundle exec rake test` ensuring all tests pass
|
124
|
+
- Run `bundle exec rake release[origin]`
|
data/benchmark.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'broken_link_finder'
|
2
|
+
require 'benchmark'
|
3
|
+
require 'memory_profiler'
|
4
|
+
|
5
|
+
url = ARGV[0] || "http://txti.es"
|
6
|
+
finder = BrokenLinkFinder::Finder.new
|
7
|
+
|
8
|
+
puts Benchmark.measure { finder.crawl_page url }
|
9
|
+
puts Benchmark.measure { finder.crawl_site url }
|
10
|
+
|
11
|
+
# http://txti.es
|
12
|
+
# Pre threading: 17.591528
|
13
|
+
# Post threading: 7.508828 :-)
|
14
|
+
|
15
|
+
# http://txti.es
|
16
|
+
# Page: 9.526981
|
17
|
+
# Site: 9.732416
|
18
|
+
# Multi-threading crawl_site now yields the same time as a single page.
|
data/bin/console
CHANGED
@@ -2,8 +2,24 @@
|
|
2
2
|
|
3
3
|
require "bundler/setup"
|
4
4
|
require "pry"
|
5
|
+
require "byebug"
|
5
6
|
require "broken_link_finder"
|
6
7
|
require 'wgit/core_ext'
|
8
|
+
require 'httplog'
|
9
|
+
|
10
|
+
# Monkey patch all Net::HTTP network calls and log them.
|
11
|
+
HttpLog.configure do |config|
|
12
|
+
config.log_connect = false
|
13
|
+
config.log_request = true
|
14
|
+
config.log_headers = false
|
15
|
+
config.log_data = false
|
16
|
+
config.log_status = true
|
17
|
+
config.log_response = false
|
18
|
+
config.log_benchmark = true
|
19
|
+
|
20
|
+
config.compact_log = true
|
21
|
+
config.json_log = true
|
22
|
+
end
|
7
23
|
|
8
24
|
# Call reload to load all recent code changes.
|
9
25
|
def reload
|
@@ -18,6 +34,8 @@ end
|
|
18
34
|
# You can add fixtures and/or initialization code here...
|
19
35
|
reload
|
20
36
|
url = "http://txti.es/"
|
21
|
-
|
37
|
+
by_page = Finder.new
|
38
|
+
by_link = Finder.new sort: :link
|
39
|
+
finder = by_page
|
22
40
|
|
23
41
|
binding.pry
|
data/broken_link_finder.gemspec
CHANGED
@@ -42,6 +42,8 @@ Gem::Specification.new do |spec|
|
|
42
42
|
spec.add_development_dependency "pry", "~> 0.12"
|
43
43
|
spec.add_development_dependency "byebug", "~> 11.0"
|
44
44
|
spec.add_development_dependency "webmock", "~> 3.5"
|
45
|
+
spec.add_development_dependency "httplog", "~> 1.3"
|
46
|
+
spec.add_development_dependency "memory_profiler", "~> 0.9"
|
45
47
|
|
46
48
|
spec.add_runtime_dependency "wgit", "0.0.13"
|
47
49
|
spec.add_runtime_dependency "thread", "0.2"
|
data/exe/broken_link_finder
CHANGED
@@ -6,12 +6,23 @@ require 'thor'
|
|
6
6
|
|
7
7
|
class BrokenLinkFinderCLI < Thor
|
8
8
|
desc 'crawl [URL]', 'Find broken links at the URL'
|
9
|
-
option :recursive, type: :boolean, aliases: [:r], desc: 'Crawl the entire site'
|
9
|
+
option :recursive, type: :boolean, aliases: [:r], desc: 'Crawl the entire site.'
|
10
|
+
option :sort_by_link, type: :boolean, aliases: [:l], desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
|
11
|
+
option :verbose, type: :boolean, aliases: [:v], desc: 'Display all ignored links.'
|
12
|
+
option :concise, type: :boolean, aliases: [:c], desc: 'Display only a summary of broken links.'
|
10
13
|
def crawl(url)
  # Only prepend a scheme when the URL doesn't already start with http:// or
  # https:// (case insensitive). A bare start_with?('http') check wrongly
  # treats hosts such as 'httpbin.org' as already having a scheme.
  url = "http://#{url}" unless url =~ %r{\Ahttps?://}i

  sort_by         = options[:sort_by_link] ? :link : :page
  broken_verbose  = !options[:concise]  # --concise summarises broken links.
  ignored_verbose = !!options[:verbose] # --verbose lists every ignored link.

  finder = BrokenLinkFinder::Finder.new(sort: sort_by)
  options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
  finder.pretty_print_link_report(
    broken_verbose: broken_verbose,
    ignored_verbose: ignored_verbose
  )
end
|
16
27
|
end
|
17
28
|
|
data/lib/broken_link_finder.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
|
+
require_relative 'reporter'
|
1
2
|
require 'wgit'
|
2
3
|
require 'thread/pool'
|
3
4
|
|
4
5
|
module BrokenLinkFinder
|
5
6
|
# Alias for BrokenLinkFinder::Finder.new, don't use this if you want to
|
6
7
|
# override the max_threads variable.
|
7
|
-
def self.new
|
8
|
-
Finder.new
|
8
|
+
def self.new(sort: :page)
|
9
|
+
Finder.new(sort: sort)
|
9
10
|
end
|
10
11
|
|
11
12
|
class Finder
|
@@ -13,11 +14,17 @@ module BrokenLinkFinder
|
|
13
14
|
|
14
15
|
attr_reader :broken_links, :ignored_links
|
15
16
|
|
16
|
-
#
|
17
|
-
def initialize(max_threads: DEFAULT_MAX_THREADS)
|
17
|
+
# Creates a new Finder instance.
#
# sort:        either :page or :link; decides whether the link Hashes are
#              keyed by page URL or by link. Anything else raises.
# max_threads: stored in @max_threads (presumably the thread pool size used
#              when crawling a site - confirm against the pool construction).
def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
  raise "sort by either :page or :link, not #{sort}" unless %i[page link].include?(sort)

  @sort, @max_threads = sort, max_threads
  @lock    = Mutex.new # Guards writes to the link Hashes.
  @crawler = Wgit::Crawler.new

  # Start with empty link collections.
  clear_links
end
|
23
30
|
|
@@ -27,6 +34,24 @@ module BrokenLinkFinder
|
|
27
34
|
@ignored_links = {}
|
28
35
|
end
|
29
36
|
|
37
|
+
# Finds the broken links within a single page. Any previously collected
# links are cleared first, then the results are stored (sorted) in the
# @broken_links and @ignored_links Hashes.
# Returns true if at least one broken link was found.
# Raises if the page at url can't be crawled.
# Access the broken links with Finder#broken_links.
def crawl_url(url)
  clear_links
  page_url = Wgit::Url.new(url)

  # A falsy crawl result is treated as an invalid page URL.
  doc = @crawler.crawl_url(page_url)
  raise "Invalid URL: #{page_url}" if !doc

  # Check every link on the page, then order the results for reporting.
  find_broken_links(doc)
  sort_links

  @broken_links.any?
end
|
54
|
+
|
30
55
|
# Finds broken links within an entire site and appends them to the
|
31
56
|
# @broken_links Hash. Returns a tuple containing a Boolean of true if
|
32
57
|
# at least one broken link was found and an Array of all pages crawled.
|
@@ -41,10 +66,6 @@ module BrokenLinkFinder
|
|
41
66
|
@crawler.crawl_site(url) do |doc|
|
42
67
|
# Ensure the given website url is valid.
|
43
68
|
raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
|
44
|
-
|
45
|
-
# Ensure we only process each page once. For example, /about.html might
|
46
|
-
# be linked to several times throughout the entire site.
|
47
|
-
next if crawled_pages.include?(doc.url)
|
48
69
|
crawled_pages << doc.url
|
49
70
|
|
50
71
|
# Get all page links and determine which are broken.
|
@@ -52,65 +73,26 @@ module BrokenLinkFinder
|
|
52
73
|
pool.process { find_broken_links(doc) }
|
53
74
|
end
|
54
75
|
|
55
|
-
pool.shutdown
|
76
|
+
pool.shutdown # Wait for all threads to finish.
|
77
|
+
sort_links
|
56
78
|
[@broken_links.any?, crawled_pages]
|
57
79
|
end
|
58
80
|
|
59
|
-
#
|
60
|
-
#
|
61
|
-
# Access the broken links with Finder#broken_links.
|
62
|
-
def crawl_url(url)
|
63
|
-
clear_links
|
64
|
-
url = Wgit::Url.new(url)
|
65
|
-
|
66
|
-
# Ensure the given page url is valid.
|
67
|
-
doc = @crawler.crawl_url(url)
|
68
|
-
raise "Invalid URL: #{url}" unless doc
|
69
|
-
|
70
|
-
# Get all page links and determine which are broken.
|
71
|
-
find_broken_links(doc)
|
72
|
-
|
73
|
-
@broken_links.any?
|
74
|
-
end
|
75
|
-
|
76
|
-
# Pretty prints the link summary into a stream e.g. Kernel
|
77
|
-
# (STDOUT) or a file - anything that respond_to? :puts.
|
81
|
+
# Pretty prints the link report into a stream e.g. STDOUT or a file,
|
82
|
+
# anything that respond_to? :puts. Defaults to STDOUT.
|
78
83
|
# Returns true if there were broken links and vice versa.
|
79
|
-
def
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
stream
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
@broken_links.each do |page, links|
|
92
|
-
stream.puts("The following broken links exist on #{page}:")
|
93
|
-
links.each do |link|
|
94
|
-
stream.puts(link)
|
95
|
-
end
|
96
|
-
stream.puts("")
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
100
|
-
# Ignored link summary.
|
101
|
-
if @ignored_links.any?
|
102
|
-
stream.puts("Below is a breakdown of the non supported links found, \
|
103
|
-
you should check these manually:")
|
104
|
-
stream.puts("")
|
105
|
-
|
106
|
-
@ignored_links.each do |page, links|
|
107
|
-
stream.puts("The following links were ignored on #{page}:")
|
108
|
-
links.each do |link|
|
109
|
-
stream.puts(link)
|
110
|
-
end
|
111
|
-
stream.puts("")
|
112
|
-
end
|
113
|
-
end
|
84
|
+
def pretty_print_link_report(stream = STDOUT, broken_verbose: true, ignored_verbose: false)
  # Verbosity flags are handed to the Reporter unchanged.
  verbosity = { broken_verbose: broken_verbose, ignored_verbose: ignored_verbose }

  # The Reporter does the formatting/printing of the collected links.
  BrokenLinkFinder::Reporter
    .new(stream, @sort, @broken_links, @ignored_links)
    .pretty_print_link_report(**verbosity)

  @broken_links.any?
end
|
@@ -140,37 +122,65 @@ you should check these manually:")
|
|
140
122
|
append_broken_link(doc.url, link)
|
141
123
|
end
|
142
124
|
end
|
125
|
+
|
126
|
+
nil
|
143
127
|
end
|
144
128
|
|
145
129
|
# Returns true if the link is/contains a broken anchor, i.e. doc's URL has
# an anchor but doc contains no element whose id equals that anchor.
# A missing anchor, or an anchor of just '#', is never regarded as broken.
# Raises if doc is nil.
def has_broken_anchor(doc)
  raise "link document is nil" unless doc

  anchor = doc.url.anchor
  return false if anchor.nil? || anchor == '#'

  # Drop the leading '#' so the anchor can be compared against element ids.
  anchor = anchor[1..-1] if anchor.start_with?('#')

  # The anchor comes from crawled HTML, so quote it safely for XPath rather
  # than interpolating it raw (a single quote would malform the query).
  doc.xpath("//*[@id=#{xpath_string_literal(anchor)}]").empty?
end

# Returns text quoted as an XPath 1.0 string literal. XPath 1.0 has no escape
# character, so text containing both quote types is built with concat().
def xpath_string_literal(text)
  return "'#{text}'" unless text.include?("'")
  return "\"#{text}\"" unless text.include?('"')

  parts = text.split("'", -1).map { |part| "'#{part}'" }
  "concat(#{parts.join(%q{, "'", })})"
end
|
153
139
|
|
154
|
-
# Append
|
140
|
+
# Append key => [value] to @broken_links.
# Whether url or link is the key depends on get_key_value. The write is done
# while holding @lock. Returns the Array of values stored under the key.
def append_broken_link(url, link)
  key, value = get_key_value(url, link)

  @lock.synchronize do
    # Create the Array on first use of this key, then append to it.
    (@broken_links[key] ||= []) << value
  end
end
|
163
150
|
|
164
|
-
# Append
|
151
|
+
# Append key => [value] to @ignored_links.
# Whether url or link is the key depends on get_key_value. The write is done
# while holding @lock. Returns the Array of values stored under the key.
def append_ignored_link(url, link)
  key, value = get_key_value(url, link)

  @lock.synchronize do
    # Create the Array on first use of this key, then append to it.
    (@ignored_links[key] ||= []) << value
  end
end
|
173
161
|
|
162
|
+
# Returns the [key, value] pair to store for a link found on a page,
# depending on the @sort type:
#   @sort == :page => [url, link]
#   @sort == :link => [link, url]
# Raises for any other @sort value.
def get_key_value(url, link)
  case @sort
  when :page then [url, link]
  when :link then [link, url]
  else
    # Interpolate @sort; there's no local `sort` in scope here.
    raise "Unsupported sort type: #{@sort}"
  end
end
|
173
|
+
|
174
|
+
# Sort keys and values alphabetically.
# Both link Hashes are rebuilt with their keys in sorted order and each value
# Array is then sorted in place. Returns the sorted @ignored_links Hash.
def sort_links
  @broken_links  = @broken_links.sort_by(&:first).to_h
  @ignored_links = @ignored_links.sort_by(&:first).to_h

  @broken_links.each_value(&:sort!)
  @ignored_links.each_value(&:sort!)

  @ignored_links
end
|
182
|
+
|
174
183
|
alias_method :crawl_page, :crawl_url
|
184
|
+
alias_method :pretty_print_link_summary, :pretty_print_link_report
|
175
185
|
end
|
176
186
|
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module BrokenLinkFinder
  # Prints a human readable report of broken and ignored links to a stream.
  # The link Hashes are expected in the format { 'key' => ['value', ...] },
  # keyed by page when sort is :page and keyed by link when sort is :link.
  class Reporter
    # The amount of pages/links to display when verbose is false.
    NUM_VALUES = 3

    # Creates a new Reporter instance.
    # stream is any Object that responds to :puts.
    # sort must be either :page or :link; anything else raises.
    def initialize(stream, sort, broken_links, ignored_links)
      raise "stream must respond_to? :puts" unless stream.respond_to?(:puts)
      unless [:page, :link].include?(sort)
        raise "sort by either :page or :link, not #{sort}"
      end

      @stream        = stream
      @sort          = sort
      @broken_links  = broken_links
      @ignored_links = ignored_links
    end

    # Pretty print a report detailing the link summary.
    # When a verbose flag is false, at most NUM_VALUES values are printed per
    # key, followed by a count of the rest. Returns nil.
    def pretty_print_link_report(broken_verbose: true, ignored_verbose: false)
      report_broken_links(verbose: broken_verbose)
      report_ignored_links(verbose: ignored_verbose)

      nil
    end

    private

    # Report a summary of the broken links.
    def report_broken_links(verbose: true)
      if @broken_links.empty?
        print "Good news, there are no broken links!"
        return
      end

      num_pages, num_links = get_hash_stats(@broken_links)
      print "Found #{num_links} broken link(s) across #{num_pages} page(s):"

      @broken_links.each do |key, values|
        msg = sort_by_page? ?
          "The following broken links were found on '#{key}':" :
          "The broken link '#{key}' was found on the following pages:"
        nprint msg

        report_values(values, verbose, 'remove --concise to see them all')
      end
    end

    # Report a summary of the ignored links. Prints nothing if there are none.
    def report_ignored_links(verbose: false)
      return unless @ignored_links.any?

      num_pages, num_links = get_hash_stats(@ignored_links)
      nprint "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:"

      @ignored_links.each do |key, values|
        msg = sort_by_page? ?
          "The following links were ignored on '#{key}':" :
          "The link '#{key}' was ignored on the following pages:"
        nprint msg

        report_values(values, verbose, 'use --verbose to see them all')
      end
    end

    # Prints every value when verbose (or when there are NUM_VALUES or less);
    # otherwise prints the first NUM_VALUES values and summarises the rest,
    # ending the summary line with hint.
    def report_values(values, verbose, hint)
      if verbose || values.length <= NUM_VALUES
        values.each { |value| print value }
        return
      end

      values.first(NUM_VALUES).each { |value| print value }

      objects = sort_by_page? ? 'link(s)' : 'page(s)'
      print "+ #{values.length - NUM_VALUES} other #{objects}, #{hint}"
    end

    # Return true if the sort is by page.
    def sort_by_page?
      @sort == :page
    end

    # Returns the key/value statistics of hash as [num_pages, num_links].
    # The hash should be of the format: { 'str' => [...] }.
    # When sorting by link the values are pages, so they're counted uniquely.
    # Use like: `num_pages, num_links = get_hash_stats(links)`.
    def get_hash_stats(hash)
      num_keys   = hash.keys.length
      values     = hash.values.flatten
      num_values = sort_by_page? ? values.length : values.uniq.length

      sort_by_page? ?
        [num_keys, num_values] :
        [num_values, num_keys]
    end

    # Prints the text + \n. Defaults to a blank line.
    def print(text = '')
      @stream.puts(text)
    end

    # Prints text + \n\n.
    def printn(text)
      print(text)
      print
    end

    # Prints \n + text + \n.
    def nprint(text)
      print
      print(text)
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,6 +94,34 @@ dependencies:
|
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '3.5'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: httplog
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '1.3'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '1.3'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: memory_profiler
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0.9'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0.9'
|
97
125
|
- !ruby/object:Gem::Dependency
|
98
126
|
name: wgit
|
99
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -147,17 +175,20 @@ files:
|
|
147
175
|
- ".gitignore"
|
148
176
|
- ".ruby-version"
|
149
177
|
- ".travis.yml"
|
178
|
+
- CHANGELOG.md
|
150
179
|
- Gemfile
|
151
180
|
- Gemfile.lock
|
152
181
|
- LICENSE.txt
|
153
182
|
- README.md
|
154
183
|
- Rakefile
|
184
|
+
- benchmark.rb
|
155
185
|
- bin/console
|
156
186
|
- bin/setup
|
157
187
|
- broken_link_finder.gemspec
|
158
188
|
- exe/broken_link_finder
|
159
189
|
- lib/broken_link_finder.rb
|
160
190
|
- lib/broken_link_finder/finder.rb
|
191
|
+
- lib/broken_link_finder/reporter.rb
|
161
192
|
- lib/broken_link_finder/version.rb
|
162
193
|
- lib/broken_link_finder/wgit_extensions.rb
|
163
194
|
- load.rb
|