broken_link_finder 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -2
- data/CHANGELOG.md +43 -0
- data/Gemfile.lock +10 -2
- data/README.md +28 -20
- data/benchmark.rb +18 -0
- data/bin/console +19 -1
- data/broken_link_finder.gemspec +2 -0
- data/exe/broken_link_finder +14 -3
- data/lib/broken_link_finder.rb +1 -0
- data/lib/broken_link_finder/finder.rb +83 -73
- data/lib/broken_link_finder/reporter.rb +113 -0
- data/lib/broken_link_finder/version.rb +1 -1
- metadata +33 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: be656fb33a6363c5da6bb1cd05e52f8bc9b1f7223825bd7dc4ada8af2bdea1d2
|
4
|
+
data.tar.gz: 2b4db95eaf086c10ac6f7528ec63c426ead3507f9adc4ecf0fd177c828d487f2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 59bdd686ff0cce9359e51415011ba0521404834790ba7eec304d29d085bbdfafb3c1c0f8fd63bd235d41c0aed61278a2211f05fdb0f3d9c15d31a53f1b18b877
|
7
|
+
data.tar.gz: 27986886e3fa6ab4123027ff3067633787470c00401001942f7b28f4ba87fcfe38a11513caa2ade8dc5bfed0efb8a9db4af9749130c8960ac7ccb84a169fc154
|
data/.gitignore
CHANGED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# Broken Link Finder Change Log
|
2
|
+
|
3
|
+
## v0.0.0 (TEMPLATE - DO NOT EDIT)
|
4
|
+
### Added
|
5
|
+
- ...
|
6
|
+
### Changed/Removed
|
7
|
+
- ...
|
8
|
+
### Fixed
|
9
|
+
- ...
|
10
|
+
---
|
11
|
+
|
12
|
+
## v0.7.0
|
13
|
+
### Added
|
14
|
+
- Added the `--verbose` flag to the executable for displaying all ignored links.
|
15
|
+
- Added the `--concise` flag to the executable for displaying the broken links in summary form.
|
16
|
+
- Added the `--sort-by-link` flag to the executable for displaying the broken links found and the pages containing that link (as opposed to sorting by page by default).
|
17
|
+
### Changed/Removed
|
18
|
+
- Changed the **default** sorting (format) for ignored links to be summarised (much more concise) reducing noise in the reports.
|
19
|
+
- Updated the `README.md` to reflect the new changes.
|
20
|
+
### Fixed
|
21
|
+
- Bug where the broken/ignored links weren't being ordered consistently between runs. Now, all links are reported alphabetically. This will change existing report formats.
|
22
|
+
- Bug where an anchor of `#` was being returned as broken when it shouldn't.
|
23
|
+
---
|
24
|
+
|
25
|
+
## v0.6.0
|
26
|
+
### Added
|
27
|
+
- Support for ignored links e.g. mailto's, tel's etc. The README has been updated.
|
28
|
+
### Changed/Removed
|
29
|
+
- Only HTML files now have their links verified, JS files for example, do not have their contents checked. This also boosts crawl speed.
|
30
|
+
- Links are now reported exactly as they appear in the HTML (for easier location after reading the reports).
|
31
|
+
### Fixed
|
32
|
+
- Links with anchors aren't regarded as separate pages during a crawl anymore, thus removing duplicate reports.
|
33
|
+
---
|
34
|
+
|
35
|
+
## v0.5.0
|
36
|
+
### Added
|
37
|
+
- Anchor support is now included meaning the response HTML must include an element with an ID matching that of the anchor in the link's URL; otherwise, it's regarded as broken. Previously, there was no anchor support.
|
38
|
+
- The README now includes a How It Works section detailing what constitutes a broken link. See this for more information.
|
39
|
+
### Changed/Removed
|
40
|
+
- Any element with a href or src attribute is now regarded as a link. Before it was just `<a>` elements.
|
41
|
+
### Fixed
|
42
|
+
- ...
|
43
|
+
---
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.
|
4
|
+
broken_link_finder (0.7.0)
|
5
5
|
thor (= 0.20.3)
|
6
6
|
thread (= 0.2)
|
7
7
|
wgit (= 0.0.13)
|
@@ -17,17 +17,23 @@ GEM
|
|
17
17
|
crack (0.4.3)
|
18
18
|
safe_yaml (~> 1.0.0)
|
19
19
|
hashdiff (0.4.0)
|
20
|
+
httplog (1.3.2)
|
21
|
+
rack (>= 1.0)
|
22
|
+
rainbow (>= 2.0.0)
|
23
|
+
memory_profiler (0.9.14)
|
20
24
|
method_source (0.9.2)
|
21
25
|
mini_portile2 (2.4.0)
|
22
26
|
minitest (5.11.3)
|
23
27
|
mongo (2.8.0)
|
24
28
|
bson (>= 4.4.2, < 5.0.0)
|
25
|
-
nokogiri (1.10.
|
29
|
+
nokogiri (1.10.4)
|
26
30
|
mini_portile2 (~> 2.4.0)
|
27
31
|
pry (0.12.2)
|
28
32
|
coderay (~> 1.1.0)
|
29
33
|
method_source (~> 0.9.0)
|
30
34
|
public_suffix (3.1.0)
|
35
|
+
rack (2.0.7)
|
36
|
+
rainbow (3.0.0)
|
31
37
|
rake (10.5.0)
|
32
38
|
safe_yaml (1.0.5)
|
33
39
|
thor (0.20.3)
|
@@ -47,6 +53,8 @@ DEPENDENCIES
|
|
47
53
|
broken_link_finder!
|
48
54
|
bundler (~> 2.0)
|
49
55
|
byebug (~> 11.0)
|
56
|
+
httplog (~> 1.3)
|
57
|
+
memory_profiler (~> 0.9)
|
50
58
|
minitest (~> 5.0)
|
51
59
|
pry (~> 0.12)
|
52
60
|
rake (~> 10.0)
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Broken Link Finder
|
2
2
|
|
3
|
-
Does what it says on the tin
|
3
|
+
Does what it says on the tin: finds a website's broken links.
|
4
4
|
|
5
5
|
Simply point it at a website and it will crawl all of its webpages searching for and identifying any broken links. You will then be presented with a nice concise summary of the broken links found.
|
6
6
|
|
@@ -41,19 +41,25 @@ Or install it yourself as:
|
|
41
41
|
|
42
42
|
## Usage
|
43
43
|
|
44
|
+
You can check for broken links via the library or executable.
|
45
|
+
|
44
46
|
### Executable
|
45
47
|
|
46
48
|
Installing this gem installs the `broken_link_finder` executable into your `$PATH`. The executable allows you to find broken links from your command line. For example:
|
47
49
|
|
48
50
|
$ broken_link_finder crawl http://txti.es
|
49
51
|
|
50
|
-
Adding the `-r`
|
52
|
+
Adding the `-r` flag would crawl the entire `txti.es` site, not just its index page.
|
51
53
|
|
52
54
|
See the [output](#Output) section below for an example of a site with broken links.
|
53
55
|
|
56
|
+
You can peruse all of the available executable flags with:
|
57
|
+
|
58
|
+
$ broken_link_finder help crawl
|
59
|
+
|
54
60
|
### Library
|
55
61
|
|
56
|
-
Below is a simple script which crawls a website and outputs
|
62
|
+
Below is a simple script which crawls a website and outputs its broken links to `STDOUT`:
|
57
63
|
|
58
64
|
> main.rb
|
59
65
|
|
@@ -61,8 +67,8 @@ Below is a simple script which crawls a website and outputs it's broken links to
|
|
61
67
|
require 'broken_link_finder'
|
62
68
|
|
63
69
|
finder = BrokenLinkFinder.new
|
64
|
-
finder.crawl_site
|
65
|
-
finder.
|
70
|
+
finder.crawl_site 'http://txti.es' # Or use Finder#crawl_page for a single webpage.
|
71
|
+
finder.pretty_print_link_report # Or use Finder#broken_links and Finder#ignored_links
|
66
72
|
# for direct access to the link Hashes.
|
67
73
|
```
|
68
74
|
|
@@ -70,24 +76,26 @@ Then execute the script with:
|
|
70
76
|
|
71
77
|
$ ruby main.rb
|
72
78
|
|
79
|
+
See the full source code documentation [here](https://www.rubydoc.info/gems/broken_link_finder).
|
80
|
+
|
73
81
|
## Output
|
74
82
|
|
75
83
|
If broken links are found then the output will look something like:
|
76
84
|
|
77
85
|
```text
|
78
|
-
|
86
|
+
Found 6 broken link(s) across 2 page(s):
|
79
87
|
|
80
|
-
The following broken links
|
88
|
+
The following broken links were found on 'http://txti.es/about':
|
81
89
|
http://twitter.com/thebarrytone
|
82
90
|
http://twitter.com/nwbld
|
83
91
|
http://twitter.com/txties
|
84
92
|
https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FBUU
|
85
93
|
|
86
|
-
The following broken links
|
94
|
+
The following broken links were found on 'http://txti.es/how':
|
87
95
|
http://en.wikipedia.org/wiki/Markdown
|
88
96
|
http://imgur.com
|
89
97
|
|
90
|
-
|
98
|
+
Ignored 3 unsupported link(s) across 2 page(s), which you should check manually:
|
91
99
|
|
92
100
|
The following links were ignored on http://txti.es:
|
93
101
|
tel:+13174562564
|
@@ -97,20 +105,20 @@ The following links were ignored on http://txti.es/contact:
|
|
97
105
|
ftp://server.com
|
98
106
|
```
|
99
107
|
|
100
|
-
## TODO
|
101
|
-
|
102
|
-
- Add logger functionality (especially useful in the console during development).
|
103
|
-
|
104
|
-
## Development
|
105
|
-
|
106
|
-
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bundle exec rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
107
|
-
|
108
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release[origin]`, which will create a git tag for the version, push git commits and tags, and push the `*.gem` file to [rubygems.org](https://rubygems.org).
|
109
|
-
|
110
108
|
## Contributing
|
111
109
|
|
112
|
-
Bug reports and
|
110
|
+
Bug reports and feature requests are welcome on [GitHub](https://github.com/michaeltelford/broken-link-finder). Just raise an issue.
|
113
111
|
|
114
112
|
## License
|
115
113
|
|
116
114
|
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
115
|
+
|
116
|
+
## Development
|
117
|
+
|
118
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bundle exec rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
119
|
+
|
120
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new gem version:
|
121
|
+
- Update the version number in `version.rb`
|
122
|
+
- Run `bundle install`
|
123
|
+
- Run `bundle exec rake test` ensuring all tests pass
|
124
|
+
- Run `bundle exec rake release[origin]`
|
data/benchmark.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'broken_link_finder'
|
2
|
+
require 'benchmark'
|
3
|
+
require 'memory_profiler'
|
4
|
+
|
5
|
+
url = ARGV[0] || "http://txti.es"
|
6
|
+
finder = BrokenLinkFinder::Finder.new
|
7
|
+
|
8
|
+
puts Benchmark.measure { finder.crawl_page url }
|
9
|
+
puts Benchmark.measure { finder.crawl_site url }
|
10
|
+
|
11
|
+
# http://txti.es
|
12
|
+
# Pre threading: 17.591528
|
13
|
+
# Post threading: 7.508828 :-)
|
14
|
+
|
15
|
+
# http://txti.es
|
16
|
+
# Page: 9.526981
|
17
|
+
# Site: 9.732416
|
18
|
+
# Multi-threading crawl_site now yields the same time as a single page.
|
data/bin/console
CHANGED
@@ -2,8 +2,24 @@
|
|
2
2
|
|
3
3
|
require "bundler/setup"
|
4
4
|
require "pry"
|
5
|
+
require "byebug"
|
5
6
|
require "broken_link_finder"
|
6
7
|
require 'wgit/core_ext'
|
8
|
+
require 'httplog'
|
9
|
+
|
10
|
+
# Monkey patch all Net::HTTP network calls and log them.
|
11
|
+
HttpLog.configure do |config|
|
12
|
+
config.log_connect = false
|
13
|
+
config.log_request = true
|
14
|
+
config.log_headers = false
|
15
|
+
config.log_data = false
|
16
|
+
config.log_status = true
|
17
|
+
config.log_response = false
|
18
|
+
config.log_benchmark = true
|
19
|
+
|
20
|
+
config.compact_log = true
|
21
|
+
config.json_log = true
|
22
|
+
end
|
7
23
|
|
8
24
|
# Call reload to load all recent code changes.
|
9
25
|
def reload
|
@@ -18,6 +34,8 @@ end
|
|
18
34
|
# You can add fixtures and/or initialization code here...
|
19
35
|
reload
|
20
36
|
url = "http://txti.es/"
|
21
|
-
|
37
|
+
by_page = Finder.new
|
38
|
+
by_link = Finder.new sort: :link
|
39
|
+
finder = by_page
|
22
40
|
|
23
41
|
binding.pry
|
data/broken_link_finder.gemspec
CHANGED
@@ -42,6 +42,8 @@ Gem::Specification.new do |spec|
|
|
42
42
|
spec.add_development_dependency "pry", "~> 0.12"
|
43
43
|
spec.add_development_dependency "byebug", "~> 11.0"
|
44
44
|
spec.add_development_dependency "webmock", "~> 3.5"
|
45
|
+
spec.add_development_dependency "httplog", "~> 1.3"
|
46
|
+
spec.add_development_dependency "memory_profiler", "~> 0.9"
|
45
47
|
|
46
48
|
spec.add_runtime_dependency "wgit", "0.0.13"
|
47
49
|
spec.add_runtime_dependency "thread", "0.2"
|
data/exe/broken_link_finder
CHANGED
@@ -6,12 +6,23 @@ require 'thor'
|
|
6
6
|
|
7
7
|
class BrokenLinkFinderCLI < Thor
|
8
8
|
desc 'crawl [URL]', 'Find broken links at the URL'
|
9
|
-
option :recursive, type: :boolean, aliases: [:r], desc: 'Crawl the entire site'
|
9
|
+
option :recursive, type: :boolean, aliases: [:r], desc: 'Crawl the entire site.'
|
10
|
+
option :sort_by_link, type: :boolean, aliases: [:l], desc: 'Makes report more concise if there are more pages crawled than broken links found. Use with -r on medium/large sites.'
|
11
|
+
option :verbose, type: :boolean, aliases: [:v], desc: 'Display all ignored links.'
|
12
|
+
option :concise, type: :boolean, aliases: [:c], desc: 'Display only a summary of broken links.'
|
10
13
|
def crawl(url)
|
11
14
|
url = "http://#{url}" unless url.start_with?('http')
|
12
|
-
|
15
|
+
|
16
|
+
sort_by = options[:sort_by_link] ? :link : :page
|
17
|
+
broken_verbose = options[:concise] ? false : true
|
18
|
+
ignored_verbose = options[:verbose] ? true : false
|
19
|
+
|
20
|
+
finder = BrokenLinkFinder::Finder.new(sort: sort_by)
|
13
21
|
options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
|
14
|
-
finder.
|
22
|
+
finder.pretty_print_link_report(
|
23
|
+
broken_verbose: broken_verbose,
|
24
|
+
ignored_verbose: ignored_verbose
|
25
|
+
)
|
15
26
|
end
|
16
27
|
end
|
17
28
|
|
data/lib/broken_link_finder.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
|
+
require_relative 'reporter'
|
1
2
|
require 'wgit'
|
2
3
|
require 'thread/pool'
|
3
4
|
|
4
5
|
module BrokenLinkFinder
|
5
6
|
# Alias for BrokenLinkFinder::Finder.new, don't use this if you want to
|
6
7
|
# override the max_threads variable.
|
7
|
-
def self.new
|
8
|
-
Finder.new
|
8
|
+
def self.new(sort: :page)
|
9
|
+
Finder.new(sort: sort)
|
9
10
|
end
|
10
11
|
|
11
12
|
class Finder
|
@@ -13,11 +14,17 @@ module BrokenLinkFinder
|
|
13
14
|
|
14
15
|
attr_reader :broken_links, :ignored_links
|
15
16
|
|
16
|
-
#
|
17
|
-
def initialize(max_threads: DEFAULT_MAX_THREADS)
|
17
|
+
# Creates a new Finder instance.
|
18
|
+
def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
19
|
+
unless [:page, :link].include?(sort)
|
20
|
+
raise "sort by either :page or :link, not #{sort}"
|
21
|
+
end
|
22
|
+
|
23
|
+
@sort = sort
|
18
24
|
@max_threads = max_threads
|
19
25
|
@lock = Mutex.new
|
20
26
|
@crawler = Wgit::Crawler.new
|
27
|
+
|
21
28
|
clear_links
|
22
29
|
end
|
23
30
|
|
@@ -27,6 +34,24 @@ module BrokenLinkFinder
|
|
27
34
|
@ignored_links = {}
|
28
35
|
end
|
29
36
|
|
37
|
+
# Finds broken links within a single page and appends them to the
|
38
|
+
# @broken_links array. Returns true if at least one broken link was found.
|
39
|
+
# Access the broken links with Finder#broken_links.
|
40
|
+
def crawl_url(url)
|
41
|
+
clear_links
|
42
|
+
url = Wgit::Url.new(url)
|
43
|
+
|
44
|
+
# Ensure the given page url is valid.
|
45
|
+
doc = @crawler.crawl_url(url)
|
46
|
+
raise "Invalid URL: #{url}" unless doc
|
47
|
+
|
48
|
+
# Get all page links and determine which are broken.
|
49
|
+
find_broken_links(doc)
|
50
|
+
|
51
|
+
sort_links
|
52
|
+
@broken_links.any?
|
53
|
+
end
|
54
|
+
|
30
55
|
# Finds broken links within an entire site and appends them to the
|
31
56
|
# @broken_links Hash. Returns a tuple containing a Boolean of true if
|
32
57
|
# at least one broken link was found and an Array of all pages crawled.
|
@@ -41,10 +66,6 @@ module BrokenLinkFinder
|
|
41
66
|
@crawler.crawl_site(url) do |doc|
|
42
67
|
# Ensure the given website url is valid.
|
43
68
|
raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
|
44
|
-
|
45
|
-
# Ensure we only process each page once. For example, /about.html might
|
46
|
-
# be linked to several times throughout the entire site.
|
47
|
-
next if crawled_pages.include?(doc.url)
|
48
69
|
crawled_pages << doc.url
|
49
70
|
|
50
71
|
# Get all page links and determine which are broken.
|
@@ -52,65 +73,26 @@ module BrokenLinkFinder
|
|
52
73
|
pool.process { find_broken_links(doc) }
|
53
74
|
end
|
54
75
|
|
55
|
-
pool.shutdown
|
76
|
+
pool.shutdown # Wait for all threads to finish.
|
77
|
+
sort_links
|
56
78
|
[@broken_links.any?, crawled_pages]
|
57
79
|
end
|
58
80
|
|
59
|
-
#
|
60
|
-
#
|
61
|
-
# Access the broken links with Finder#broken_links.
|
62
|
-
def crawl_url(url)
|
63
|
-
clear_links
|
64
|
-
url = Wgit::Url.new(url)
|
65
|
-
|
66
|
-
# Ensure the given page url is valid.
|
67
|
-
doc = @crawler.crawl_url(url)
|
68
|
-
raise "Invalid URL: #{url}" unless doc
|
69
|
-
|
70
|
-
# Get all page links and determine which are broken.
|
71
|
-
find_broken_links(doc)
|
72
|
-
|
73
|
-
@broken_links.any?
|
74
|
-
end
|
75
|
-
|
76
|
-
# Pretty prints the link summary into a stream e.g. Kernel
|
77
|
-
# (STDOUT) or a file - anything that respond_to? :puts.
|
81
|
+
# Pretty prints the link report into a stream e.g. STDOUT or a file,
|
82
|
+
# anything that respond_to? :puts. Defaults to STDOUT.
|
78
83
|
# Returns true if there were broken links and vice versa.
|
79
|
-
def
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
stream
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
@broken_links.each do |page, links|
|
92
|
-
stream.puts("The following broken links exist on #{page}:")
|
93
|
-
links.each do |link|
|
94
|
-
stream.puts(link)
|
95
|
-
end
|
96
|
-
stream.puts("")
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
100
|
-
# Ignored link summary.
|
101
|
-
if @ignored_links.any?
|
102
|
-
stream.puts("Below is a breakdown of the non supported links found, \
|
103
|
-
you should check these manually:")
|
104
|
-
stream.puts("")
|
105
|
-
|
106
|
-
@ignored_links.each do |page, links|
|
107
|
-
stream.puts("The following links were ignored on #{page}:")
|
108
|
-
links.each do |link|
|
109
|
-
stream.puts(link)
|
110
|
-
end
|
111
|
-
stream.puts("")
|
112
|
-
end
|
113
|
-
end
|
84
|
+
def pretty_print_link_report(
|
85
|
+
stream = STDOUT,
|
86
|
+
broken_verbose: true,
|
87
|
+
ignored_verbose: false
|
88
|
+
)
|
89
|
+
reporter = BrokenLinkFinder::Reporter.new(
|
90
|
+
stream, @sort, @broken_links, @ignored_links
|
91
|
+
)
|
92
|
+
reporter.pretty_print_link_report(
|
93
|
+
broken_verbose: broken_verbose,
|
94
|
+
ignored_verbose: ignored_verbose
|
95
|
+
)
|
114
96
|
|
115
97
|
@broken_links.any?
|
116
98
|
end
|
@@ -140,37 +122,65 @@ you should check these manually:")
|
|
140
122
|
append_broken_link(doc.url, link)
|
141
123
|
end
|
142
124
|
end
|
125
|
+
|
126
|
+
nil
|
143
127
|
end
|
144
128
|
|
145
129
|
# Returns true if the link is/contains a broken anchor.
|
146
130
|
def has_broken_anchor(doc)
|
147
131
|
raise "link document is nil" unless doc
|
148
|
-
return false unless doc.url.anchor
|
149
132
|
|
150
|
-
anchor = doc.url.anchor
|
133
|
+
anchor = doc.url.anchor
|
134
|
+
return false if anchor.nil? or anchor == '#'
|
135
|
+
|
136
|
+
anchor = anchor[1..-1] if anchor.start_with?('#')
|
151
137
|
doc.xpath("//*[@id='#{anchor}']").empty?
|
152
138
|
end
|
153
139
|
|
154
|
-
# Append
|
140
|
+
# Append key => [value] to @broken_links.
|
155
141
|
def append_broken_link(url, link)
|
142
|
+
key, value = get_key_value(url, link)
|
156
143
|
@lock.synchronize do
|
157
|
-
unless @broken_links[
|
158
|
-
@broken_links[
|
144
|
+
unless @broken_links[key]
|
145
|
+
@broken_links[key] = []
|
159
146
|
end
|
160
|
-
@broken_links[
|
147
|
+
@broken_links[key] << value
|
161
148
|
end
|
162
149
|
end
|
163
150
|
|
164
|
-
# Append
|
151
|
+
# Append key => [value] to @ignored_links.
|
165
152
|
def append_ignored_link(url, link)
|
153
|
+
key, value = get_key_value(url, link)
|
166
154
|
@lock.synchronize do
|
167
|
-
unless @ignored_links[
|
168
|
-
@ignored_links[
|
155
|
+
unless @ignored_links[key]
|
156
|
+
@ignored_links[key] = []
|
169
157
|
end
|
170
|
-
@ignored_links[
|
158
|
+
@ignored_links[key] << value
|
171
159
|
end
|
172
160
|
end
|
173
161
|
|
162
|
+
# Returns the correct key value depending on the @sort type.
|
163
|
+
# @sort == :page ? [url, link] : [link, url]
|
164
|
+
def get_key_value(url, link)
|
165
|
+
if @sort == :page
|
166
|
+
[url, link]
|
167
|
+
elsif @sort == :link
|
168
|
+
[link, url]
|
169
|
+
else
|
170
|
+
raise "Unsupported sort type: #{sort}"
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
# Sort keys and values alphabetically.
|
175
|
+
def sort_links
|
176
|
+
@broken_links = @broken_links.sort_by { |k, v| k }.to_h
|
177
|
+
@ignored_links = @ignored_links.sort_by { |k, v| k }.to_h
|
178
|
+
|
179
|
+
@broken_links.each { |k, v| v.sort! }
|
180
|
+
@ignored_links.each { |k, v| v.sort! }
|
181
|
+
end
|
182
|
+
|
174
183
|
alias_method :crawl_page, :crawl_url
|
184
|
+
alias_method :pretty_print_link_summary, :pretty_print_link_report
|
175
185
|
end
|
176
186
|
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module BrokenLinkFinder
|
2
|
+
class Reporter
|
3
|
+
# The number of pages/links to display when verbose is false.
|
4
|
+
NUM_VALUES = 3.freeze
|
5
|
+
|
6
|
+
# Creates a new Reporter instance.
|
7
|
+
# stream is any Object that responds to :puts.
|
8
|
+
def initialize(stream, sort, broken_links, ignored_links)
|
9
|
+
raise "stream must respond_to? :puts" unless stream.respond_to?(:puts)
|
10
|
+
unless [:page, :link].include?(sort)
|
11
|
+
raise "sort by either :page or :link, not #{sort}"
|
12
|
+
end
|
13
|
+
|
14
|
+
@stream = stream
|
15
|
+
@sort = sort
|
16
|
+
@broken_links = broken_links
|
17
|
+
@ignored_links = ignored_links
|
18
|
+
end
|
19
|
+
|
20
|
+
# Pretty print a report detailing the link summary.
|
21
|
+
def pretty_print_link_report(broken_verbose: true, ignored_verbose: false)
|
22
|
+
report_broken_links(verbose: broken_verbose)
|
23
|
+
report_ignored_links(verbose: ignored_verbose)
|
24
|
+
end
|
25
|
+
|
26
|
+
private
|
27
|
+
|
28
|
+
# Report a summary of the broken links.
|
29
|
+
def report_broken_links(verbose: true)
|
30
|
+
if @broken_links.empty?
|
31
|
+
print "Good news, there are no broken links!"
|
32
|
+
else
|
33
|
+
num_pages, num_links = get_hash_stats(@broken_links)
|
34
|
+
print "Found #{num_links} broken link(s) across #{num_pages} page(s):"
|
35
|
+
|
36
|
+
@broken_links.each do |key, values|
|
37
|
+
msg = sort_by_page? ?
|
38
|
+
"The following broken links were found on '#{key}':" :
|
39
|
+
"The broken link '#{key}' was found on the following pages:"
|
40
|
+
nprint msg
|
41
|
+
|
42
|
+
if verbose or values.length <= NUM_VALUES
|
43
|
+
values.each { |value| print value }
|
44
|
+
else # Only print N values and summarise the rest.
|
45
|
+
NUM_VALUES.times { |i| print values[i] }
|
46
|
+
|
47
|
+
objects = sort_by_page? ? 'link(s)' : 'page(s)'
|
48
|
+
print "+ #{values.length - NUM_VALUES} other #{objects}, remove --concise to see them all"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
# Report a summary of the ignored links.
|
55
|
+
def report_ignored_links(verbose: false)
|
56
|
+
if @ignored_links.any?
|
57
|
+
num_pages, num_links = get_hash_stats(@ignored_links)
|
58
|
+
nprint "Ignored #{num_links} unsupported link(s) across #{num_pages} page(s), which you should check manually:"
|
59
|
+
|
60
|
+
@ignored_links.each do |key, values|
|
61
|
+
msg = sort_by_page? ?
|
62
|
+
"The following links were ignored on '#{key}':" :
|
63
|
+
"The link '#{key}' was ignored on the following pages:"
|
64
|
+
nprint msg
|
65
|
+
|
66
|
+
if verbose or values.length <= NUM_VALUES
|
67
|
+
values.each { |value| print value }
|
68
|
+
else # Only print N values and summarise the rest.
|
69
|
+
NUM_VALUES.times { |i| print values[i] }
|
70
|
+
|
71
|
+
objects = sort_by_page? ? 'link(s)' : 'page(s)'
|
72
|
+
print "+ #{values.length - NUM_VALUES} other #{objects}, use --verbose to see them all"
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
# Return true if the sort is by page.
|
79
|
+
def sort_by_page?
|
80
|
+
@sort == :page
|
81
|
+
end
|
82
|
+
|
83
|
+
# Returns the key/value statistics of hash e.g. the number of keys and
|
84
|
+
# combined values. The hash should be of the format: { 'str' => [...] }.
|
85
|
+
# Use like: `num_pages, num_links = get_hash_stats(links)`.
|
86
|
+
def get_hash_stats(hash)
|
87
|
+
num_keys = hash.keys.length
|
88
|
+
values = hash.values.flatten
|
89
|
+
num_values = sort_by_page? ? values.length : values.uniq.length
|
90
|
+
|
91
|
+
sort_by_page? ?
|
92
|
+
[num_keys, num_values] :
|
93
|
+
[num_values, num_keys]
|
94
|
+
end
|
95
|
+
|
96
|
+
# Prints the text + \n. Defaults to a blank line.
|
97
|
+
def print(text = '')
|
98
|
+
@stream.puts(text)
|
99
|
+
end
|
100
|
+
|
101
|
+
# Prints text + \n\n.
|
102
|
+
def printn(text)
|
103
|
+
print(text)
|
104
|
+
print
|
105
|
+
end
|
106
|
+
|
107
|
+
# Prints \n + text + \n.
|
108
|
+
def nprint(text)
|
109
|
+
print
|
110
|
+
print(text)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,6 +94,34 @@ dependencies:
|
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '3.5'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: httplog
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '1.3'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '1.3'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: memory_profiler
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0.9'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0.9'
|
97
125
|
- !ruby/object:Gem::Dependency
|
98
126
|
name: wgit
|
99
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -147,17 +175,20 @@ files:
|
|
147
175
|
- ".gitignore"
|
148
176
|
- ".ruby-version"
|
149
177
|
- ".travis.yml"
|
178
|
+
- CHANGELOG.md
|
150
179
|
- Gemfile
|
151
180
|
- Gemfile.lock
|
152
181
|
- LICENSE.txt
|
153
182
|
- README.md
|
154
183
|
- Rakefile
|
184
|
+
- benchmark.rb
|
155
185
|
- bin/console
|
156
186
|
- bin/setup
|
157
187
|
- broken_link_finder.gemspec
|
158
188
|
- exe/broken_link_finder
|
159
189
|
- lib/broken_link_finder.rb
|
160
190
|
- lib/broken_link_finder/finder.rb
|
191
|
+
- lib/broken_link_finder/reporter.rb
|
161
192
|
- lib/broken_link_finder/version.rb
|
162
193
|
- lib/broken_link_finder/wgit_extensions.rb
|
163
194
|
- load.rb
|