broken_link_finder 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/README.md +25 -8
- data/broken_link_finder.gemspec +1 -1
- data/exe/broken_link_finder +1 -1
- data/lib/broken_link_finder.rb +1 -0
- data/lib/broken_link_finder/finder.rb +63 -16
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +21 -0
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 89e6476124fe4e40b2efe0646ad6d2708f233a464ae8a075833f03c27669a719
|
4
|
+
data.tar.gz: a85394f3013a1c073afcdd94451bd5044331dce6c9f6988d0c4ccb1c9682783c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6a7e10444cedf91d3dcf77a6e943852b02d902991616e534ade4b6e194d4508ba342fdb6c1d76b83ab586499ba0fc82aa795875c96f2233fee097089112e2ea0
|
7
|
+
data.tar.gz: 75ab90b82b724eed3e837e48dec0d9584723ecc0cd606c9c8d42c87ba5350daac3a35099f7e71967a4e699a9b0817127da56c62646f6d36f49297b77731d8657
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
broken_link_finder (0.
|
4
|
+
broken_link_finder (0.6.0)
|
5
5
|
thor (= 0.20.3)
|
6
6
|
thread (= 0.2)
|
7
|
-
wgit (= 0.0.
|
7
|
+
wgit (= 0.0.13)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
@@ -36,7 +36,7 @@ GEM
|
|
36
36
|
addressable (>= 2.3.6)
|
37
37
|
crack (>= 0.3.2)
|
38
38
|
hashdiff
|
39
|
-
wgit (0.0.
|
39
|
+
wgit (0.0.13)
|
40
40
|
mongo (~> 2.8.0)
|
41
41
|
nokogiri (~> 1.10.3)
|
42
42
|
|
data/README.md
CHANGED
@@ -6,15 +6,22 @@ Simply point it at a website and it will crawl all of its webpages searching for
|
|
6
6
|
|
7
7
|
## How It Works
|
8
8
|
|
9
|
-
Any page element with a `href` or `src` attribute is considered a link. For each link on a given page, any of the following conditions
|
9
|
+
Any HTML page element with a `href` or `src` attribute is considered a link. For each link on a given page, any of the following conditions means that the link is considered broken:
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
- A response status code of `404 Not Found` is returned.
|
12
|
+
- An empty HTML response body is returned.
|
13
|
+
- The HTML response body doesn't contain an element ID matching that of the link's anchor e.g. `http://server.com#about` must contain an element with `id="about"` or the link is considered broken.
|
14
|
+
- The link redirects more than 5 times consecutively.
|
15
|
+
|
16
|
+
**Note**: Not all link types are supported.
|
17
|
+
|
18
|
+
In a nutshell, only HTTP(S) based links can be successfully verified by `broken_link_finder`. As a result some links on a page might be (recorded and) ignored. You should verify these links yourself manually. Examples of unsupported link types include `tel:*`, `mailto:*`, `ftp://*` etc.
|
19
|
+
|
20
|
+
See the [usage](#Usage) section below on how to check which links have been ignored during a crawl.
|
14
21
|
|
15
22
|
## Made Possible By
|
16
23
|
|
17
|
-
|
24
|
+
`broken_link_finder` relies heavily on the `wgit` Ruby gem. See its [repository](https://github.com/michaeltelford/wgit) for more details.
|
18
25
|
|
19
26
|
## Installation
|
20
27
|
|
@@ -53,9 +60,10 @@ Below is a simple script which crawls a website and outputs its broken links to
|
|
53
60
|
```ruby
|
54
61
|
require 'broken_link_finder'
|
55
62
|
|
56
|
-
finder = BrokenLinkFinder
|
57
|
-
finder.crawl_site "http://txti.es"
|
58
|
-
finder.
|
63
|
+
finder = BrokenLinkFinder.new
|
64
|
+
finder.crawl_site "http://txti.es" # Or use Finder#crawl_page for a single webpage.
|
65
|
+
finder.pretty_print_link_summary # Or use Finder#broken_links and Finder#ignored_links
|
66
|
+
# for direct access to the link Hashes.
|
59
67
|
```
|
60
68
|
|
61
69
|
Then execute the script with:
|
@@ -78,6 +86,15 @@ https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FB
|
|
78
86
|
The following broken links exist in http://txti.es/how:
|
79
87
|
http://en.wikipedia.org/wiki/Markdown
|
80
88
|
http://imgur.com
|
89
|
+
|
90
|
+
Below is a breakdown of the non supported (ignored) links found, you should check these manually:
|
91
|
+
|
92
|
+
The following links were ignored on http://txti.es:
|
93
|
+
tel:+13174562564
|
94
|
+
mailto:big.jim@jmail.com
|
95
|
+
|
96
|
+
The following links were ignored on http://txti.es/contact:
|
97
|
+
ftp://server.com
|
81
98
|
```
|
82
99
|
|
83
100
|
## TODO
|
data/broken_link_finder.gemspec
CHANGED
@@ -43,7 +43,7 @@ Gem::Specification.new do |spec|
|
|
43
43
|
spec.add_development_dependency "byebug", "~> 11.0"
|
44
44
|
spec.add_development_dependency "webmock", "~> 3.5"
|
45
45
|
|
46
|
-
spec.add_runtime_dependency "wgit", "0.0.
|
46
|
+
spec.add_runtime_dependency "wgit", "0.0.13"
|
47
47
|
spec.add_runtime_dependency "thread", "0.2"
|
48
48
|
spec.add_runtime_dependency "thor", "0.20.3"
|
49
49
|
end
|
data/exe/broken_link_finder
CHANGED
@@ -11,7 +11,7 @@ class BrokenLinkFinderCLI < Thor
|
|
11
11
|
url = "http://#{url}" unless url.start_with?('http')
|
12
12
|
finder = BrokenLinkFinder::Finder.new
|
13
13
|
options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
|
14
|
-
finder.
|
14
|
+
finder.pretty_print_link_summary
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
data/lib/broken_link_finder.rb
CHANGED
@@ -2,22 +2,29 @@ require 'wgit'
|
|
2
2
|
require 'thread/pool'
|
3
3
|
|
4
4
|
module BrokenLinkFinder
|
5
|
+
# Alias for BrokenLinkFinder::Finder.new; don't use this if you want to
|
6
|
+
# override the max_threads variable.
|
7
|
+
def self.new
|
8
|
+
Finder.new
|
9
|
+
end
|
10
|
+
|
5
11
|
class Finder
|
6
12
|
DEFAULT_MAX_THREADS = 30.freeze
|
7
13
|
|
8
|
-
attr_reader :broken_links
|
14
|
+
attr_reader :broken_links, :ignored_links
|
9
15
|
|
10
16
|
# Create a new Finder instance.
|
11
17
|
def initialize(max_threads: DEFAULT_MAX_THREADS)
|
12
18
|
@max_threads = max_threads
|
13
19
|
@lock = Mutex.new
|
14
20
|
@crawler = Wgit::Crawler.new
|
15
|
-
|
21
|
+
clear_links
|
16
22
|
end
|
17
23
|
|
18
|
-
# Clear/empty the
|
19
|
-
def
|
24
|
+
# Clear/empty the link collection Hashes.
|
25
|
+
def clear_links
|
20
26
|
@broken_links = {}
|
27
|
+
@ignored_links = {}
|
21
28
|
end
|
22
29
|
|
23
30
|
# Finds broken links within an entire site and appends them to the
|
@@ -25,11 +32,12 @@ module BrokenLinkFinder
|
|
25
32
|
# at least one broken link was found and an Array of all pages crawled.
|
26
33
|
# Access the broken links with Finder#broken_links.
|
27
34
|
def crawl_site(url)
|
28
|
-
|
35
|
+
clear_links
|
29
36
|
url = Wgit::Url.new(url)
|
30
37
|
pool = Thread.pool(@max_threads)
|
31
38
|
crawled_pages = []
|
32
39
|
|
40
|
+
# Crawl the site's HTML web pages looking for links.
|
33
41
|
@crawler.crawl_site(url) do |doc|
|
34
42
|
# Ensure the given website url is valid.
|
35
43
|
raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
|
@@ -45,14 +53,14 @@ module BrokenLinkFinder
|
|
45
53
|
end
|
46
54
|
|
47
55
|
pool.shutdown
|
48
|
-
[
|
56
|
+
[@broken_links.any?, crawled_pages]
|
49
57
|
end
|
50
58
|
|
51
59
|
# Finds broken links within a single page and appends them to the
|
52
60
|
# @broken_links Hash. Returns true if at least one broken link was found.
|
53
61
|
# Access the broken links with Finder#broken_links.
|
54
62
|
def crawl_url(url)
|
55
|
-
|
63
|
+
clear_links
|
56
64
|
url = Wgit::Url.new(url)
|
57
65
|
|
58
66
|
# Ensure the given page url is valid.
|
@@ -62,41 +70,70 @@ module BrokenLinkFinder
|
|
62
70
|
# Get all page links and determine which are broken.
|
63
71
|
find_broken_links(doc)
|
64
72
|
|
65
|
-
|
73
|
+
@broken_links.any?
|
66
74
|
end
|
67
75
|
|
68
|
-
# Pretty prints the
|
76
|
+
# Pretty prints the link summary into a stream e.g. Kernel
|
69
77
|
# (STDOUT) or a file - anything that respond_to? :puts.
|
70
78
|
# Returns true if there were broken links, false otherwise.
|
71
|
-
def
|
79
|
+
def pretty_print_link_summary(stream = Kernel)
|
72
80
|
raise "stream must respond_to? :puts" unless stream.respond_to? :puts
|
73
81
|
|
82
|
+
# Broken link summary.
|
74
83
|
if @broken_links.empty?
|
75
84
|
stream.puts("Good news, there are no broken links!")
|
76
|
-
|
85
|
+
stream.puts("")
|
77
86
|
else
|
78
87
|
stream.puts("Below is a breakdown of the different pages and their \
|
79
88
|
broken links...")
|
80
89
|
stream.puts("")
|
81
90
|
|
82
91
|
@broken_links.each do |page, links|
|
83
|
-
stream.puts("The following broken links exist
|
92
|
+
stream.puts("The following broken links exist on #{page}:")
|
93
|
+
links.each do |link|
|
94
|
+
stream.puts(link)
|
95
|
+
end
|
96
|
+
stream.puts("")
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
# Ignored link summary.
|
101
|
+
if @ignored_links.any?
|
102
|
+
stream.puts("Below is a breakdown of the non supported links found, \
|
103
|
+
you should check these manually:")
|
104
|
+
stream.puts("")
|
105
|
+
|
106
|
+
@ignored_links.each do |page, links|
|
107
|
+
stream.puts("The following links were ignored on #{page}:")
|
84
108
|
links.each do |link|
|
85
109
|
stream.puts(link)
|
86
110
|
end
|
87
111
|
stream.puts("")
|
88
112
|
end
|
89
|
-
true
|
90
113
|
end
|
114
|
+
|
115
|
+
@broken_links.any?
|
91
116
|
end
|
92
117
|
|
93
118
|
private
|
94
119
|
|
95
|
-
# Finds which links are broken and
|
120
|
+
# Finds which links are unsupported or broken and records the details.
|
96
121
|
def find_broken_links(doc)
|
97
|
-
links
|
122
|
+
# Process the Document's links before checking if they're broken.
|
123
|
+
links = doc.all_links.
|
124
|
+
reject do |link|
|
125
|
+
if !link.is_relative? and !link.start_with?('http')
|
126
|
+
append_ignored_link(doc.url, link)
|
127
|
+
true
|
128
|
+
end
|
129
|
+
end.
|
130
|
+
uniq
|
131
|
+
|
132
|
+
# Iterate over the supported links checking if they're broken or not.
|
98
133
|
links.each do |link|
|
99
|
-
|
134
|
+
link_url = link.is_relative? ? doc.url.to_base.concat(link) : link
|
135
|
+
link_doc = @crawler.crawl_url(link_url)
|
136
|
+
|
100
137
|
if @crawler.last_response.is_a?(Net::HTTPNotFound) or
|
101
138
|
link_doc.nil? or
|
102
139
|
has_broken_anchor(link_doc)
|
@@ -124,6 +161,16 @@ broken links...")
|
|
124
161
|
end
|
125
162
|
end
|
126
163
|
|
164
|
+
# Append url => [link] to @ignored_links.
|
165
|
+
def append_ignored_link(url, link)
|
166
|
+
@lock.synchronize do
|
167
|
+
unless @ignored_links[url]
|
168
|
+
@ignored_links[url] = []
|
169
|
+
end
|
170
|
+
@ignored_links[url] << link
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
127
174
|
alias_method :crawl_page, :crawl_url
|
128
175
|
end
|
129
176
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'wgit'
|
2
|
+
|
3
|
+
# We pull out all of a Document's links, not just the links to other webpages.
|
4
|
+
Wgit::Document.define_extension(
|
5
|
+
:all_links,
|
6
|
+
'//*/@href | //*/@src',
|
7
|
+
singleton: false,
|
8
|
+
text_content_only: true,
|
9
|
+
) do |links|
|
10
|
+
if links
|
11
|
+
links = links.
|
12
|
+
map do |link|
|
13
|
+
Wgit::Url.new(link)
|
14
|
+
rescue
|
15
|
+
nil
|
16
|
+
end.
|
17
|
+
compact.
|
18
|
+
uniq
|
19
|
+
end
|
20
|
+
links
|
21
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: broken_link_finder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-07-
|
11
|
+
date: 2019-07-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -100,14 +100,14 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - '='
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: 0.0.
|
103
|
+
version: 0.0.13
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - '='
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: 0.0.
|
110
|
+
version: 0.0.13
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: thread
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -159,6 +159,7 @@ files:
|
|
159
159
|
- lib/broken_link_finder.rb
|
160
160
|
- lib/broken_link_finder/finder.rb
|
161
161
|
- lib/broken_link_finder/version.rb
|
162
|
+
- lib/broken_link_finder/wgit_extensions.rb
|
162
163
|
- load.rb
|
163
164
|
homepage: https://github.com/michaeltelford/broken-link-finder
|
164
165
|
licenses:
|