broken_link_finder 0.5.0 → 0.6.0
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -3
- data/README.md +25 -8
- data/broken_link_finder.gemspec +1 -1
- data/exe/broken_link_finder +1 -1
- data/lib/broken_link_finder.rb +1 -0
- data/lib/broken_link_finder/finder.rb +63 -16
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +21 -0
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 89e6476124fe4e40b2efe0646ad6d2708f233a464ae8a075833f03c27669a719
+  data.tar.gz: a85394f3013a1c073afcdd94451bd5044331dce6c9f6988d0c4ccb1c9682783c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6a7e10444cedf91d3dcf77a6e943852b02d902991616e534ade4b6e194d4508ba342fdb6c1d76b83ab586499ba0fc82aa795875c96f2233fee097089112e2ea0
+  data.tar.gz: 75ab90b82b724eed3e837e48dec0d9584723ecc0cd606c9c8d42c87ba5350daac3a35099f7e71967a4e699a9b0817127da56c62646f6d36f49297b77731d8657
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
 PATH
   remote: .
   specs:
-    broken_link_finder (0.5.0)
+    broken_link_finder (0.6.0)
       thor (= 0.20.3)
       thread (= 0.2)
-      wgit (= 0.0.
+      wgit (= 0.0.13)
 
 GEM
   remote: https://rubygems.org/
@@ -36,7 +36,7 @@ GEM
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff
-    wgit (0.0.
+    wgit (0.0.13)
       mongo (~> 2.8.0)
       nokogiri (~> 1.10.3)
 
data/README.md
CHANGED
@@ -6,15 +6,22 @@ Simply point it at a website and it will crawl all of its webpages searching for
 
 ## How It Works
 
-Any page element with a `href` or `src` attribute is considered a link. For each link on a given page, any of the following conditions
+Any HTML page element with a `href` or `src` attribute is considered a link. For each link on a given page, any of the following conditions constitutes that the link is broken:
 
-
-
-
+- A response status code of `404 Not Found` is returned.
+- An empty HTML response body is returned.
+- The HTML response body doesn't contain an element ID matching that of the link's anchor e.g. `http://server.com#about` must contain an element with `id="about"` or the link is considered broken.
+- The link redirects more than 5 times consecutively.
+
+**Note**: Not all link types are supported.
+
+In a nutshell, only HTTP(S) based links can be successfully verified by `broken_link_finder`. As a result some links on a page might be (recorded and) ignored. You should verify these links yourself manually. Examples of unsupported link types include `tel:*`, `mailto:*`, `ftp://*` etc.
+
+See the [usage](#Usage) section below on how to check which links have been ignored during a crawl.
 
 ## Made Possible By
 
-
+`broken_link_finder` relies heavily on the `wgit` Ruby gem. See its [repository](https://github.com/michaeltelford/wgit) for more details.
 
 ## Installation
 
@@ -53,9 +60,10 @@ Below is a simple script which crawls a website and outputs it's broken links to
 ```ruby
 require 'broken_link_finder'
 
-finder = BrokenLinkFinder
-finder.crawl_site "http://txti.es"
-finder.
+finder = BrokenLinkFinder.new
+finder.crawl_site "http://txti.es" # Or use Finder#crawl_page for a single webpage.
+finder.pretty_print_link_summary   # Or use Finder#broken_links and Finder#ignored_links
+                                   # for direct access to the link Hashes.
 ```
 
 Then execute the script with:
@@ -78,6 +86,15 @@ https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=84L4BDS86FB
 The following broken links exist in http://txti.es/how:
 http://en.wikipedia.org/wiki/Markdown
 http://imgur.com
+
+Below is a breakdown of the non supported (ignored) links found, you should check these manually:
+
+The following links were ignored on http://txti.es:
+tel:+13174562564
+mailto:big.jim@jmail.com
+
+The following links were ignored on http://txti.es/contact:
+ftp://server.com
 ```
 
 ## TODO
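The new `ignored_links` collection mirrors `broken_links`: each is a Hash keyed by page URL with an Array of link strings as the value (see `append_ignored_link` in the finder.rb diff below). A minimal sketch of reading the Hashes directly instead of pretty printing; the site URL is the README's example and the sample output is illustrative:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_site "http://txti.es"

# Each Hash maps a crawled page URL => Array of offending links, e.g.
# { "http://txti.es/contact" => ["ftp://server.com"] }
finder.broken_links.each do |page, links|
  puts "#{page} has #{links.size} broken link(s)"
end
finder.ignored_links.each do |page, links|
  puts "#{page} has #{links.size} ignored link(s)"
end
```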
data/broken_link_finder.gemspec
CHANGED
@@ -43,7 +43,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "byebug", "~> 11.0"
   spec.add_development_dependency "webmock", "~> 3.5"
 
-  spec.add_runtime_dependency "wgit", "0.0.
+  spec.add_runtime_dependency "wgit", "0.0.13"
   spec.add_runtime_dependency "thread", "0.2"
   spec.add_runtime_dependency "thor", "0.20.3"
 end
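The gemspec now pins the wgit runtime dependency to 0.0.13, so consumers only need to depend on the top-level gem. A minimal Gemfile entry; the pessimistic version constraint is illustrative rather than taken from the gem's docs:

```ruby
# Gemfile
gem 'broken_link_finder', '~> 0.6.0'
```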
data/exe/broken_link_finder
CHANGED
@@ -11,7 +11,7 @@ class BrokenLinkFinderCLI < Thor
     url = "http://#{url}" unless url.start_with?('http')
     finder = BrokenLinkFinder::Finder.new
     options[:recursive] ? finder.crawl_site(url) : finder.crawl_page(url)
-    finder.
+    finder.pretty_print_link_summary
   end
 end
 
data/lib/broken_link_finder/finder.rb
CHANGED
@@ -2,22 +2,29 @@ require 'wgit'
 require 'thread/pool'
 
 module BrokenLinkFinder
+  # Alias for BrokenLinkFinder::Finder.new, don't use this if you want to
+  # override the max_threads variable.
+  def self.new
+    Finder.new
+  end
+
   class Finder
     DEFAULT_MAX_THREADS = 30.freeze
 
-    attr_reader :broken_links
+    attr_reader :broken_links, :ignored_links
 
     # Create a new Finder instance.
     def initialize(max_threads: DEFAULT_MAX_THREADS)
       @max_threads = max_threads
       @lock = Mutex.new
       @crawler = Wgit::Crawler.new
-
+      clear_links
     end
 
-    # Clear/empty the
-    def
+    # Clear/empty the link collection Hashes.
+    def clear_links
       @broken_links = {}
+      @ignored_links = {}
     end
 
     # Finds broken links within an entire site and appends them to the
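The new module-level `BrokenLinkFinder.new` alias above returns a `Finder` built with `DEFAULT_MAX_THREADS` (30). As the added comment notes, instantiate the class directly to override the thread pool size. A minimal sketch (the thread count of 5 is illustrative):

```ruby
require 'broken_link_finder'

# Uses DEFAULT_MAX_THREADS (30) via the new module-level alias.
finder = BrokenLinkFinder.new

# Instantiate the class directly to override the thread pool size.
slow_finder = BrokenLinkFinder::Finder.new(max_threads: 5)
```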
@@ -25,11 +32,12 @@ module BrokenLinkFinder
     # at least one broken link was found and an Array of all pages crawled.
     # Access the broken links with Finder#broken_links.
     def crawl_site(url)
-
+      clear_links
       url = Wgit::Url.new(url)
       pool = Thread.pool(@max_threads)
       crawled_pages = []
 
+      # Crawl the site's HTML web pages looking for links.
       @crawler.crawl_site(url) do |doc|
         # Ensure the given website url is valid.
         raise "Invalid URL: #{url}" if doc.url == url and doc.empty?
@@ -45,14 +53,14 @@ module BrokenLinkFinder
       end
 
       pool.shutdown
-      [
+      [@broken_links.any?, crawled_pages]
     end
 
     # Finds broken links within a single page and appends them to the
     # @broken_links array. Returns true if at least one broken link was found.
     # Access the broken links with Finder#broken_links.
     def crawl_url(url)
-
+      clear_links
       url = Wgit::Url.new(url)
 
       # Ensure the given page url is valid.
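As the `crawl_site` hunk above shows, the method now returns a two-element Array: whether any broken links were found, and the pages that were crawled. A minimal sketch of consuming that return value; the URL is the README's example site:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
has_broken_links, crawled_pages = finder.crawl_site("http://txti.es")

puts "Broken links found!" if has_broken_links
puts "Crawled #{crawled_pages.size} page(s)"
```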
@@ -62,41 +70,70 @@ module BrokenLinkFinder
       # Get all page links and determine which are broken.
       find_broken_links(doc)
 
-
+      @broken_links.any?
     end
 
-    # Pretty prints the
+    # Pretty prints the link summary into a stream e.g. Kernel
     # (STDOUT) or a file - anything that respond_to? :puts.
     # Returns true if there were broken links and vice versa.
-    def
+    def pretty_print_link_summary(stream = Kernel)
       raise "stream must respond_to? :puts" unless stream.respond_to? :puts
 
+      # Broken link summary.
       if @broken_links.empty?
         stream.puts("Good news, there are no broken links!")
-
+        stream.puts("")
       else
         stream.puts("Below is a breakdown of the different pages and their \
broken links...")
         stream.puts("")
 
         @broken_links.each do |page, links|
-          stream.puts("The following broken links exist
+          stream.puts("The following broken links exist on #{page}:")
+          links.each do |link|
+            stream.puts(link)
+          end
+          stream.puts("")
+        end
+      end
+
+      # Ignored link summary.
+      if @ignored_links.any?
+        stream.puts("Below is a breakdown of the non supported links found, \
+you should check these manually:")
+        stream.puts("")
+
+        @ignored_links.each do |page, links|
+          stream.puts("The following links were ignored on #{page}:")
           links.each do |link|
             stream.puts(link)
           end
           stream.puts("")
         end
-        true
       end
+
+      @broken_links.any?
     end
 
     private
 
-    # Finds which links are broken and
+    # Finds which links are unsupported or broken and records the details.
     def find_broken_links(doc)
-      links
+      # Process the Document's links before checking if they're broke.
+      links = doc.all_links.
+        reject do |link|
+          if !link.is_relative? and !link.start_with?('http')
+            append_ignored_link(doc.url, link)
+            true
+          end
+        end.
+        uniq
+
+      # Iterate over the supported links checking if they're broken or not.
       links.each do |link|
-
+        link_url = link.is_relative? ? doc.url.to_base.concat(link) : link
+        link_doc = @crawler.crawl_url(link_url)
+
         if @crawler.last_response.is_a?(Net::HTTPNotFound) or
             link_doc.nil? or
             has_broken_anchor(link_doc)
@@ -124,6 +161,16 @@ broken links...")
       end
     end
 
+    # Append url => [link] to @ignored_links.
+    def append_ignored_link(url, link)
+      @lock.synchronize do
+        unless @ignored_links[url]
+          @ignored_links[url] = []
+        end
+        @ignored_links[url] << link
+      end
+    end
+
     alias_method :crawl_page, :crawl_url
   end
 end
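`pretty_print_link_summary` (added in the hunks above) writes to any object that `respond_to? :puts`, defaulting to `Kernel` (STDOUT). A minimal sketch that writes the summary to a file instead; the filename is illustrative:

```ruby
require 'broken_link_finder'

finder = BrokenLinkFinder.new
finder.crawl_page("http://txti.es") # Alias for Finder#crawl_url.

# Any object responding to :puts works as the stream, e.g. an open File.
File.open("report.txt", "w") do |file|
  finder.pretty_print_link_summary(file)
end
```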
data/lib/broken_link_finder/wgit_extensions.rb
ADDED
@@ -0,0 +1,21 @@
+require 'wgit'
+
+# We pull out all of a Document's links, not just the links to other webpages.
+Wgit::Document.define_extension(
+  :all_links,
+  '//*/@href | //*/@src',
+  singleton: false,
+  text_content_only: true,
+) do |links|
+  if links
+    links = links.
+      map do |link|
+        Wgit::Url.new(link)
+      rescue
+        nil
+      end.
+      compact.
+      uniq
+  end
+  links
+end
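The extension above gives every `Wgit::Document` an `all_links` method returning unique `Wgit::Url` objects for each `href`/`src` attribute, which `Finder#find_broken_links` consumes. A minimal sketch of calling it directly; the `Wgit::Document.new(url, html)` constructor used here is an assumption based on the wgit gem's documented API, and the HTML snippet is made up:

```ruby
require 'broken_link_finder' # Assumed to load the wgit extension above.

# Assumed constructor: Wgit::Document.new(url, html) from the wgit gem.
html = '<html><a href="/about">About</a><img src="http://cdn.example.com/pic.png"></html>'
doc  = Wgit::Document.new(Wgit::Url.new("http://example.com"), html)

# Prints each unique href/src value as a Wgit::Url.
doc.all_links.each { |link| puts link }
```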
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: broken_link_finder
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.6.0
 platform: ruby
 authors:
 - Michael Telford
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-07-
+date: 2019-07-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -100,14 +100,14 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.
+        version: 0.0.13
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
      - !ruby/object:Gem::Version
-        version: 0.0.
+        version: 0.0.13
 - !ruby/object:Gem::Dependency
   name: thread
   requirement: !ruby/object:Gem::Requirement
@@ -159,6 +159,7 @@ files:
 - lib/broken_link_finder.rb
 - lib/broken_link_finder/finder.rb
 - lib/broken_link_finder/version.rb
+- lib/broken_link_finder/wgit_extensions.rb
 - load.rb
 homepage: https://github.com/michaeltelford/broken-link-finder
 licenses: