broken_link_finder 0.11.0 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +37 -26
- data/README.md +3 -3
- data/bin/setup +1 -1
- data/broken_link_finder.gemspec +2 -2
- data/lib/broken_link_finder.rb +1 -0
- data/lib/broken_link_finder/finder.rb +66 -136
- data/lib/broken_link_finder/link_manager.rb +137 -0
- data/lib/broken_link_finder/reporter/html_reporter.rb +2 -1
- data/lib/broken_link_finder/reporter/reporter.rb +1 -1
- data/lib/broken_link_finder/reporter/text_reporter.rb +2 -1
- data/lib/broken_link_finder/version.rb +1 -1
- data/lib/broken_link_finder/wgit_extensions.rb +1 -1
- metadata +8 -7
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 42e88495f7e7742db433223408b4a380c1d48e98a5a43e6da5303d3e7b024454
|
|
4
|
+
data.tar.gz: eae7fc953f0d8aa1bb1f9d5b53183cd68a15a9f83ab341f51023744b2d148063
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4496db994bfba83deeb14a1b870f43e2cfd2afa94f30b6596ee610f23103b55ae0d84a6443a3204b02ed8875c0daf0d8e9c565aaebd21173d5c4353509dac3c8
|
|
7
|
+
data.tar.gz: 2d70ee94d7128e6e212bc385e1045fd465c121f58b9a0d036d392ae1cbb5cd9ef5ea47e29eda85b6f17a0b0f5547902ca818967b3ffb4ad87c7d0b271da5323a
|
data/.ruby-version
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
2.
|
|
1
|
+
2.7.0
|
data/CHANGELOG.md
CHANGED
|
@@ -9,6 +9,15 @@
|
|
|
9
9
|
- ...
|
|
10
10
|
---
|
|
11
11
|
|
|
12
|
+
## v0.11.1
|
|
13
|
+
### Added
|
|
14
|
+
- ...
|
|
15
|
+
### Changed/Removed
|
|
16
|
+
- Updated wgit gem to version 0.9.0 which contains improvements and bug fixes.
|
|
17
|
+
### Fixed
|
|
18
|
+
- ...
|
|
19
|
+
---
|
|
20
|
+
|
|
12
21
|
## v0.11.0
|
|
13
22
|
### Added
|
|
14
23
|
- Additional crawl statistics.
|
data/Gemfile.lock
CHANGED
|
@@ -1,50 +1,61 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
broken_link_finder (0.11.
|
|
4
|
+
broken_link_finder (0.11.1)
|
|
5
5
|
thor (~> 0.20)
|
|
6
6
|
thread (~> 0.2)
|
|
7
|
-
wgit (~> 0.
|
|
7
|
+
wgit (~> 0.9)
|
|
8
8
|
|
|
9
9
|
GEM
|
|
10
10
|
remote: https://rubygems.org/
|
|
11
11
|
specs:
|
|
12
|
-
addressable (2.
|
|
13
|
-
public_suffix (>= 2.0.2, <
|
|
14
|
-
bson (4.
|
|
15
|
-
byebug (11.
|
|
16
|
-
|
|
12
|
+
addressable (2.7.0)
|
|
13
|
+
public_suffix (>= 2.0.2, < 5.0)
|
|
14
|
+
bson (4.10.0)
|
|
15
|
+
byebug (11.1.3)
|
|
16
|
+
cliver (0.3.2)
|
|
17
|
+
coderay (1.1.3)
|
|
18
|
+
concurrent-ruby (1.1.6)
|
|
17
19
|
crack (0.4.3)
|
|
18
20
|
safe_yaml (~> 1.0.0)
|
|
19
21
|
ethon (0.12.0)
|
|
20
22
|
ffi (>= 1.3.0)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
23
|
+
ferrum (0.9)
|
|
24
|
+
addressable (~> 2.5)
|
|
25
|
+
cliver (~> 0.3)
|
|
26
|
+
concurrent-ruby (~> 1.1)
|
|
27
|
+
websocket-driver (>= 0.6, < 0.8)
|
|
28
|
+
ffi (1.13.1)
|
|
29
|
+
hashdiff (1.0.1)
|
|
30
|
+
maxitest (3.6.0)
|
|
31
|
+
minitest (>= 5.0.0, < 5.14.0)
|
|
32
|
+
method_source (1.0.0)
|
|
26
33
|
mini_portile2 (2.4.0)
|
|
27
|
-
minitest (5.
|
|
28
|
-
mongo (2.
|
|
29
|
-
bson (>= 4.
|
|
30
|
-
nokogiri (1.10.
|
|
34
|
+
minitest (5.13.0)
|
|
35
|
+
mongo (2.13.0)
|
|
36
|
+
bson (>= 4.8.2, < 5.0.0)
|
|
37
|
+
nokogiri (1.10.10)
|
|
31
38
|
mini_portile2 (~> 2.4.0)
|
|
32
|
-
pry (0.
|
|
33
|
-
coderay (~> 1.1
|
|
34
|
-
method_source (~>
|
|
35
|
-
public_suffix (
|
|
36
|
-
rake (
|
|
39
|
+
pry (0.13.1)
|
|
40
|
+
coderay (~> 1.1)
|
|
41
|
+
method_source (~> 1.0)
|
|
42
|
+
public_suffix (4.0.5)
|
|
43
|
+
rake (13.0.1)
|
|
37
44
|
safe_yaml (1.0.5)
|
|
38
45
|
thor (0.20.3)
|
|
39
46
|
thread (0.2.2)
|
|
40
|
-
typhoeus (1.
|
|
47
|
+
typhoeus (1.4.0)
|
|
41
48
|
ethon (>= 0.9.0)
|
|
42
|
-
webmock (3.
|
|
49
|
+
webmock (3.8.3)
|
|
43
50
|
addressable (>= 2.3.6)
|
|
44
51
|
crack (>= 0.3.2)
|
|
45
52
|
hashdiff (>= 0.4.0, < 2.0.0)
|
|
46
|
-
|
|
53
|
+
websocket-driver (0.7.3)
|
|
54
|
+
websocket-extensions (>= 0.1.0)
|
|
55
|
+
websocket-extensions (0.1.5)
|
|
56
|
+
wgit (0.9.0)
|
|
47
57
|
addressable (~> 2.6)
|
|
58
|
+
ferrum (~> 0.8)
|
|
48
59
|
mongo (~> 2.9)
|
|
49
60
|
nokogiri (~> 1.10)
|
|
50
61
|
typhoeus (~> 1.3)
|
|
@@ -58,11 +69,11 @@ DEPENDENCIES
|
|
|
58
69
|
byebug (~> 11.0)
|
|
59
70
|
maxitest (~> 3.3)
|
|
60
71
|
pry (~> 0.12)
|
|
61
|
-
rake (~>
|
|
72
|
+
rake (~> 13.0)
|
|
62
73
|
webmock (~> 3.6)
|
|
63
74
|
|
|
64
75
|
RUBY VERSION
|
|
65
|
-
ruby 2.
|
|
76
|
+
ruby 2.7.0p0
|
|
66
77
|
|
|
67
78
|
BUNDLED WITH
|
|
68
79
|
2.1.4
|
data/README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# Broken Link Finder
|
|
2
2
|
|
|
3
|
-
Does what it says on the tin
|
|
3
|
+
Does what it says on the tin - finds a website's broken links.
|
|
4
4
|
|
|
5
|
-
Simply point it at a website and it will crawl all of its webpages searching for and identifing
|
|
5
|
+
Simply point it at a website and it will crawl all of its webpages searching for and identifying broken links. You will then be presented with a concise summary of any broken links found.
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
Broken Link Finder is multi-threaded and uses `libcurl` under the hood, it's fast!
|
|
8
8
|
|
|
9
9
|
## How It Works
|
|
10
10
|
|
data/bin/setup
CHANGED
data/broken_link_finder.gemspec
CHANGED
|
@@ -44,10 +44,10 @@ Gem::Specification.new do |spec|
|
|
|
44
44
|
spec.add_development_dependency 'byebug', '~> 11.0'
|
|
45
45
|
spec.add_development_dependency 'maxitest', '~> 3.3'
|
|
46
46
|
spec.add_development_dependency 'pry', '~> 0.12'
|
|
47
|
-
spec.add_development_dependency 'rake', '~>
|
|
47
|
+
spec.add_development_dependency 'rake', '~> 13.0'
|
|
48
48
|
spec.add_development_dependency 'webmock', '~> 3.6'
|
|
49
49
|
|
|
50
50
|
spec.add_runtime_dependency 'thor', '~> 0.20'
|
|
51
51
|
spec.add_runtime_dependency 'thread', '~> 0.2'
|
|
52
|
-
spec.add_runtime_dependency 'wgit', '~> 0.
|
|
52
|
+
spec.add_runtime_dependency 'wgit', '~> 0.9'
|
|
53
53
|
end
|
data/lib/broken_link_finder.rb
CHANGED
|
@@ -7,6 +7,7 @@ require 'set'
|
|
|
7
7
|
|
|
8
8
|
require_relative './broken_link_finder/wgit_extensions'
|
|
9
9
|
require_relative './broken_link_finder/version'
|
|
10
|
+
require_relative './broken_link_finder/link_manager'
|
|
10
11
|
require_relative './broken_link_finder/reporter/reporter'
|
|
11
12
|
require_relative './broken_link_finder/reporter/text_reporter'
|
|
12
13
|
require_relative './broken_link_finder/reporter/html_reporter'
|
|
@@ -1,46 +1,53 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module BrokenLinkFinder
|
|
4
|
-
DEFAULT_MAX_THREADS = 100
|
|
5
|
-
SERVER_WAIT_TIME = 0.5
|
|
4
|
+
DEFAULT_MAX_THREADS = 100 # Used by Finder#crawl_site.
|
|
5
|
+
SERVER_WAIT_TIME = 0.5 # Used by Finder#retry_broken_links.
|
|
6
6
|
|
|
7
7
|
# Alias for BrokenLinkFinder::Finder.new.
|
|
8
8
|
def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
|
9
9
|
Finder.new(sort: sort, max_threads: max_threads)
|
|
10
10
|
end
|
|
11
11
|
|
|
12
|
+
# Class responsible for finding broken links on a page or site.
|
|
12
13
|
class Finder
|
|
13
|
-
|
|
14
|
+
# The collection key - either :page or :link.
|
|
15
|
+
attr_reader :sort
|
|
14
16
|
|
|
15
|
-
#
|
|
16
|
-
|
|
17
|
+
# The max number of threads created during #crawl_site - one thread per page.
|
|
18
|
+
attr_reader :max_threads
|
|
19
|
+
|
|
20
|
+
# Returns a new Finder instance.
|
|
21
|
+
def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS)
|
|
17
22
|
raise "Sort by either :page or :link, not #{sort}" \
|
|
18
23
|
unless %i[page link].include?(sort)
|
|
19
24
|
|
|
20
25
|
@sort = sort
|
|
21
26
|
@max_threads = max_threads
|
|
22
|
-
@lock = Mutex.new
|
|
23
27
|
@crawler = Wgit::Crawler.new
|
|
28
|
+
@manager = BrokenLinkFinder::LinkManager.new(@sort)
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# Returns the current broken links.
|
|
32
|
+
def broken_links
|
|
33
|
+
@manager.broken_links
|
|
34
|
+
end
|
|
24
35
|
|
|
25
|
-
|
|
36
|
+
# Returns the current ignored links.
|
|
37
|
+
def ignored_links
|
|
38
|
+
@manager.ignored_links
|
|
26
39
|
end
|
|
27
40
|
|
|
28
|
-
#
|
|
29
|
-
def
|
|
30
|
-
@
|
|
31
|
-
@ignored_links = {} # Used for mapping pages to ignored links.
|
|
32
|
-
@all_broken_links = Set.new # Used to prevent crawling a broken link twice.
|
|
33
|
-
@all_intact_links = Set.new # Used to prevent crawling an intact link twice.
|
|
34
|
-
@all_ignored_links = Set.new # Used for building crawl statistics.
|
|
35
|
-
@broken_link_map = {} # Maps a link to its absolute (crawlable) form.
|
|
36
|
-
@crawl_stats = {} # Records crawl stats e.g. duration etc.
|
|
41
|
+
# Returns the current crawl stats.
|
|
42
|
+
def crawl_stats
|
|
43
|
+
@manager.crawl_stats
|
|
37
44
|
end
|
|
38
45
|
|
|
39
46
|
# Finds broken links within a single page and records them.
|
|
40
47
|
# Returns true if at least one broken link was found.
|
|
41
48
|
# Access the broken links afterwards with Finder#broken_links.
|
|
42
49
|
def crawl_url(url)
|
|
43
|
-
|
|
50
|
+
@manager.empty
|
|
44
51
|
|
|
45
52
|
start = Time.now
|
|
46
53
|
url = url.to_url
|
|
@@ -55,17 +62,17 @@ module BrokenLinkFinder
|
|
|
55
62
|
find_broken_links(doc)
|
|
56
63
|
retry_broken_links
|
|
57
64
|
|
|
58
|
-
|
|
59
|
-
|
|
65
|
+
@manager.sort
|
|
66
|
+
@manager.tally(url: url, pages_crawled: [url], start: start)
|
|
60
67
|
|
|
61
|
-
|
|
68
|
+
broken_links.any?
|
|
62
69
|
end
|
|
63
70
|
|
|
64
71
|
# Finds broken links within an entire site and records them.
|
|
65
72
|
# Returns true if at least one broken link was found.
|
|
66
73
|
# Access the broken links afterwards with Finder#broken_links.
|
|
67
|
-
def crawl_site(url)
|
|
68
|
-
|
|
74
|
+
def crawl_site(url, allow_paths: nil, disallow_paths: nil)
|
|
75
|
+
@manager.empty
|
|
69
76
|
|
|
70
77
|
start = Time.now
|
|
71
78
|
url = url.to_url
|
|
@@ -74,7 +81,8 @@ module BrokenLinkFinder
|
|
|
74
81
|
|
|
75
82
|
# Crawl the site's HTML web pages looking for links.
|
|
76
83
|
# We dup the url to avoid recording any redirects.
|
|
77
|
-
|
|
84
|
+
paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
|
|
85
|
+
externals = @crawler.crawl_site(url.dup, **paths) do |doc|
|
|
78
86
|
crawled << doc.url
|
|
79
87
|
next unless doc
|
|
80
88
|
|
|
@@ -82,17 +90,20 @@ module BrokenLinkFinder
|
|
|
82
90
|
pool.process { find_broken_links(doc) }
|
|
83
91
|
end
|
|
84
92
|
|
|
93
|
+
# Wait for all threads to finish, even if url was invalid.
|
|
94
|
+
pool.shutdown
|
|
95
|
+
|
|
85
96
|
# Ensure the given website url is valid.
|
|
86
97
|
raise "Invalid or broken URL: #{url}" unless externals
|
|
87
98
|
|
|
88
|
-
# Wait for all threads to finish.
|
|
89
|
-
pool.shutdown
|
|
90
99
|
retry_broken_links
|
|
91
100
|
|
|
92
|
-
|
|
93
|
-
|
|
101
|
+
@manager.sort
|
|
102
|
+
@manager.tally(url: url, pages_crawled: crawled.to_a, start: start)
|
|
94
103
|
|
|
95
|
-
|
|
104
|
+
broken_links.any?
|
|
105
|
+
ensure
|
|
106
|
+
pool.shutdown if defined?(pool)
|
|
96
107
|
end
|
|
97
108
|
|
|
98
109
|
# Outputs the link report into a stream e.g. STDOUT or a file,
|
|
@@ -109,8 +120,8 @@ module BrokenLinkFinder
|
|
|
109
120
|
end
|
|
110
121
|
|
|
111
122
|
reporter = klass.new(stream, @sort,
|
|
112
|
-
|
|
113
|
-
@broken_link_map,
|
|
123
|
+
broken_links, ignored_links,
|
|
124
|
+
@manager.broken_link_map, crawl_stats)
|
|
114
125
|
reporter.call(broken_verbose: broken_verbose,
|
|
115
126
|
ignored_verbose: ignored_verbose)
|
|
116
127
|
end
|
|
@@ -119,18 +130,18 @@ module BrokenLinkFinder
|
|
|
119
130
|
|
|
120
131
|
# Finds which links are unsupported or broken and records the details.
|
|
121
132
|
def find_broken_links(page)
|
|
122
|
-
|
|
133
|
+
record_unparsable_links(page) # Record them as broken.
|
|
123
134
|
|
|
124
135
|
links = get_supported_links(page)
|
|
125
136
|
|
|
126
137
|
# Iterate over the supported links checking if they're broken or not.
|
|
127
138
|
links.each do |link|
|
|
128
139
|
# Skip if the link has been encountered previously.
|
|
129
|
-
next if @all_intact_links.include?(link)
|
|
140
|
+
next if @manager.all_intact_links.include?(link)
|
|
130
141
|
|
|
131
|
-
if @all_broken_links.include?(link)
|
|
142
|
+
if @manager.all_broken_links.include?(link)
|
|
132
143
|
# The link has already been proven broken so simply record it.
|
|
133
|
-
append_broken_link(page, link, map: false)
|
|
144
|
+
@manager.append_broken_link(page, link, map: false)
|
|
134
145
|
next
|
|
135
146
|
end
|
|
136
147
|
|
|
@@ -139,29 +150,21 @@ module BrokenLinkFinder
|
|
|
139
150
|
|
|
140
151
|
# Determine if the crawled link is broken or not and record it.
|
|
141
152
|
if link_broken?(link_doc)
|
|
142
|
-
append_broken_link(page, link)
|
|
143
|
-
else
|
|
144
|
-
@
|
|
153
|
+
@manager.append_broken_link(page, link)
|
|
154
|
+
else
|
|
155
|
+
@manager.append_intact_link(link)
|
|
145
156
|
end
|
|
146
157
|
end
|
|
147
158
|
|
|
148
159
|
nil
|
|
149
160
|
end
|
|
150
161
|
|
|
151
|
-
# Record each unparsable link as a broken link.
|
|
152
|
-
def process_unparsable_links(doc)
|
|
153
|
-
doc.unparsable_links.each do |link|
|
|
154
|
-
append_broken_link(doc, link, map: false)
|
|
155
|
-
@broken_link_map[link] = link
|
|
156
|
-
end
|
|
157
|
-
end
|
|
158
|
-
|
|
159
162
|
# Implements a retry mechanism for each of the broken links found.
|
|
160
163
|
# Removes any broken links found to be working OK.
|
|
161
164
|
def retry_broken_links
|
|
162
165
|
sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.
|
|
163
166
|
|
|
164
|
-
@broken_link_map.select! do |link, href|
|
|
167
|
+
@manager.broken_link_map.select! do |link, href|
|
|
165
168
|
# Don't retry unparsable links (which are Strings).
|
|
166
169
|
next(true) unless href.is_a?(Wgit::Url)
|
|
167
170
|
|
|
@@ -170,27 +173,35 @@ module BrokenLinkFinder
|
|
|
170
173
|
if link_broken?(doc)
|
|
171
174
|
true
|
|
172
175
|
else
|
|
173
|
-
remove_broken_link(link)
|
|
176
|
+
@manager.remove_broken_link(link)
|
|
174
177
|
false
|
|
175
178
|
end
|
|
176
179
|
end
|
|
177
180
|
end
|
|
178
181
|
|
|
182
|
+
# Record each unparsable link as a broken link.
|
|
183
|
+
def record_unparsable_links(doc)
|
|
184
|
+
doc.unparsable_links.each do |link|
|
|
185
|
+
# We map the link ourselves because link is a String, not a Wgit::Url.
|
|
186
|
+
@manager.append_broken_link(doc, link, map: false)
|
|
187
|
+
@manager.broken_link_map[link] = link
|
|
188
|
+
end
|
|
189
|
+
end
|
|
190
|
+
|
|
179
191
|
# Report and reject any non supported links. Any link that is absolute and
|
|
180
192
|
# doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
|
|
181
193
|
def get_supported_links(doc)
|
|
182
|
-
doc.all_links
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
end
|
|
194
|
+
doc.all_links.reject do |link|
|
|
195
|
+
if link.is_absolute? && !link.start_with?('http')
|
|
196
|
+
@manager.append_ignored_link(doc.url, link)
|
|
197
|
+
true
|
|
198
|
+
end
|
|
199
|
+
end
|
|
189
200
|
end
|
|
190
201
|
|
|
191
202
|
# Make the link absolute and crawl it, returning its Wgit::Document.
|
|
192
203
|
def crawl_link(doc, link)
|
|
193
|
-
link = link.
|
|
204
|
+
link = link.make_absolute(doc)
|
|
194
205
|
@crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
|
|
195
206
|
end
|
|
196
207
|
|
|
@@ -210,87 +221,6 @@ module BrokenLinkFinder
|
|
|
210
221
|
doc.xpath("//*[@id='#{fragment}']").empty?
|
|
211
222
|
end
|
|
212
223
|
|
|
213
|
-
# Append key => [value] to the broken link collections.
|
|
214
|
-
# If map: true, then the link will also be recorded in @broken_link_map.
|
|
215
|
-
def append_broken_link(doc, link, map: true)
|
|
216
|
-
key, value = get_key_value(doc.url, link)
|
|
217
|
-
|
|
218
|
-
@lock.synchronize do
|
|
219
|
-
@broken_links[key] = [] unless @broken_links[key]
|
|
220
|
-
@broken_links[key] << value
|
|
221
|
-
|
|
222
|
-
@all_broken_links << link
|
|
223
|
-
|
|
224
|
-
@broken_link_map[link] = link.prefix_base(doc) if map
|
|
225
|
-
end
|
|
226
|
-
end
|
|
227
|
-
|
|
228
|
-
# Remove the broken link from the necessary collections.
|
|
229
|
-
def remove_broken_link(link)
|
|
230
|
-
@lock.synchronize do
|
|
231
|
-
if @sort == :page
|
|
232
|
-
@broken_links.each { |_k, links| links.delete(link) }
|
|
233
|
-
@broken_links.delete_if { |_k, links| links.empty? }
|
|
234
|
-
else
|
|
235
|
-
@broken_links.delete(link)
|
|
236
|
-
end
|
|
237
|
-
|
|
238
|
-
@all_broken_links.delete(link)
|
|
239
|
-
@all_intact_links << link
|
|
240
|
-
end
|
|
241
|
-
end
|
|
242
|
-
|
|
243
|
-
# Append key => [value] to the ignored link collections.
|
|
244
|
-
def append_ignored_link(url, link)
|
|
245
|
-
key, value = get_key_value(url, link)
|
|
246
|
-
|
|
247
|
-
@lock.synchronize do
|
|
248
|
-
@ignored_links[key] = [] unless @ignored_links[key]
|
|
249
|
-
@ignored_links[key] << value
|
|
250
|
-
|
|
251
|
-
@all_ignored_links << link
|
|
252
|
-
end
|
|
253
|
-
end
|
|
254
|
-
|
|
255
|
-
# Returns the correct key value depending on the @sort type.
|
|
256
|
-
# @sort == :page ? [url, link] : [link, url]
|
|
257
|
-
def get_key_value(url, link)
|
|
258
|
-
case @sort
|
|
259
|
-
when :page
|
|
260
|
-
[url, link]
|
|
261
|
-
when :link
|
|
262
|
-
[link, url]
|
|
263
|
-
else
|
|
264
|
-
raise "Unsupported sort type: #{sort}"
|
|
265
|
-
end
|
|
266
|
-
end
|
|
267
|
-
|
|
268
|
-
# Sort keys and values alphabetically.
|
|
269
|
-
def sort_links
|
|
270
|
-
@broken_links.values.map(&:uniq!)
|
|
271
|
-
@ignored_links.values.map(&:uniq!)
|
|
272
|
-
|
|
273
|
-
@broken_links = @broken_links.sort_by { |k, _v| k }.to_h
|
|
274
|
-
@ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
|
|
275
|
-
|
|
276
|
-
@broken_links.each { |_k, v| v.sort! }
|
|
277
|
-
@ignored_links.each { |_k, v| v.sort! }
|
|
278
|
-
end
|
|
279
|
-
|
|
280
|
-
# Sets various statistics about the crawl and its links.
|
|
281
|
-
def set_crawl_stats(url:, pages_crawled:, start:)
|
|
282
|
-
@crawl_stats[:url] = url
|
|
283
|
-
@crawl_stats[:pages_crawled] = pages_crawled
|
|
284
|
-
@crawl_stats[:num_pages] = pages_crawled.size
|
|
285
|
-
@crawl_stats[:num_links] = (
|
|
286
|
-
@all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
|
|
287
|
-
)
|
|
288
|
-
@crawl_stats[:num_broken_links] = @all_broken_links.size
|
|
289
|
-
@crawl_stats[:num_intact_links] = @all_intact_links.size
|
|
290
|
-
@crawl_stats[:num_ignored_links] = @all_ignored_links.size
|
|
291
|
-
@crawl_stats[:duration] = Time.now - start
|
|
292
|
-
end
|
|
293
|
-
|
|
294
224
|
alias crawl_page crawl_url
|
|
295
225
|
alias crawl_r crawl_site
|
|
296
226
|
end
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BrokenLinkFinder
|
|
4
|
+
# Class responsible for handling the link collection logic.
|
|
5
|
+
class LinkManager
|
|
6
|
+
# Used for mapping pages to broken links.
|
|
7
|
+
attr_reader :broken_links
|
|
8
|
+
|
|
9
|
+
# Used for mapping pages to ignored links.
|
|
10
|
+
attr_reader :ignored_links
|
|
11
|
+
|
|
12
|
+
# Used to record crawl statistics e.g. duration etc.
|
|
13
|
+
attr_reader :crawl_stats
|
|
14
|
+
|
|
15
|
+
# Used to map a link (as is) to its absolute (crawlable) form.
|
|
16
|
+
attr_reader :broken_link_map
|
|
17
|
+
|
|
18
|
+
# Used to prevent crawling a broken link twice.
|
|
19
|
+
attr_reader :all_broken_links
|
|
20
|
+
|
|
21
|
+
# Used to prevent crawling an intact link twice.
|
|
22
|
+
attr_reader :all_intact_links
|
|
23
|
+
|
|
24
|
+
# Used for building crawl statistics.
|
|
25
|
+
attr_reader :all_ignored_links
|
|
26
|
+
|
|
27
|
+
# Returns a new LinkManager instance with empty link collections.
|
|
28
|
+
def initialize(sort)
|
|
29
|
+
raise "Sort by either :page or :link, not #{sort}" \
|
|
30
|
+
unless %i[page link].include?(sort)
|
|
31
|
+
|
|
32
|
+
@sort = sort
|
|
33
|
+
@lock = Mutex.new
|
|
34
|
+
|
|
35
|
+
empty # Initialises the link collections.
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# Initialise/empty the link collection objects.
|
|
39
|
+
def empty
|
|
40
|
+
@broken_links = {}
|
|
41
|
+
@ignored_links = {}
|
|
42
|
+
@crawl_stats = {}
|
|
43
|
+
@broken_link_map = {}
|
|
44
|
+
@all_broken_links = Set.new
|
|
45
|
+
@all_intact_links = Set.new
|
|
46
|
+
@all_ignored_links = Set.new
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# Append key => [value] to the broken link collections.
|
|
50
|
+
# If map: true, then the link will also be recorded in @broken_link_map.
|
|
51
|
+
def append_broken_link(doc, link, map: true)
|
|
52
|
+
key, value = get_key_value(doc.url, link)
|
|
53
|
+
|
|
54
|
+
@lock.synchronize do
|
|
55
|
+
@broken_links[key] = [] unless @broken_links[key]
|
|
56
|
+
@broken_links[key] << value
|
|
57
|
+
|
|
58
|
+
@all_broken_links << link
|
|
59
|
+
|
|
60
|
+
@broken_link_map[link] = link.make_absolute(doc) if map
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Remove the broken link from the necessary collections.
|
|
65
|
+
def remove_broken_link(link)
|
|
66
|
+
@lock.synchronize do
|
|
67
|
+
if @sort == :page
|
|
68
|
+
@broken_links.each { |_k, links| links.delete(link) }
|
|
69
|
+
@broken_links.delete_if { |_k, links| links.empty? }
|
|
70
|
+
else
|
|
71
|
+
@broken_links.delete(link)
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
@all_broken_links.delete(link)
|
|
75
|
+
@all_intact_links << link
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Append key => [value] to the ignored link collections.
|
|
80
|
+
def append_ignored_link(url, link)
|
|
81
|
+
key, value = get_key_value(url, link)
|
|
82
|
+
|
|
83
|
+
@lock.synchronize do
|
|
84
|
+
@ignored_links[key] = [] unless @ignored_links[key]
|
|
85
|
+
@ignored_links[key] << value
|
|
86
|
+
|
|
87
|
+
@all_ignored_links << link
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Append link to @all_intact_links.
|
|
92
|
+
def append_intact_link(link)
|
|
93
|
+
@lock.synchronize { @all_intact_links << link }
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
# Sorts the link collection's keys and values alphabetically.
|
|
97
|
+
def sort
|
|
98
|
+
@broken_links.values.map(&:uniq!)
|
|
99
|
+
@ignored_links.values.map(&:uniq!)
|
|
100
|
+
|
|
101
|
+
@broken_links = @broken_links.sort_by { |k, _v| k }.to_h
|
|
102
|
+
@ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h
|
|
103
|
+
|
|
104
|
+
@broken_links.each { |_k, v| v.sort! }
|
|
105
|
+
@ignored_links.each { |_k, v| v.sort! }
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
# Tallies up various statistics about the crawl and its links.
|
|
109
|
+
def tally(url:, pages_crawled:, start:)
|
|
110
|
+
@crawl_stats[:url] = url
|
|
111
|
+
@crawl_stats[:pages_crawled] = pages_crawled
|
|
112
|
+
@crawl_stats[:num_pages] = pages_crawled.size
|
|
113
|
+
@crawl_stats[:num_links] = (
|
|
114
|
+
@all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
|
|
115
|
+
)
|
|
116
|
+
@crawl_stats[:num_broken_links] = @all_broken_links.size
|
|
117
|
+
@crawl_stats[:num_intact_links] = @all_intact_links.size
|
|
118
|
+
@crawl_stats[:num_ignored_links] = @all_ignored_links.size
|
|
119
|
+
@crawl_stats[:duration] = Time.now - start
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
private
|
|
123
|
+
|
|
124
|
+
# Returns the correct key value depending on the @sort type.
|
|
125
|
+
# @sort == :page ? [url, link] : [link, url]
|
|
126
|
+
def get_key_value(url, link)
|
|
127
|
+
case @sort
|
|
128
|
+
when :page
|
|
129
|
+
[url, link]
|
|
130
|
+
when :link
|
|
131
|
+
[link, url]
|
|
132
|
+
else
|
|
133
|
+
raise "Unsupported sort type: #{sort}"
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module BrokenLinkFinder
|
|
4
|
+
# Class responsible for reporting in a HTML format.
|
|
4
5
|
class HTMLReporter < Reporter
|
|
5
|
-
#
|
|
6
|
+
# Returns a new HTMLReporter instance.
|
|
6
7
|
# stream is any Object that responds to :puts and :print.
|
|
7
8
|
def initialize(stream, sort,
|
|
8
9
|
broken_links, ignored_links,
|
|
@@ -6,7 +6,7 @@ module BrokenLinkFinder
|
|
|
6
6
|
# The amount of pages/links to display when verbose is false.
|
|
7
7
|
NUM_VALUES = 3
|
|
8
8
|
|
|
9
|
-
#
|
|
9
|
+
# Returns a new Reporter instance.
|
|
10
10
|
# stream is any Object that responds to :puts and :print.
|
|
11
11
|
def initialize(stream, sort,
|
|
12
12
|
broken_links, ignored_links,
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module BrokenLinkFinder
|
|
4
|
+
# Class responsible for reporting in a text format.
|
|
4
5
|
class TextReporter < Reporter
|
|
5
|
-
#
|
|
6
|
+
# Returns a new TextReporter instance.
|
|
6
7
|
# stream is any Object that responds to :puts and :print.
|
|
7
8
|
def initialize(stream, sort,
|
|
8
9
|
broken_links, ignored_links,
|
|
@@ -18,7 +18,7 @@ rescue StandardError
|
|
|
18
18
|
end
|
|
19
19
|
|
|
20
20
|
# We extract all the Document's links e.g. <a>, <img>, <script>, <link> etc.
|
|
21
|
-
Wgit::Document.
|
|
21
|
+
Wgit::Document.define_extractor(
|
|
22
22
|
:all_links,
|
|
23
23
|
'//*/@href | //*/@src', # Any element's href or src attribute URL.
|
|
24
24
|
singleton: false,
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: broken_link_finder
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.11.
|
|
4
|
+
version: 0.11.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Michael Telford
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2020-
|
|
11
|
+
date: 2020-07-31 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -72,14 +72,14 @@ dependencies:
|
|
|
72
72
|
requirements:
|
|
73
73
|
- - "~>"
|
|
74
74
|
- !ruby/object:Gem::Version
|
|
75
|
-
version: '
|
|
75
|
+
version: '13.0'
|
|
76
76
|
type: :development
|
|
77
77
|
prerelease: false
|
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
|
79
79
|
requirements:
|
|
80
80
|
- - "~>"
|
|
81
81
|
- !ruby/object:Gem::Version
|
|
82
|
-
version: '
|
|
82
|
+
version: '13.0'
|
|
83
83
|
- !ruby/object:Gem::Dependency
|
|
84
84
|
name: webmock
|
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -128,14 +128,14 @@ dependencies:
|
|
|
128
128
|
requirements:
|
|
129
129
|
- - "~>"
|
|
130
130
|
- !ruby/object:Gem::Version
|
|
131
|
-
version: '0.
|
|
131
|
+
version: '0.9'
|
|
132
132
|
type: :runtime
|
|
133
133
|
prerelease: false
|
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
|
135
135
|
requirements:
|
|
136
136
|
- - "~>"
|
|
137
137
|
- !ruby/object:Gem::Version
|
|
138
|
-
version: '0.
|
|
138
|
+
version: '0.9'
|
|
139
139
|
description: Finds a website's broken links using the 'wgit' gem and reports back
|
|
140
140
|
to you with a summary.
|
|
141
141
|
email: michael.telford@live.com
|
|
@@ -159,6 +159,7 @@ files:
|
|
|
159
159
|
- exe/broken_link_finder
|
|
160
160
|
- lib/broken_link_finder.rb
|
|
161
161
|
- lib/broken_link_finder/finder.rb
|
|
162
|
+
- lib/broken_link_finder/link_manager.rb
|
|
162
163
|
- lib/broken_link_finder/reporter/html_reporter.rb
|
|
163
164
|
- lib/broken_link_finder/reporter/reporter.rb
|
|
164
165
|
- lib/broken_link_finder/reporter/text_reporter.rb
|
|
@@ -189,7 +190,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
189
190
|
- !ruby/object:Gem::Version
|
|
190
191
|
version: '0'
|
|
191
192
|
requirements: []
|
|
192
|
-
rubygems_version: 3.
|
|
193
|
+
rubygems_version: 3.1.2
|
|
193
194
|
signing_key:
|
|
194
195
|
specification_version: 4
|
|
195
196
|
summary: Finds a website's broken links and reports back to you with a summary.
|