rdoc_link_checker 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -3
- data/bin/rdoc_link_checker +32 -15
- data/doc/help.txt +16 -0
- data/lib/rdoc_link_checker/version.rb +1 -1
- data/lib/rdoc_link_checker.rb +676 -3
- data/rdoc_link_checker.gemspec +7 -11
- metadata +8 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cde4b6bb3a511ccee4b20f792707c85c8705b8a828754909f63f88c533d5efe4
|
4
|
+
data.tar.gz: ff3dfefb26e94258e32a2c5f19bc0b894591611a4dff9612741e34e641ca8130
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c577bf0a97429715c606ee45986258a77bf5028a5e2af47f76e9e2776c5cf66c52045fbc6d1e28fc940e13c178b13ba5914aa6b952d2e24d13b6a36147675d8b
|
7
|
+
data.tar.gz: 9bdcfb203468e9de9d0f94d02a2e0e4ac6140e1e33745012272c36afd469631204a9da410061383a7b7be434df08aaed48c5553bbb6b7578fae115a8fb41f1ec
|
data/README.md
CHANGED
@@ -1,6 +1,18 @@
|
|
1
1
|
# RDoc Link Checker
|
2
2
|
|
3
|
-
|
3
|
+
A gem to find broken links in HTML files generated by Ruby RDoc.
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
Reports a link as broken if:
|
6
|
+
|
7
|
+
- The target page given by `href` is not found.
|
8
|
+
- The target page is found, but the fragment given by `href`
|
9
|
+
is not a link target on that page;
|
10
|
+
this usually causes a browser to open at the top of the page
|
11
|
+
instead of at the given fragment.
|
12
|
+
|
13
|
+
Note that some browsers are forgiving, and will open the target
|
14
|
+
page at a link target similar to the given fragment;
|
15
|
+
for example, fragment ```#bar``` may be opened at an element
|
16
|
+
with id ```foobar```.
|
17
|
+
|
18
|
+
See the [help text](doc/help.txt).
|
data/bin/rdoc_link_checker
CHANGED
@@ -5,31 +5,33 @@ require_relative '../lib/rdoc_link_checker'
|
|
5
5
|
|
6
6
|
options = GetoptLong.new(
|
7
7
|
['--html_dirpath', '-d', GetoptLong::REQUIRED_ARGUMENT],
|
8
|
-
['--
|
9
|
-
['--
|
8
|
+
['--onsite_only', '-l', GetoptLong::NO_ARGUMENT],
|
9
|
+
['--no_toc', '-n', GetoptLong::NO_ARGUMENT],
|
10
|
+
['--version', '-v', GetoptLong::NO_ARGUMENT],
|
11
|
+
['--help', '-h', GetoptLong::NO_ARGUMENT]
|
10
12
|
)
|
11
13
|
|
12
|
-
message = nil
|
13
|
-
case ARGV.size
|
14
|
-
when 0
|
15
|
-
message = "Expected one argument; got none."
|
16
|
-
when 1
|
17
|
-
# Okay.
|
18
|
-
else
|
19
|
-
message = "Expected one argument, not #{ARGV.inspect}."
|
20
|
-
end
|
21
|
-
raise ArgumentError.new(message) if message
|
22
|
-
|
23
14
|
def help
|
24
|
-
|
15
|
+
path = File.absolute_path(__FILE__)
|
16
|
+
dirname = File.dirname(File.dirname(path))
|
17
|
+
filepath = File.join(dirname, 'doc', 'help.txt')
|
18
|
+
puts File.read(filepath)
|
19
|
+
exit
|
25
20
|
end
|
26
21
|
|
27
22
|
def version
|
28
23
|
puts RDocLinkChecker::VERSION
|
24
|
+
exit
|
29
25
|
end
|
30
26
|
|
27
|
+
onsite_only = false
|
28
|
+
no_toc = false
|
31
29
|
options.each do |option, argument|
|
32
30
|
case option
|
31
|
+
when '--onsite_only'
|
32
|
+
onsite_only = true
|
33
|
+
when '--no_toc'
|
34
|
+
no_toc = true
|
33
35
|
when '--help'
|
34
36
|
help
|
35
37
|
when '--version'
|
@@ -37,5 +39,20 @@ options.each do |option, argument|
|
|
37
39
|
end
|
38
40
|
end
|
39
41
|
|
42
|
+
message = nil
|
43
|
+
case ARGV.size
|
44
|
+
when 0
|
45
|
+
message = "Expected one argument; got none."
|
46
|
+
when 1
|
47
|
+
# Okay.
|
48
|
+
else
|
49
|
+
message = "Expected one argument, not #{ARGV.inspect}."
|
50
|
+
end
|
51
|
+
raise ArgumentError.new(message) if message
|
52
|
+
|
40
53
|
html_dirpath = ARGV[0]
|
41
|
-
RDocLinkChecker.new(
|
54
|
+
RDocLinkChecker.new(
|
55
|
+
html_dirpath,
|
56
|
+
onsite_only: onsite_only,
|
57
|
+
no_toc: no_toc
|
58
|
+
).check
|
data/doc/help.txt
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
Usage:
|
2
|
+
rdoc_link_checker html_dirpath options
|
3
|
+
|
4
|
+
The argument is the path to a directory containing a tree
|
5
|
+
of RDoc-generated HTML files, such as those generated by command
|
6
|
+
|
7
|
+
rdoc --visibility=private --op html . # Note the trailing dot.
|
8
|
+
|
9
|
+
Options:
|
10
|
+
--onsite_only Check link targets only on pages in the file tree at <html_dirpath>,
|
11
|
+
and not those on other local pages or those on the web.
|
12
|
+
--no_toc Do not check links on the TOC page (table_of_contents.html).
|
13
|
+
--version Print the version and exit.
|
14
|
+
--help Print this help and exit.
|
15
|
+
|
16
|
+
The output is file <html_dirpath>/Report.htm, which reports broken links.
|
data/lib/rdoc_link_checker.rb
CHANGED
@@ -1,13 +1,686 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'rexml/document'
|
5
|
+
require 'find'
|
6
|
+
require 'net/http'
|
7
|
+
|
3
8
|
require_relative 'rdoc_link_checker/version'
|
4
9
|
|
5
10
|
class RDocLinkChecker
|
6
11
|
|
7
|
-
|
8
|
-
|
12
|
+
include REXML
|
13
|
+
|
14
|
+
attr_accessor :html_dirpath, :onsite_only, :no_toc
|
15
|
+
|
16
|
+
def initialize(
|
17
|
+
html_dirpath,
|
18
|
+
onsite_only: false,
|
19
|
+
no_toc: false
|
20
|
+
)
|
21
|
+
self.html_dirpath = html_dirpath
|
22
|
+
self.onsite_only = onsite_only
|
23
|
+
self.no_toc = no_toc
|
24
|
+
@pages = {}
|
25
|
+
@counts = {
|
26
|
+
source_pages: 0,
|
27
|
+
target_pages: 0,
|
28
|
+
links_checked: 0,
|
29
|
+
links_broken: 0,
|
30
|
+
}
|
31
|
+
@verbose = false
|
32
|
+
end
|
33
|
+
|
34
|
+
def check
|
35
|
+
# All work is done in the HTML directory,
|
36
|
+
# and that is where Report.htm will be put.
|
37
|
+
Dir.chdir(html_dirpath) do |dir|
|
38
|
+
@counts[:start_time] = Time.now
|
39
|
+
gather_source_paths
|
40
|
+
create_source_pages
|
41
|
+
create_target_pages
|
42
|
+
verify_links
|
43
|
+
@counts[:end_time] = Time.now
|
44
|
+
report
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Gather paths to source HTML pages.
|
49
|
+
def gather_source_paths
|
50
|
+
paths = []
|
51
|
+
puts 'Gathering source paths' if @verbose
|
52
|
+
paths = Find.find('.').select {|path| path.end_with?('.html') }
|
53
|
+
# Remove leading './'.
|
54
|
+
@source_paths = paths.map{|path| path.sub(%r[^\./], '')}
|
55
|
+
@source_paths.delete('table_of_contents.html') if no_toc
|
56
|
+
if @verbose
|
57
|
+
@source_paths.each_with_index do |source_path, i|
|
58
|
+
puts '- %4d %s' % [i, source_path]
|
59
|
+
end
|
60
|
+
end
|
61
|
+
@counts[:source_pages] = @source_paths.size
|
62
|
+
puts "Gathered #{@source_paths.size} source paths" if @verbose
|
63
|
+
end
|
64
|
+
|
65
|
+
# Create a source \Page object for each source path.
|
66
|
+
# Gather its links and ids.
|
67
|
+
def create_source_pages
|
68
|
+
puts "Creating #{@source_paths.size} source pages" if @verbose
|
69
|
+
@source_paths.sort.each_with_index do |source_path, i|
|
70
|
+
progress_s = RDocLinkChecker.progress_s(i + 1, @source_paths.size)
|
71
|
+
puts "Creating source page #{source_path} #{progress_s}" if @verbose
|
72
|
+
source_page = Page.new(source_path, @verbose, @pages, @counts, onsite_only)
|
73
|
+
@pages[source_path] = source_page
|
74
|
+
source_text = File.read(source_path)
|
75
|
+
doc = Nokogiri::HTML(source_text)
|
76
|
+
source_page.gather_links(doc)
|
77
|
+
source_page.gather_ids(doc)
|
78
|
+
puts "Created source page #{progress_s}" if @verbose
|
79
|
+
end
|
80
|
+
puts "Created #{@pages.size} source pages" if @verbose
|
81
|
+
end
|
82
|
+
|
83
|
+
# Create a target \Page object for each link
|
84
|
+
# (unless already created as a source page).
|
85
|
+
def create_target_pages
|
86
|
+
doc = nil
|
87
|
+
target_page_count = 0
|
88
|
+
@source_paths = @pages.keys
|
89
|
+
@source_paths.each do |source_path|
|
90
|
+
# Need for relative links to work.
|
91
|
+
dirname = File.dirname(source_path)
|
92
|
+
Dir.chdir(dirname) do
|
93
|
+
source_page = @pages[source_path]
|
94
|
+
puts "Creating target pages for #{source_page.links.size} links in #{source_path}" if @verbose
|
95
|
+
source_page.links.each_with_index do |link, i|
|
96
|
+
next if link.path.nil?
|
97
|
+
link.puts(i) if @verbose
|
98
|
+
target_path = link.real_path
|
99
|
+
if @pages[target_path]
|
100
|
+
puts "Page #{target_path} already created" if @verbose
|
101
|
+
target_page = @pages[target_path]
|
102
|
+
else
|
103
|
+
if File.readable?(link.path)
|
104
|
+
puts "Creating target page #{target_path}" if @verbose
|
105
|
+
target_page_count += 1
|
106
|
+
target_page = Page.new(target_path, @verbose, @pages, @counts, onsite_only)
|
107
|
+
@pages[target_path] = target_page
|
108
|
+
target_text = File.read(link.path)
|
109
|
+
doc = Nokogiri::HTML(target_text)
|
110
|
+
target_page.gather_ids(doc)
|
111
|
+
puts "Created target page #{target_path}" if @verbose
|
112
|
+
elsif RDocLinkChecker.checkable?(link.path)
|
113
|
+
puts "Creating target page #{target_path}" if @verbose
|
114
|
+
target_page_count += 1
|
115
|
+
target_page = Page.new(target_path, @verbose, @pages, @counts, onsite_only)
|
116
|
+
@pages[target_path] = target_page
|
117
|
+
puts "Created target page #{target_path}" if @verbose
|
118
|
+
link.exception = fetch(link.path, target_page)
|
119
|
+
link.valid_p = false if link.exception
|
120
|
+
else
|
121
|
+
puts "File not readable or checkable: #{target_path}" if @verbose
|
122
|
+
end
|
123
|
+
end
|
124
|
+
next if target_page.nil?
|
125
|
+
if link.has_fragment? && target_page.ids.empty?
|
126
|
+
doc || doc = Nokogiri::HTML(target_text)
|
127
|
+
target_page.gather_ids(doc)
|
128
|
+
end
|
129
|
+
end
|
130
|
+
puts "Created target pages for #{source_page.links.size} links in #{source_path}" if @verbose
|
131
|
+
end
|
132
|
+
end
|
133
|
+
puts "Created #{target_page_count} target pages" if @verbose
|
134
|
+
@counts[:target_pages] = target_page_count
|
135
|
+
end
|
136
|
+
|
137
|
+
# Verify that each link target exists.
|
138
|
+
def verify_links
|
139
|
+
linking_pages = @pages.select do |path, page|
|
140
|
+
!page.links.empty?
|
141
|
+
end
|
142
|
+
puts "Checking links on #{linking_pages.size} pages" if @verbose
|
143
|
+
link_count = 0
|
144
|
+
broken_count = 0
|
145
|
+
linking_pages.each_pair do |path, page|
|
146
|
+
puts "Checking #{page.links.size} links on page #{path}" if @verbose
|
147
|
+
link_count += page.links.size
|
148
|
+
page.links.each_with_index do |link, i|
|
149
|
+
if link.valid_p.nil? # Don't disturb if already set to false.
|
150
|
+
target_page = @pages[link.real_path]
|
151
|
+
if target_page
|
152
|
+
target_id = link.fragment
|
153
|
+
link.valid_p = target_id.nil? || target_page.ids.include?(target_id)
|
154
|
+
else
|
155
|
+
link_valid_p = false
|
156
|
+
end
|
157
|
+
end
|
158
|
+
link.puts(i) if @verbose
|
159
|
+
broken_count += 1 unless link.valid_p
|
160
|
+
end
|
161
|
+
puts "Checked #{page.links.size} links on page #{path}" if @verbose
|
162
|
+
end
|
163
|
+
puts "Checked #{link_count} links on #{linking_pages.size} pages" if @verbose
|
164
|
+
@counts[:links_checked] = link_count
|
165
|
+
@counts[:links_broken] = broken_count
|
166
|
+
end
|
167
|
+
|
168
|
+
# Fetch the page from the web and gather its ids into the target page.
|
169
|
+
# Returns exception or nil.
|
170
|
+
def fetch(url, target_page)
|
171
|
+
puts "Begin fetch target page #{url}" if @verbose
|
172
|
+
puts "Getting return code for #{url}" if @verbose
|
173
|
+
code = 0
|
174
|
+
exception = nil
|
175
|
+
begin
|
176
|
+
response = Net::HTTP.get_response(URI(url))
|
177
|
+
code = response.code.to_i
|
178
|
+
target_page.code = code
|
179
|
+
puts "Returned #{code} (#{response.class})" if @verbose
|
180
|
+
rescue => x
|
181
|
+
puts "Raised #{x.class} #{x.message}" if @verbose
|
182
|
+
raise unless x.class.name.match(/^(Net|SocketError|IO::TimeoutError|Errno::)/)
|
183
|
+
exception = RDocLinkChecker::HttpResponseError.new(url, x)
|
184
|
+
end
|
185
|
+
puts "Got return code #{code} for #{url} " if @verbose
|
186
|
+
# Don't load if bad code, or no response, or if not html.
|
187
|
+
if !code_bad?(code)
|
188
|
+
if content_type_html?(response)
|
189
|
+
doc = Nokogiri::HTML(response.body)
|
190
|
+
target_page.gather_ids(doc)
|
191
|
+
end
|
192
|
+
end
|
193
|
+
puts "End fetch target page #{url}" if @verbose
|
194
|
+
exception
|
195
|
+
end
|
196
|
+
|
197
|
+
# Returns whether the code is bad (zero or >= 400).
|
198
|
+
def code_bad?(code)
|
199
|
+
return false if code.nil?
|
200
|
+
(code == 0) || (code >= 400)
|
201
|
+
end
|
202
|
+
|
203
|
+
# Returns whether the response body should be HTML.
|
204
|
+
def content_type_html?(response)
|
205
|
+
return false unless response
|
206
|
+
return false unless response['Content-Type']
|
207
|
+
response['Content-Type'].match('html')
|
208
|
+
end
|
209
|
+
|
210
|
+
# Returns whether the path is offsite.
|
211
|
+
def self.offsite?(path)
|
212
|
+
path.start_with?('http')
|
213
|
+
end
|
214
|
+
|
215
|
+
# Returns the fragment string for the given path or URL, or +nil+ if there is none.
|
216
|
+
def self.get_fragment(s)
|
217
|
+
a = s.split('#', 2)
|
218
|
+
a.size == 2 ? a[1] : nil
|
219
|
+
end
|
220
|
+
|
221
|
+
# Returns a progress string giving a fraction and percentage.
|
222
|
+
def self.progress_s(i, total)
|
223
|
+
fraction_s = "#{i}/#{total}"
|
224
|
+
percent_i = (i*100.0/total).round
|
225
|
+
"(#{fraction_s}, #{percent_i}%)"
|
226
|
+
end
|
227
|
+
|
228
|
+
# Returns whether the path is checkable.
|
229
|
+
def self.checkable?(path)
|
230
|
+
return false unless path
|
231
|
+
begin
|
232
|
+
uri = URI(path)
|
233
|
+
return ['http', 'https', nil].include?(uri.scheme)
|
234
|
+
rescue
|
235
|
+
return false
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
# Generate the report and write it to file Report.htm (relative to the current directory).
|
240
|
+
def report
|
241
|
+
|
242
|
+
doc = Document.new('')
|
243
|
+
root = doc.add_element(Element.new('root'))
|
244
|
+
|
245
|
+
head = root.add_element(Element.new('head'))
|
246
|
+
title = head.add_element(Element.new('title'))
|
247
|
+
title.text = 'RDocLinkChecker Report'
|
248
|
+
style = head.add_element(Element.new('style'))
|
249
|
+
style.text = <<EOT
|
250
|
+
* { font-family: sans-serif }
|
251
|
+
.data { font-family: courier }
|
252
|
+
.center { text-align: center }
|
253
|
+
.good { color: rgb( 0, 97, 0); background-color: rgb(198, 239, 206) } /* Greenish */
|
254
|
+
.iffy { color: rgb(156, 101, 0); background-color: rgb(255, 235, 156) } /* Yellowish */
|
255
|
+
.bad { color: rgb(156, 0, 6); background-color: rgb(255, 199, 206) } /* Reddish */
|
256
|
+
.neutral { color: rgb( 0, 0, 0); background-color: rgb(217, 217, 214) } /* Grayish */
|
257
|
+
EOT
|
258
|
+
|
259
|
+
body = root.add_element(Element.new('body'))
|
260
|
+
h1 = body.add_element(Element.new('h1'))
|
261
|
+
h1.text = 'RDocLinkChecker Report'
|
262
|
+
|
263
|
+
add_summary(body)
|
264
|
+
add_broken_links(body)
|
265
|
+
add_offsite_links(body) unless onsite_only
|
266
|
+
report_file_path = 'Report.htm' # _Not_ .html.
|
267
|
+
doc.write(File.new(report_file_path, 'w'), 2)
|
268
|
+
end
|
269
|
+
|
270
|
+
def add_summary(body)
|
271
|
+
h2 = body.add_element(Element.new('h2'))
|
272
|
+
h2.text = 'Summary'
|
273
|
+
|
274
|
+
# Parameters table.
|
275
|
+
data = []
|
276
|
+
[
|
277
|
+
:html_dirpath,
|
278
|
+
:onsite_only,
|
279
|
+
:no_toc
|
280
|
+
].each do |sym|
|
281
|
+
value = send(sym).inspect
|
282
|
+
row = {sym => :label, value => :good}
|
283
|
+
data.push(row)
|
284
|
+
end
|
285
|
+
table2(body, data, 'Parameters')
|
286
|
+
body.add_element(Element.new('p'))
|
287
|
+
|
288
|
+
# Times table.
|
289
|
+
elapsed_time = @counts[:end_time] - @counts[:start_time]
|
290
|
+
seconds = elapsed_time % 60
|
291
|
+
minutes = (elapsed_time / 60) % 60
|
292
|
+
hours = (elapsed_time/3600)
|
293
|
+
elapsed_time_s = "%2.2d:%2.2d:%2.2d" % [hours, minutes, seconds]
|
294
|
+
format = "%Y-%m-%d-%a-%H:%M:%S"
|
295
|
+
start_time_s = @counts[:start_time].strftime(format)
|
296
|
+
end_time_s = @counts[:end_time].strftime(format)
|
297
|
+
data = [
|
298
|
+
{'Start Time' => :label, start_time_s => :good},
|
299
|
+
{'End Time' => :label, end_time_s => :good},
|
300
|
+
{'Elapsed Time' => :label, elapsed_time_s => :good},
|
301
|
+
]
|
302
|
+
table2(body, data, 'Times')
|
303
|
+
body.add_element(Element.new('p'))
|
304
|
+
|
305
|
+
# Counts.
|
306
|
+
data = [
|
307
|
+
{'Source Pages' => :label, @counts[:source_pages] => :good},
|
308
|
+
{'Target Pages' => :label, @counts[:target_pages] => :good},
|
309
|
+
{'Links Checked' => :label, @counts[:links_checked] => :good},
|
310
|
+
{'Links Broken' => :label, @counts[:links_broken] => :bad},
|
311
|
+
]
|
312
|
+
table2(body, data, 'Counts')
|
313
|
+
body.add_element(Element.new('p'))
|
314
|
+
|
315
|
+
end
|
316
|
+
|
317
|
+
def add_broken_links(body)
|
318
|
+
h2 = body.add_element(Element.new('h2'))
|
319
|
+
h2.text = 'Broken Links by Source Page'
|
320
|
+
|
321
|
+
if @counts[:links_broken] == 0
|
322
|
+
p = body.add_element('p')
|
323
|
+
p.text = 'None.'
|
324
|
+
return
|
325
|
+
end
|
326
|
+
|
327
|
+
ul = body.add_element(Element.new('ul'))
|
328
|
+
li = ul.add_element(Element.new('li'))
|
329
|
+
li.text = 'Href: the href of the anchor element.'
|
330
|
+
li = ul.add_element(Element.new('li'))
|
331
|
+
li.text = 'Text: the text of the anchor element.'
|
332
|
+
li = ul.add_element(Element.new('li'))
|
333
|
+
li.text = 'Path: the URL or path of the link (not including the fragment):'
|
334
|
+
ul2 = li.add_element(Element.new('ul'))
|
335
|
+
li2 = ul2.add_element(Element.new('li'))
|
336
|
+
li2.text = 'For an on-site link, an abbreviated path is given.'
|
337
|
+
li2 = ul2.add_element(Element.new('li'))
|
338
|
+
li2.text = <<EOT
|
339
|
+
For an off-site link, the full URL is given.
|
340
|
+
If the path is reddish, the page was not found.
|
341
|
+
EOT
|
342
|
+
li = ul.add_element(Element.new('li'))
|
343
|
+
li.text = <<EOT
|
344
|
+
Fragment: the fragment of the link.
|
345
|
+
If the fragment is reddish, fragment was not found.
|
346
|
+
EOT
|
347
|
+
|
348
|
+
@pages.each_pair do |path, page|
|
349
|
+
broken_links = page.links.select {|link| !link.valid_p }
|
350
|
+
next if broken_links.empty?
|
351
|
+
|
352
|
+
h3 = body.add_element(Element.new('h3'))
|
353
|
+
a = Element.new('a')
|
354
|
+
a.text = path
|
355
|
+
a.add_attribute('href', path)
|
356
|
+
h3.add_element(a)
|
357
|
+
|
358
|
+
broken_links.each do |link|
|
359
|
+
data = []
|
360
|
+
# Text, URL, fragment
|
361
|
+
a = Element.new('a')
|
362
|
+
a.text = link.href
|
363
|
+
a.add_attribute('href', link.href)
|
364
|
+
data.push({'Href' => :label, a => :bad})
|
365
|
+
data.push({'Text' => :label, link.text => :good})
|
366
|
+
fragment_p = !link.fragment.nil?
|
367
|
+
class_ = fragment_p ? :good : :bad
|
368
|
+
data.push({'Path' => :label, link.real_path => class_})
|
369
|
+
class_ = fragment_p ? :bad : :good
|
370
|
+
data.push({'Fragment' => :label, link.fragment => class_})
|
371
|
+
if link.exception
|
372
|
+
data.push({'Exception' => :label, link.exception.class => :bad})
|
373
|
+
data.push({'Message' => :label, link.exception.message => :bad})
|
374
|
+
end
|
375
|
+
table2(body, data)
|
376
|
+
body.add_element(Element.new('p'))
|
377
|
+
end
|
378
|
+
end
|
379
|
+
|
380
|
+
end
|
381
|
+
|
382
|
+
def add_offsite_links(body)
|
383
|
+
h2 = body.add_element(Element.new('h2'))
|
384
|
+
h2.text = 'Off-Site Links by Source Page'
|
385
|
+
@pages.each_pair do |path, page|
|
386
|
+
offsite_links = page.links.select do |link|
|
387
|
+
RDocLinkChecker.offsite?(link.href)
|
388
|
+
end
|
389
|
+
next if offsite_links.empty?
|
390
|
+
|
391
|
+
h3 = body.add_element(Element.new('h3'))
|
392
|
+
a = Element.new('a')
|
393
|
+
a.text = path
|
394
|
+
a.add_attribute('href', path)
|
395
|
+
h3.add_element(a)
|
396
|
+
|
397
|
+
offsite_links.each do |link|
|
398
|
+
data = []
|
399
|
+
# Text, URL, fragment
|
400
|
+
a = Element.new('a')
|
401
|
+
a.text = link.href
|
402
|
+
a.add_attribute('href', link.href)
|
403
|
+
class_ = link.valid_p ? :good : :bad
|
404
|
+
data.push({'Href' => :label, a => class_})
|
405
|
+
data.push({'Text' => :label, link.text => :good})
|
406
|
+
table2(body, data)
|
407
|
+
body.add_element(Element.new('p'))
|
408
|
+
end
|
409
|
+
end
|
410
|
+
end
|
411
|
+
|
412
|
+
Classes = {
|
413
|
+
label: 'label center neutral',
|
414
|
+
good: 'data center good',
|
415
|
+
iffy: 'data center iffy',
|
416
|
+
bad: 'data center bad',
|
417
|
+
}
|
418
|
+
|
419
|
+
def table2(parent, data, title = nil)
|
420
|
+
data = data.dup
|
421
|
+
table = parent.add_element(Element.new('table'))
|
422
|
+
if title
|
423
|
+
tr = table.add_element(Element.new('tr)'))
|
424
|
+
th = tr.add_element(Element.new('th'))
|
425
|
+
th.add_attribute('colspan', 2)
|
426
|
+
if title.kind_of?(REXML::Element)
|
427
|
+
th.add_element(title)
|
428
|
+
else
|
429
|
+
th.text = title
|
430
|
+
end
|
431
|
+
end
|
432
|
+
data.each do |row_h|
|
433
|
+
label, label_class, value, value_class = row_h.flatten
|
434
|
+
tr = table.add_element(Element.new('tr'))
|
435
|
+
td = tr.add_element(Element.new('td'))
|
436
|
+
td.text = label
|
437
|
+
td.add_attribute('class', Classes[label_class])
|
438
|
+
td = tr.add_element(Element.new('td'))
|
439
|
+
if value.kind_of?(REXML::Element)
|
440
|
+
td.add_element(value)
|
441
|
+
else
|
442
|
+
td.text = value
|
443
|
+
end
|
444
|
+
td.add_attribute('class', Classes[value_class])
|
445
|
+
end
|
446
|
+
end
|
447
|
+
|
448
|
+
class Error; end
|
449
|
+
|
450
|
+
class HttpResponseError < Error
|
451
|
+
|
452
|
+
attr_accessor :url, :x
|
453
|
+
|
454
|
+
def initialize(url, x)
|
455
|
+
self.url = url
|
456
|
+
self.x = x
|
457
|
+
end
|
458
|
+
|
459
|
+
def message
|
460
|
+
<<EOT
|
461
|
+
#{self.class.name}:
|
462
|
+
An exception was raised when checking page availability with Net::HTTP:
|
463
|
+
Url: #{url}
|
464
|
+
Class: #{x.class}
|
465
|
+
Message: #{x.message}
|
466
|
+
EOT
|
467
|
+
end
|
468
|
+
|
469
|
+
end
|
470
|
+
|
471
|
+
class HttpStatusCodeError < Error
|
472
|
+
|
473
|
+
attr_accessor :url, :code
|
474
|
+
|
475
|
+
def initialize(url, code)
|
476
|
+
self.url = url
|
477
|
+
self.code = code
|
478
|
+
end
|
479
|
+
|
480
|
+
def message
|
481
|
+
<<EOT
|
482
|
+
#{self.class.name}:
|
483
|
+
The return code for the page was not 200:
|
484
|
+
Url: #{url}
|
485
|
+
Return code: #{code}
|
486
|
+
EOT
|
487
|
+
end
|
488
|
+
|
489
|
+
end
|
490
|
+
|
491
|
+
# Class to represent a page.
|
492
|
+
class Page
|
493
|
+
|
494
|
+
attr_accessor :path, :type, :verbose, :pages, :counts, :code, :links, :ids, :dirname, :onsite_only
|
495
|
+
|
496
|
+
# Returns a new \Page object:
|
497
|
+
#
|
498
|
+
# - +path+: a path relative to the HTML directory (if on-site)
|
499
|
+
# or a URL (if off-site).
|
500
|
+
# - +verbose+: whether to put progress message to $stdout.
|
501
|
+
# - +pages+: hash of path/page pairs.
|
502
|
+
# - +counts+: hash of counts.
|
503
|
+
#
|
504
|
+
def initialize(path, verbose, pages, counts, onsite_only)
|
505
|
+
self.path = path
|
506
|
+
self.verbose = verbose
|
507
|
+
self.pages = pages
|
508
|
+
self.counts = counts
|
509
|
+
self.onsite_only = onsite_only
|
510
|
+
self.code = nil
|
511
|
+
self.links = []
|
512
|
+
self.ids = []
|
513
|
+
self.dirname = File.dirname(path)
|
514
|
+
self.dirname = self.dirname == '.' ? '' : dirname
|
515
|
+
end
|
516
|
+
|
517
|
+
# Gather links for the page:
|
518
|
+
#
|
519
|
+
# - +doc+: Nokogiri document to be parsed for links.
|
520
|
+
#
|
521
|
+
def gather_links(doc)
|
522
|
+
puts 'Gathering links' if @verbose
|
523
|
+
i = 0
|
524
|
+
# The links are in the anchors.
|
525
|
+
doc.search('a').each do |a|
|
526
|
+
# Ignore pilcrow (paragraph character) and up-arrow.
|
527
|
+
next if a.text == "\u00B6"
|
528
|
+
next if a.text == "\u2191"
|
529
|
+
|
530
|
+
href = a.attr('href')
|
531
|
+
next if href.nil? or href.empty?
|
532
|
+
next if RDocLinkChecker.offsite?(href) && onsite_only
|
533
|
+
next unless RDocLinkChecker.checkable?(href)
|
534
|
+
|
535
|
+
link = Link.new(href, a.text, dirname)
|
536
|
+
next if link.path.nil? || link.path.empty?
|
537
|
+
|
538
|
+
links.push(link)
|
539
|
+
link.puts(i) if @verbose
|
540
|
+
i += 1
|
541
|
+
end
|
542
|
+
puts "Gathered #{i} links" if @verbose
|
543
|
+
end
|
544
|
+
|
545
|
+
# Gather ids for the page.
|
546
|
+
# +doc+ is the Nokogiri document to be parsed.
|
547
|
+
def gather_ids(doc)
|
548
|
+
# Don't do twice (some pages are both source and target).
|
549
|
+
return unless ids.empty?
|
550
|
+
|
551
|
+
# For off-site, gather all ids, regardless of element.
|
552
|
+
if RDocLinkChecker.offsite?(path)
|
553
|
+
doc.xpath("//*[@id]").each do |element|
|
554
|
+
id = element.attr('id')
|
555
|
+
ids.push(id)
|
556
|
+
end
|
557
|
+
return
|
558
|
+
end
|
559
|
+
|
560
|
+
# We're on-site, which means that the page is RDoc-generated
|
561
|
+
# and we know what to expect.
|
562
|
+
# In theory, an author can link to any element that has an attribute :id.
|
563
|
+
# In practice, gathering all such elements is very time-consuming.
|
564
|
+
# These are the elements currently linked to:
|
565
|
+
#
|
566
|
+
# - body
|
567
|
+
# - a
|
568
|
+
# - div
|
569
|
+
# - dt
|
570
|
+
# - h*
|
571
|
+
#
|
572
|
+
# We can add more as needed (i.e., if/when we have actual broken links).
|
573
|
+
puts 'Gathering potential link targets' if @verbose
|
574
|
+
|
575
|
+
# body element has 'top', which is a link target.
|
576
|
+
body = doc.at('//body')
|
577
|
+
id = body.attribute('id')
|
578
|
+
ids.push(id) if id
|
579
|
+
|
580
|
+
# Some ids are in the as (anchors).
|
581
|
+
body.search('a').each do |a|
|
582
|
+
id = a.attr(id)
|
583
|
+
ids.push(id) if id
|
584
|
+
end
|
585
|
+
|
586
|
+
# Method ids are in divs, but gather only method-detail divs.
|
587
|
+
body.search('div').each do |div|
|
588
|
+
class_ = div.attr('class')
|
589
|
+
next if class_.nil?
|
590
|
+
next unless class_.match('method-')
|
591
|
+
id = div.attr('id')
|
592
|
+
ids.push(id) if id
|
593
|
+
end
|
594
|
+
|
595
|
+
# Constant ids are in dts.
|
596
|
+
body.search('dt').each do |dt|
|
597
|
+
id = dt.attr('id')
|
598
|
+
ids.push(id) if id
|
599
|
+
end
|
600
|
+
|
601
|
+
# Label ids are in headings.
|
602
|
+
%w[h1 h2 h3 h4 h5 h6].each do |tag|
|
603
|
+
body.search(tag).each do |h|
|
604
|
+
id = h.attr('id')
|
605
|
+
ids.push(id) if id
|
606
|
+
end
|
607
|
+
end
|
608
|
+
if @verbose
|
609
|
+
ids.each_with_index do |id, i|
|
610
|
+
puts '%4d %s' % [i, id]
|
611
|
+
end
|
612
|
+
end
|
613
|
+
puts "Gathered #{ids.size} potential link targets" if @verbose
|
614
|
+
|
615
|
+
end
|
616
|
+
|
9
617
|
end
|
10
618
|
|
11
|
-
|
619
|
+
# Class to represent a link.
|
620
|
+
class Link
|
621
|
+
|
622
|
+
attr_accessor :href, :text, :dirname, :path, :fragment, :valid_p, :real_path, :exception
|
623
|
+
|
624
|
+
# Returns a new \Link object:
|
625
|
+
#
|
626
|
+
# - +href+: attribute href from anchor element.
|
627
|
+
# - +text+: attribute text from anchor element.
|
628
|
+
# - +dirname+: directory path of the linking page.
|
629
|
+
#
|
630
|
+
# TODO: accept the anchor element, instead of its href and text.
|
631
|
+
def initialize(href, text, dirname)
|
632
|
+
self.href = href
|
633
|
+
self.text = text
|
634
|
+
self.dirname = dirname
|
635
|
+
path, fragment = href.split('#', 2)
|
636
|
+
self.path = path
|
637
|
+
self.fragment = fragment
|
638
|
+
self.valid_p = nil
|
639
|
+
self.real_path = make_real_path(dirname, path)
|
640
|
+
self.exception = nil
|
641
|
+
end
|
642
|
+
|
643
|
+
# Return the real (not relative) path of the link.
|
644
|
+
def make_real_path(dirname, path)
|
645
|
+
# Trim single dot.
|
646
|
+
return path.sub('./', '') if path.start_with?('./')
|
647
|
+
return path if dirname.nil? || dirname.empty?
|
648
|
+
|
649
|
+
# May have one or more leading '../'.
|
650
|
+
up_dir = '../'
|
651
|
+
levels = path.scan(/(?=#{up_dir})/).count
|
652
|
+
dirs = dirname.split('/')
|
653
|
+
if levels == 0
|
654
|
+
dirs.empty? ? path : File.join(dirname, path)
|
655
|
+
else
|
656
|
+
# Remove leading '../' elements.
|
657
|
+
path = path.gsub(%r[\.\./], '')
|
658
|
+
# Remove the corresponding parts of dirname.
|
659
|
+
dirs.pop(levels)
|
660
|
+
return path if dirs.empty?
|
661
|
+
dirname = dirs.join('/')
|
662
|
+
File.join(dirname, path)
|
663
|
+
end
|
664
|
+
end
|
665
|
+
|
666
|
+
# Returns whether the link has a fragment.
|
667
|
+
def has_fragment?
|
668
|
+
fragment ? true : false
|
669
|
+
end
|
670
|
+
|
671
|
+
# Puts link info onto $stdout.
|
672
|
+
def puts(i)
|
673
|
+
$stdout.puts <<EOT
|
674
|
+
Link #{i}:
|
675
|
+
Href: #{href}
|
676
|
+
Text: #{text}
|
677
|
+
Path: #{path}
|
678
|
+
Fragment: #{fragment}
|
679
|
+
Valid: #{valid_p}
|
680
|
+
Real path: #{real_path}
|
681
|
+
Dirname: #{dirname}
|
682
|
+
EOT
|
683
|
+
end
|
684
|
+
end
|
12
685
|
|
13
686
|
end
|
data/rdoc_link_checker.gemspec
CHANGED
@@ -5,22 +5,12 @@ require 'rdoc_link_checker/version'
|
|
5
5
|
Gem::Specification.new do |spec|
|
6
6
|
spec.name = 'rdoc_link_checker'
|
7
7
|
spec.version = RDocLinkChecker::VERSION
|
8
|
-
spec.authors = ['
|
8
|
+
spec.authors = ['Burdette Lamar']
|
9
9
|
spec.email = ['burdettelamar@yahoo.com']
|
10
10
|
spec.summary = 'Tool to check links in RDoc-generated HTML files.'
|
11
11
|
spec.homepage = 'https://github.com/BurdetteLamar/rdoc_link_checker'
|
12
12
|
spec.license = 'MIT'
|
13
13
|
|
14
|
-
# Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
|
15
|
-
# to allow pushing to a single host or delete this section to allow pushing to any host.
|
16
|
-
# if spec.respond_to?(:metadata)
|
17
|
-
# spec.metadata['allowed_push_host'] = 'https://rubygems.org/'
|
18
|
-
# spec.metadata['allowed_push_host'] = "http://rubygems.org"
|
19
|
-
# else
|
20
|
-
# raise 'RubyGems 2.0 or newer is required to protect against ' \
|
21
|
-
# 'public gem pushes.'
|
22
|
-
# end
|
23
|
-
|
24
14
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
25
15
|
f.match(%r{^(test)/})
|
26
16
|
end
|
@@ -28,6 +18,12 @@ Gem::Specification.new do |spec|
|
|
28
18
|
spec.executables = ['rdoc_link_checker']
|
29
19
|
spec.require_paths = ['lib']
|
30
20
|
|
21
|
+
spec.metadata = {
|
22
|
+
'bug_tracker_uri' => 'https://github.com/BurdetteLamar/rdoc_link_checker/issues',
|
23
|
+
'documentation_uri' => 'https://github.com/BurdetteLamar/rdoc_link_checker/blob/dev/README.md',
|
24
|
+
'homepage_uri' => 'https://github.com/BurdetteLamar/rdoc_link_checker',
|
25
|
+
}
|
26
|
+
|
31
27
|
spec.add_development_dependency 'bundler', '~> 1.14'
|
32
28
|
spec.add_development_dependency 'rake', '~> 12.3.2'
|
33
29
|
spec.add_development_dependency 'minitest', '~> 5.0'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rdoc_link_checker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- Burdette Lamar
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-05-
|
11
|
+
date: 2023-05-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,13 +66,17 @@ files:
|
|
66
66
|
- README.md
|
67
67
|
- Rakefile
|
68
68
|
- bin/rdoc_link_checker
|
69
|
+
- doc/help.txt
|
69
70
|
- lib/rdoc_link_checker.rb
|
70
71
|
- lib/rdoc_link_checker/version.rb
|
71
72
|
- rdoc_link_checker.gemspec
|
72
73
|
homepage: https://github.com/BurdetteLamar/rdoc_link_checker
|
73
74
|
licenses:
|
74
75
|
- MIT
|
75
|
-
metadata:
|
76
|
+
metadata:
|
77
|
+
bug_tracker_uri: https://github.com/BurdetteLamar/rdoc_link_checker/issues
|
78
|
+
documentation_uri: https://github.com/BurdetteLamar/rdoc_link_checker/blob/dev/README.md
|
79
|
+
homepage_uri: https://github.com/BurdetteLamar/rdoc_link_checker
|
76
80
|
post_install_message:
|
77
81
|
rdoc_options: []
|
78
82
|
require_paths:
|