rdoc_link_checker 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc87ba9f414e2d9c949f1eb2f85c20ff6ad3a165f0bed638211ea7a84ac16d69
4
- data.tar.gz: e801fe955f57f847e7d172540a23a6acfc94597e878ee8dad15eb772da155d32
3
+ metadata.gz: 3a9ffa7ca1ef044fdc73a544cd7fb31dce91b0fc0a2f0937d071a8ce1d033ac6
4
+ data.tar.gz: 7e93a01cbb75c2db88050e1ab10aee8e549316d4ae7e06ab80dec38e3af5f9e6
5
5
  SHA512:
6
- metadata.gz: 7487c3faaed27b9e8a3d3c9922dd650891eb7aeb1e9bd50704006037c9c4dd234bacc95482dcd2247ef9222e4288f2b2e789485bcd044d113a565d0fac733f96
7
- data.tar.gz: ebcb50284bb5f96f184d1f0047ed2a21994474c6c3e135dc09532a72ee88fe1fb25fa50e6253d7599966d73287543f7543bde72a34ab536d73b0b165c88a1670
6
+ metadata.gz: ffd501ff86f9bda291348d6a9aa0a982a4ad1b7b5020d963c69a388343a34427b2e9f5282732da8249b1fc2604482938dd96a610db236cd537f37f9abee7d075
7
+ data.tar.gz: 5ee92b068a8cb0feea303e4bf8f669bac3f587086a213cd9324cdd6d66e07c22bf6639c42d5119e25d63ad312bac3211bc2675424fec7c4ad76b0ca1f4770c32
data/README.md CHANGED
@@ -1,6 +1,35 @@
1
1
  # RDoc Link Checker
2
2
 
3
- Not ready for prime time. Just wanted to reserve the name.
3
+ A gem to find broken links in HTML files generated by Ruby RDoc.
4
4
 
5
- Development is active,
6
- so should be up in a few days (say, by the end of May 2023).
5
+ Reports a link as broken if:
6
+
7
+ - The target page given by `href` is not found.
8
+ - The target page is found, but the fragment given by `href`
9
+ is not a link target on that page;
10
+ this usually causes a browser to open at the top of the page
11
+ instead of at the given fragment.
12
+
13
+ Note that some browsers are forgiving, and will open the target
14
+ page at a link target similar to the given fragment;
15
+ for example, fragment ```bar``` may be opened at an element
16
+ with id ```foobar```.
17
+
18
+ ```
19
+ Usage:
20
+ rdoc_link_checker html_dirpath options
21
+
22
+ The argument is the path to a directory containing a tree
23
+ of RDoc-generated HTML files, such as those generated by command
24
+
25
+ rdoc --visibility=private --op html . # Note the trailing dot.
26
+
27
+ Options:
28
+ --onsite_only Check link targets only on pages in the file tree at <html_dirpath>,
29
+ and not those on other local pages or those on the web.
30
+ --no_toc Do not check links on the TOC page (table_of_contents.html).
31
+ --version Print the version and exit.
32
+ --help Print this help and exit.
33
+
34
+ The output is file <html_dirpath>/Report.htm, which reports broken links.
35
+ ```
@@ -5,31 +5,33 @@ require_relative '../lib/rdoc_link_checker'
5
5
 
6
6
  options = GetoptLong.new(
7
7
  ['--html_dirpath', '-d', GetoptLong::REQUIRED_ARGUMENT],
8
- ['--version', '-v', GetoptLong::NO_ARGUMENT],
9
- ['--help', '-h', GetoptLong::NO_ARGUMENT]
8
+ ['--onsite_only', '-l', GetoptLong::NO_ARGUMENT],
9
+ ['--no_toc', '-n', GetoptLong::NO_ARGUMENT],
10
+ ['--version', '-v', GetoptLong::NO_ARGUMENT],
11
+ ['--help', '-h', GetoptLong::NO_ARGUMENT]
10
12
  )
11
13
 
12
- message = nil
13
- case ARGV.size
14
- when 0
15
- message = "Expected one argument; got none."
16
- when 1
17
- # Okay.
18
- else
19
- message = "Expected one argument, not #{ARGV.inspect}."
20
- end
21
- raise ArgumentError.new(message) if message
22
-
23
14
  def help
24
- puts 'Boo!'
15
+ path = File.absolute_path(__FILE__)
16
+ dirname = File.dirname(File.dirname(path))
17
+ filepath = File.join(dirname, 'doc', 'help.txt')
18
+ puts File.read(filepath)
19
+ exit
25
20
  end
26
21
 
27
22
  def version
28
23
  puts RDocLinkChecker::VERSION
24
+ exit
29
25
  end
30
26
 
27
+ onsite_only = false
28
+ no_toc = false
31
29
  options.each do |option, argument|
32
30
  case option
31
+ when '--onsite_only'
32
+ onsite_only = true
33
+ when '--no_toc'
34
+ no_toc = true
33
35
  when '--help'
34
36
  help
35
37
  when '--version'
@@ -37,5 +39,20 @@ options.each do |option, argument|
37
39
  end
38
40
  end
39
41
 
42
# Exactly one positional argument (the HTML directory) must remain
# once the options have been consumed.
message =
  case ARGV.size
  when 0 then "Expected one argument; got none."
  when 1 then nil
  else "Expected one argument, not #{ARGV.inspect}."
  end
raise ArgumentError.new(message) if message

# Run the check; the report is written by RDocLinkChecker#check.
html_dirpath = ARGV[0]
checker = RDocLinkChecker.new(html_dirpath, onsite_only: onsite_only, no_toc: no_toc)
checker.check
data/doc/help.txt ADDED
@@ -0,0 +1,16 @@
1
+ Usage:
2
+ rdoc_link_checker html_dirpath options
3
+
4
+ The argument is the path to a directory containing a tree
5
+ of RDoc-generated HTML files, such as those generated by command
6
+
7
+ rdoc --visibility=private --op html . # Note the trailing dot.
8
+
9
+ Options:
10
+ --onsite_only Check link targets only on pages in the file tree at <html_dirpath>,
11
+ and not those on other local pages or those on the web.
12
+ --no_toc Do not check links on the TOC page (table_of_contents.html).
13
+ --version Print the version and exit.
14
+ --help Print this help and exit.
15
+
16
+ The output is file <html_dirpath>/Report.htm, which reports broken links.
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class RDocLinkChecker
4
- VERSION = "0.2.0"
4
+ VERSION = "0.3.0"
5
5
  end
@@ -1,13 +1,686 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'nokogiri'
4
+ require 'rexml/document'
5
+ require 'find'
6
+ require 'net/http'
7
+
3
8
  require_relative 'rdoc_link_checker/version'
4
9
 
5
10
  class RDocLinkChecker
6
11
 
7
- def initialize(html_dirpath)
8
- puts html_dirpath
12
+ include REXML
13
+
14
+ attr_accessor :html_dirpath, :onsite_only, :no_toc
15
+
16
# Returns a new checker:
#
# - +html_dirpath+: path to the directory tree of HTML files to check.
# - +onsite_only+: stored as given; read by the checking and reporting steps.
# - +no_toc+: stored as given; read when source paths are gathered.
#
# Also resets the page table and the counters, and turns verbose output off.
def initialize(html_dirpath, onsite_only: false, no_toc: false)
  @html_dirpath = html_dirpath
  @onsite_only = onsite_only
  @no_toc = no_toc
  @verbose = false
  @pages = {}
  @counts = { source_pages: 0, target_pages: 0, links_checked: 0, links_broken: 0 }
end
33
+
34
# Runs the whole check from inside the HTML directory, which is also
# where Report.htm is written. Start and end times are recorded in
# @counts around the gathering/verification steps.
def check
  Dir.chdir(html_dirpath) do
    @counts[:start_time] = Time.now
    %i[gather_source_paths create_source_pages create_target_pages verify_links].each do |step|
      send(step)
    end
    @counts[:end_time] = Time.now
    report
  end
end
47
+
48
# Gather paths to source HTML pages: every '.html' path found under the
# current directory, with the leading './' removed. The TOC page is
# dropped when +no_toc+ is set. Stores the list in @source_paths and
# its size in @counts[:source_pages].
def gather_source_paths
  puts 'Gathering source paths' if @verbose
  html_paths = []
  Find.find('.') { |found| html_paths << found if found.end_with?('.html') }
  @source_paths = html_paths.map { |found| found.delete_prefix('./') }
  @source_paths.delete('table_of_contents.html') if no_toc
  if @verbose
    @source_paths.each_with_index do |source_path, i|
      puts '- %4d %s' % [i, source_path]
    end
  end
  @counts[:source_pages] = @source_paths.size
  puts "Gathered #{@source_paths.size} source paths" if @verbose
end
64
+
65
# Create a source \Page object for each source path (in sorted order),
# register it in @pages, and gather its links and ids from the parsed file.
def create_source_pages
  total = @source_paths.size
  puts "Creating #{total} source pages" if @verbose
  @source_paths.sort.each.with_index(1) do |source_path, ordinal|
    progress_s = RDocLinkChecker.progress_s(ordinal, total)
    puts "Creating source page #{source_path} #{progress_s}" if @verbose
    page = Page.new(source_path, @verbose, @pages, @counts, onsite_only)
    @pages[source_path] = page
    doc = Nokogiri::HTML(File.read(source_path))
    page.gather_links(doc)
    page.gather_ids(doc)
    puts "Created source page #{progress_s}" if @verbose
  end
  puts "Created #{@pages.size} source pages" if @verbose
end
82
+
83
# Create a target \Page object for each link
# (unless already created as a source page).
#
# A readable local target is read from disk and parsed for ids; any other
# checkable target is handed to #fetch, and the link is marked invalid if
# #fetch reports an exception. The number of newly created target pages
# is stored in @counts[:target_pages].
def create_target_pages
  target_page_count = 0
  @source_paths = @pages.keys
  @source_paths.each do |source_path|
    # Relative links are resolved against the linking page's directory.
    Dir.chdir(File.dirname(source_path)) do
      source_page = @pages[source_path]
      puts "Creating target pages for #{source_page.links.size} links in #{source_path}" if @verbose
      source_page.links.each_with_index do |link, i|
        next if link.path.nil?
        link.puts(i) if @verbose
        target_path = link.real_path
        if @pages[target_path]
          # Ids were gathered when the page was created. The page is not
          # re-parsed here: the earlier implementation re-used whatever
          # document had been parsed for a previous link, which could put
          # another page's ids into this one.
          puts "Page #{target_path} already created" if @verbose
        elsif File.readable?(link.path)
          puts "Creating target page #{target_path}" if @verbose
          target_page_count += 1
          target_page = Page.new(target_path, @verbose, @pages, @counts, onsite_only)
          @pages[target_path] = target_page
          target_page.gather_ids(Nokogiri::HTML(File.read(link.path)))
          puts "Created target page #{target_path}" if @verbose
        elsif RDocLinkChecker.checkable?(link.path)
          puts "Creating target page #{target_path}" if @verbose
          target_page_count += 1
          target_page = Page.new(target_path, @verbose, @pages, @counts, onsite_only)
          @pages[target_path] = target_page
          puts "Created target page #{target_path}" if @verbose
          link.exception = fetch(link.path, target_page)
          link.valid_p = false if link.exception
        else
          puts "File not readable or checkable: #{target_path}" if @verbose
        end
      end
      puts "Created target pages for #{source_page.links.size} links in #{source_path}" if @verbose
    end
  end
  puts "Created #{target_page_count} target pages" if @verbose
  @counts[:target_pages] = target_page_count
end
136
+
137
# Verify that each link target exists.
#
# For every link whose validity is still undecided (+valid_p+ is nil):
# the link is valid when its target page is known and either the link has
# no fragment or the fragment is among the target page's ids; it is
# invalid when the target page is unknown. Links already marked false are
# left alone. Stores totals in @counts[:links_checked] and
# @counts[:links_broken].
def verify_links
  linking_pages = @pages.reject { |_path, page| page.links.empty? }
  puts "Checking links on #{linking_pages.size} pages" if @verbose
  link_count = 0
  broken_count = 0
  linking_pages.each_pair do |path, page|
    puts "Checking #{page.links.size} links on page #{path}" if @verbose
    link_count += page.links.size
    page.links.each_with_index do |link, i|
      if link.valid_p.nil? # Don't disturb if already set to false.
        target_page = @pages[link.real_path]
        link.valid_p =
          if target_page
            target_id = link.fragment
            target_id.nil? || target_page.ids.include?(target_id)
          else
            # Was `link_valid_p = false`, which set a stray local and left
            # the link's own flag undecided.
            false
          end
      end
      link.puts(i) if @verbose
      broken_count += 1 unless link.valid_p
    end
    puts "Checked #{page.links.size} links on page #{path}" if @verbose
  end
  puts "Checked #{link_count} links on #{linking_pages.size} pages" if @verbose
  @counts[:links_checked] = link_count
  @counts[:links_broken] = broken_count
end
167
+
168
# Fetch the page from the web and gather its ids into the target page.
#
# - +url+: the URL to request.
# - +target_page+: the \Page that receives the response code and the ids.
#
# Returns nil on success; otherwise returns (does not raise) an error object:
#
# - HttpResponseError when the request itself failed (network, socket,
#   timeout, or TLS problems).
# - HttpStatusCodeError when a response arrived with a bad status code
#   (see #code_bad?).
#
# Any other exception is re-raised.
def fetch(url, target_page)
  puts "Begin fetch target page #{url}" if @verbose
  puts "Getting return code for #{url}" if @verbose
  code = 0
  exception = nil
  response = nil
  begin
    response = Net::HTTP.get_response(URI(url))
    code = response.code.to_i
    target_page.code = code
    puts "Returned #{code} (#{response.class})" if @verbose
  rescue => x
    puts "Raised #{x.class} #{x.message}" if @verbose
    # Only connectivity-type failures are converted into a broken link;
    # anything else is unexpected and propagates.
    expected = /\A(Net|SocketError|IO::TimeoutError|Errno::|OpenSSL::|Timeout::)/
    raise unless x.class.name.to_s.match?(expected)
    exception = RDocLinkChecker::HttpResponseError.new(url, x)
  end
  puts "Got return code #{code} for #{url} " if @verbose
  if code_bad?(code)
    # A response with a bad status (for example 404) is a broken link too;
    # keep the request failure, if any, as the more specific cause.
    exception ||= RDocLinkChecker::HttpStatusCodeError.new(url, code)
  elsif content_type_html?(response)
    # Only an HTML body can carry fragment targets.
    target_page.gather_ids(Nokogiri::HTML(response.body))
  end
  puts "End fetch target page #{url}" if @verbose
  exception
end
196
+
197
# Returns whether the code is bad (zero or >= 400);
# a nil code is not considered bad.
def code_bad?(code)
  !code.nil? && (code == 0 || code >= 400)
end
202
+
203
# Returns whether the response body should be HTML, judged by whether the
# Content-Type header mentions 'html'. Returns false when there is no
# response or no such header; otherwise returns the match result
# (truthy on a match, nil on no match).
def content_type_html?(response)
  content_type = response && response['Content-Type']
  return false unless content_type
  content_type.match('html')
end
209
+
210
# Returns whether the path is offsite, i.e. an http or https URL.
#
# The scheme separator is required, so a local file whose name merely
# begins with 'http' (for example 'http_rb.html') is not off-site.
def self.offsite?(path)
  path.match?(%r{\Ahttps?://}i)
end
214
+
215
# Returns the fragment (the text after the first '#') of the given
# path or URL, or +nil+ if it has none.
def self.get_fragment(s)
  _path, fragment = s.split('#', 2)
  fragment
end
220
+
221
# Returns a progress string giving a fraction and a rounded percentage,
# e.g. "(1/4, 25%)".
def self.progress_s(i, total)
  percent = (i * 100.0 / total).round
  '(%s/%s, %s%%)' % [i, total, percent]
end
227
+
228
# Returns whether the path is checkable: it must parse as a URI whose
# scheme is http, https, or absent. A nil or unparseable path is not
# checkable.
def self.checkable?(path)
  return false unless path
  [nil, 'http', 'https'].include?(URI(path).scheme)
rescue
  false
end
238
+
239
# Generate the report and write it to file Report.htm in the current
# directory. The report has a summary, the broken links, and (unless
# +onsite_only+ is set) the off-site links.
def report
  doc = Document.new('')
  root = doc.add_element(Element.new('root'))

  head = root.add_element(Element.new('head'))
  title = head.add_element(Element.new('title'))
  title.text = 'RDocLinkChecker Report'
  style = head.add_element(Element.new('style'))
  style.text = <<~EOT
    * { font-family: sans-serif }
    .data { font-family: courier }
    .center { text-align: center }
    .good { color: rgb( 0, 97, 0); background-color: rgb(198, 239, 206) } /* Greenish */
    .iffy { color: rgb(156, 101, 0); background-color: rgb(255, 235, 156) } /* Yellowish */
    .bad { color: rgb(156, 0, 6); background-color: rgb(255, 199, 206) } /* Reddish */
    .neutral { color: rgb( 0, 0, 0); background-color: rgb(217, 217, 214) } /* Grayish */
  EOT

  body = root.add_element(Element.new('body'))
  h1 = body.add_element(Element.new('h1'))
  h1.text = 'RDocLinkChecker Report'

  add_summary(body)
  add_broken_links(body)
  add_offsite_links(body) unless onsite_only
  report_file_path = 'Report.htm' # _Not_ .html.
  # Block form closes (and flushes) the file even if writing raises.
  File.open(report_file_path, 'w') { |file| doc.write(file, 2) }
end
269
+
270
# Adds the 'Summary' section to +body+: a table of the checker's
# parameters, a table of start/end/elapsed times, and a table of counts,
# each followed by an empty paragraph.
def add_summary(body)
  body.add_element(Element.new('h2')).text = 'Summary'

  # Parameters table.
  parameter_rows = [:html_dirpath, :onsite_only, :no_toc].map do |sym|
    {sym => :label, send(sym).inspect => :good}
  end
  table2(body, parameter_rows, 'Parameters')
  body.add_element(Element.new('p'))

  # Times table.
  started = @counts[:start_time]
  finished = @counts[:end_time]
  elapsed_time = finished - started
  clock_parts = [elapsed_time / 3600, (elapsed_time / 60) % 60, elapsed_time % 60]
  elapsed_time_s = "%2.2d:%2.2d:%2.2d" % clock_parts
  stamp_format = "%Y-%m-%d-%a-%H:%M:%S"
  time_rows = [
    {'Start Time' => :label, started.strftime(stamp_format) => :good},
    {'End Time' => :label, finished.strftime(stamp_format) => :good},
    {'Elapsed Time' => :label, elapsed_time_s => :good},
  ]
  table2(body, time_rows, 'Times')
  body.add_element(Element.new('p'))

  # Counts.
  count_rows = [
    ['Source Pages', :source_pages, :good],
    ['Target Pages', :target_pages, :good],
    ['Links Checked', :links_checked, :good],
    ['Links Broken', :links_broken, :bad],
  ].map do |label, key, value_class|
    {label => :label, @counts[key] => value_class}
  end
  table2(body, count_rows, 'Counts')
  body.add_element(Element.new('p'))
end
316
+
317
# Adds the 'Broken Links by Source Page' section to +body+.
#
# When no links are broken the section just says 'None.'. Otherwise it
# holds a legend, then for each page having broken links a heading
# linking to the page and one table per broken link.
def add_broken_links(body)
  body.add_element(Element.new('h2')).text = 'Broken Links by Source Page'

  if @counts[:links_broken] == 0
    body.add_element('p').text = 'None.'
    return
  end

  # Legend explaining the rows of each table.
  legend = body.add_element(Element.new('ul'))
  legend.add_element(Element.new('li')).text = 'Href: the href of the anchor element.'
  legend.add_element(Element.new('li')).text = 'Text: the text of the anchor element.'
  path_item = legend.add_element(Element.new('li'))
  path_item.text = 'Path: the URL or path of the link (not including the fragment):'
  path_notes = path_item.add_element(Element.new('ul'))
  path_notes.add_element(Element.new('li')).text = 'For an on-site link, an abbreviated path is given.'
  offsite_note = path_notes.add_element(Element.new('li'))
  offsite_note.text = <<~EOT
    For an off-site link, the full URL is given.
    If the path is reddish, the page was not found.
  EOT
  fragment_item = legend.add_element(Element.new('li'))
  fragment_item.text = <<~EOT
    Fragment: the fragment of the link.
    If the fragment is reddish, fragment was not found.
  EOT

  @pages.each_pair do |path, page|
    broken_links = page.links.reject(&:valid_p)
    next if broken_links.empty?

    heading = body.add_element(Element.new('h3'))
    page_anchor = Element.new('a')
    page_anchor.text = path
    page_anchor.add_attribute('href', path)
    heading.add_element(page_anchor)

    broken_links.each do |link|
      href_anchor = Element.new('a')
      href_anchor.text = link.href
      href_anchor.add_attribute('href', link.href)
      # With a fragment, the fragment is shown as the culprit;
      # without one, the path is.
      has_fragment = !link.fragment.nil?
      rows = [
        {'Href' => :label, href_anchor => :bad},
        {'Text' => :label, link.text => :good},
        {'Path' => :label, link.real_path => (has_fragment ? :good : :bad)},
        {'Fragment' => :label, link.fragment => (has_fragment ? :bad : :good)},
      ]
      if link.exception
        rows.push({'Exception' => :label, link.exception.class => :bad})
        rows.push({'Message' => :label, link.exception.message => :bad})
      end
      table2(body, rows)
      body.add_element(Element.new('p'))
    end
  end
end
381
+
382
# Adds the 'Off-Site Links by Source Page' section to +body+: for each
# page having off-site links, a heading linking to the page and one
# small table per off-site link, colored by the link's validity.
def add_offsite_links(body)
  body.add_element(Element.new('h2')).text = 'Off-Site Links by Source Page'
  @pages.each_pair do |path, page|
    offsite_links = page.links.select { |link| RDocLinkChecker.offsite?(link.href) }
    next if offsite_links.empty?

    heading = body.add_element(Element.new('h3'))
    page_anchor = Element.new('a')
    page_anchor.text = path
    page_anchor.add_attribute('href', path)
    heading.add_element(page_anchor)

    offsite_links.each do |link|
      href_anchor = Element.new('a')
      href_anchor.text = link.href
      href_anchor.add_attribute('href', link.href)
      rows = [
        {'Href' => :label, href_anchor => (link.valid_p ? :good : :bad)},
        {'Text' => :label, link.text => :good},
      ]
      table2(body, rows)
      body.add_element(Element.new('p'))
    end
  end
end
411
+
412
# CSS class attribute values, keyed by the cell kinds used in table rows.
Classes = {
  label: 'label center neutral',
  good: 'data center good',
  iffy: 'data center iffy',
  bad: 'data center bad',
}

# Adds a two-column table to +parent+:
#
# - +parent+: REXML element that receives the table.
# - +data+: array of row hashes; each has two entries, in order:
#   label => label cell kind, then value => value cell kind.
#   Cell kinds are keys of Classes. A label or value that is an
#   REXML element is added as a child; anything else becomes text.
# - +title+: optional heading spanning both columns (string or REXML element).
def table2(parent, data, title = nil)
  table = parent.add_element(REXML::Element.new('table'))
  if title
    # Was Element.new('tr)'), which produced an invalid <tr)> element.
    tr = table.add_element(REXML::Element.new('tr'))
    th = tr.add_element(REXML::Element.new('th'))
    th.add_attribute('colspan', 2)
    if title.kind_of?(REXML::Element)
      th.add_element(title)
    else
      th.text = title
    end
  end
  data.each do |row_h|
    # Hash#flatten yields [label, label kind, value, value kind].
    label, label_class, value, value_class = row_h.flatten
    tr = table.add_element(REXML::Element.new('tr'))
    td = tr.add_element(REXML::Element.new('td'))
    td.text = label
    td.add_attribute('class', Classes[label_class])
    td = tr.add_element(REXML::Element.new('td'))
    if value.kind_of?(REXML::Element)
      td.add_element(value)
    else
      td.text = value
    end
    td.add_attribute('class', Classes[value_class])
  end
end
447
+
448
# Base class for the checker's errors. Derives from StandardError so that
# instances can be raised and rescued like any other Ruby error.
class Error < StandardError; end

# Wraps an exception raised while requesting a page with Net::HTTP.
class HttpResponseError < Error

  # +url+ is the requested URL; +x+ is the exception that was raised.
  attr_accessor :url, :x

  def initialize(url, x)
    super() # The text comes from #message below.
    self.url = url
    self.x = x
  end

  # Returns a multi-line description of the failure.
  def message
    <<~EOT
      #{self.class.name}:
        An exception was raised when checking page availability with Net::HTTP:
          Url: #{url}
          Class: #{x.class}
          Message: #{x.message}
    EOT
  end

end

# Describes a page whose HTTP return code was not acceptable.
class HttpStatusCodeError < Error

  # +url+ is the requested URL; +code+ is the return code received.
  attr_accessor :url, :code

  def initialize(url, code)
    super() # The text comes from #message below.
    self.url = url
    self.code = code
  end

  # Returns a multi-line description of the failure.
  def message
    <<~EOT
      #{self.class.name}:
        The return code for the page was not 200:
          Url: #{url}
          Return code: #{code}
    EOT
  end

end
490
+
491
# Class to represent a page.
class Page

  attr_accessor :path, :type, :verbose, :pages, :counts, :code, :links, :ids, :dirname, :onsite_only

  # Returns a new \Page object:
  #
  # - +path+: a path relative to the HTML directory (if on-site)
  #   or a URL (if off-site).
  # - +verbose+: whether to put progress message to $stdout.
  # - +pages+: hash of path/page pairs.
  # - +counts+: hash of counts.
  # - +onsite_only+: whether off-site links are to be skipped
  #   when links are gathered.
  #
  def initialize(path, verbose, pages, counts, onsite_only)
    self.path = path
    self.verbose = verbose
    self.pages = pages
    self.counts = counts
    self.onsite_only = onsite_only
    self.code = nil
    self.links = []
    self.ids = []
    # A page at the top of the tree has an empty dirname, not '.'.
    parent = File.dirname(path)
    self.dirname = parent == '.' ? '' : parent
  end

  # Gather links for the page:
  #
  # - +doc+: Nokogiri document to be parsed for links.
  #
  # Skipped: anchors whose text is a pilcrow or an up-arrow, anchors
  # without an href, off-site hrefs when +onsite_only+ is set, hrefs that
  # are not checkable, and hrefs with an empty path (fragment only).
  def gather_links(doc)
    puts 'Gathering links' if @verbose
    i = 0
    # The links are in the anchors.
    doc.search('a').each do |a|
      # Ignore pilcrow (paragraph character) and up-arrow.
      next if a.text == "\u00B6"
      next if a.text == "\u2191"

      href = a.attr('href')
      next if href.nil? || href.empty?
      next if RDocLinkChecker.offsite?(href) && onsite_only
      next unless RDocLinkChecker.checkable?(href)

      link = Link.new(href, a.text, dirname)
      next if link.path.nil? || link.path.empty?

      links.push(link)
      link.puts(i) if @verbose
      i += 1
    end
    puts "Gathered #{i} links" if @verbose
  end

  # Gather ids for the page.
  # +doc+ is the Nokogiri document to be parsed.
  # All ids are stored as strings.
  def gather_ids(doc)
    # Don't do twice (some pages are both source and target).
    return unless ids.empty?

    # For off-site, gather all ids, regardless of element.
    if RDocLinkChecker.offsite?(path)
      doc.xpath("//*[@id]").each do |element|
        ids.push(element.attr('id'))
      end
      return
    end

    # We're on-site, which means that the page is RDoc-generated
    # and we know what to expect.
    # In theory, an author can link to any element that has an attribute :id.
    # In practice, gathering all such elements is very time-consuming.
    # These are the elements currently linked to:
    #
    # - body
    # - a
    # - div
    # - dt
    # - h*
    #
    # We can add more as needed (i.e., if/when we have actual broken links).
    puts 'Gathering potential link targets' if @verbose

    body = doc.at('//body')
    # Nothing to gather from a document without a body.
    return if body.nil?

    # body element has 'top', which is a link target.
    # Use #attr (string value) so that the id compares equal to a fragment.
    body_id = body.attr('id')
    ids.push(body_id) if body_id

    # Some ids are in the as (anchors).
    body.search('a').each do |a|
      id = a.attr('id')
      ids.push(id) if id
    end

    # Method ids are in divs, but gather only method-detail divs.
    body.search('div').each do |div|
      class_ = div.attr('class')
      next if class_.nil?
      next unless class_.include?('method-')
      id = div.attr('id')
      ids.push(id) if id
    end

    # Constant ids are in dts.
    body.search('dt').each do |dt|
      id = dt.attr('id')
      ids.push(id) if id
    end

    # Label ids are in headings.
    %w[h1 h2 h3 h4 h5 h6].each do |tag|
      body.search(tag).each do |h|
        id = h.attr('id')
        ids.push(id) if id
      end
    end
    if @verbose
      ids.each_with_index do |id, i|
        puts '%4d %s' % [i, id]
      end
    end
    puts "Gathered #{ids.size} potential link targets" if @verbose

  end

end
10
618
 
11
- class Error < StandardError; end
619
# Class to represent a link.
class Link

  attr_accessor :href, :text, :dirname, :path, :fragment, :valid_p, :real_path, :exception

  # Returns a new \Link object:
  #
  # - +href+: attribute href from anchor element.
  # - +text+: attribute text from anchor element.
  # - +dirname+: directory path of the linking page.
  #
  # The href is split at its first '#' into +path+ and +fragment+;
  # +valid_p+ and +exception+ start out nil (not yet checked).
  #
  # TODO: accept the anchor element, instead of its href and text.
  def initialize(href, text, dirname)
    self.href = href
    self.text = text
    self.dirname = dirname
    path, fragment = href.split('#', 2)
    self.path = path
    self.fragment = fragment
    self.valid_p = nil
    self.real_path = make_real_path(dirname, path)
    self.exception = nil
  end

  # Return the real (not relative) path of the link:
  #
  # - A URL with a scheme (e.g. https://...) is returned unchanged.
  # - Otherwise a leading './' is dropped, each leading '../' removes one
  #   trailing directory from +dirname+, and what remains of +dirname+
  #   is prepended.
  #
  # Returns nil if +path+ is nil.
  def make_real_path(dirname, path)
    return path if path.nil?
    # An absolute URL does not depend on the linking page's directory.
    return path if path.match?(%r{\A[a-z][a-z0-9+.-]*://}i)

    # Trim single dot.
    path = path.delete_prefix('./')
    return path if dirname.nil? || dirname.empty?

    # May have one or more leading '../'; only leading ones climb.
    dirs = dirname.split('/')
    while path.start_with?('../')
      path = path.delete_prefix('../')
      dirs.pop
    end
    dirs.empty? ? path : File.join(dirs.join('/'), path)
  end

  # Returns whether the link has a fragment.
  def has_fragment?
    !fragment.nil?
  end

  # Puts link info onto $stdout.
  def puts(i)
    $stdout.puts <<~EOT
      Link #{i}:
        Href: #{href}
        Text: #{text}
        Path: #{path}
        Fragment: #{fragment}
        Valid: #{valid_p}
        Real path: #{real_path}
        Dirname: #{dirname}
    EOT
  end
end
12
685
 
13
686
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rdoc_link_checker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - burdettelamar
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-05-19 00:00:00.000000000 Z
11
+ date: 2023-05-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,6 +66,7 @@ files:
66
66
  - README.md
67
67
  - Rakefile
68
68
  - bin/rdoc_link_checker
69
+ - doc/help.txt
69
70
  - lib/rdoc_link_checker.rb
70
71
  - lib/rdoc_link_checker/version.rb
71
72
  - rdoc_link_checker.gemspec