rdoc_link_checker 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,674 +1,716 @@
1
- # frozen_string_literal: true
2
-
3
- require 'nokogiri'
4
- require 'rexml/document'
5
- require 'find'
6
- require 'net/http'
7
-
8
- require_relative 'rdoc_link_checker/version'
9
-
10
- class RDocLinkChecker
11
-
12
- include REXML
13
-
14
- attr_accessor :html_dirpath, :onsite_only, :no_toc
15
-
16
- attr_accessor :source_paths, :pages
17
-
18
- def initialize(
19
- html_dirpath,
20
- onsite_only: false,
21
- no_toc: false
22
- )
23
- self.html_dirpath = html_dirpath
24
- self.onsite_only = onsite_only
25
- self.no_toc = no_toc
26
- self.pages = {}
27
- @counts = {
28
- source_pages: 0,
29
- target_pages: 0,
30
- links_checked: 0,
31
- links_broken: 0,
32
- }
33
- end
34
-
35
- def check
36
- # All work is done in the HTML directory,
37
- # and that is where Report.htm will be put.
38
- Dir.chdir(html_dirpath) do |dir|
39
- @counts[:start_time] = Time.new
40
- gather_source_paths
41
- create_source_pages
42
- create_target_pages
43
- verify_links
44
- @counts[:end_time] = Time.new
45
- report
46
- end
47
- end
48
-
49
- # Gather paths to source HTML pages.
50
- def gather_source_paths
51
- paths = []
52
- paths = Find.find('.').select {|path| path.end_with?('.html') }
53
- # Remove leading './'.
54
- self.source_paths = paths.map{|path| path.sub(%r[^\./], '')}
55
- @counts[:source_pages] = source_paths.size
56
- end
57
-
58
- # Create a source \Page object for each source path.
59
- # Gather its links and ids.
60
- def create_source_pages
61
- source_paths.sort.each_with_index do |source_path, i|
62
- progress_s = RDocLinkChecker.progress_s(i + 1, source_paths.size)
63
- source_page = Page.new(:source, source_path, onsite_only, pages: pages, counts: @counts)
64
- pages[source_path] = source_page
65
- source_text = File.read(source_path)
66
- doc = Nokogiri::HTML(source_text)
67
- if source_path == 'table_of_contents.html'
68
- source_page.gather_links(doc) unless no_toc
69
- else
70
- source_page.gather_links(doc)
71
- end
72
- source_page.gather_ids(doc)
73
- end
74
- end
75
-
76
- # Create a target \Page object for each link
77
- # (unless already created as a source page).
78
- def create_target_pages
79
- doc = nil
80
- target_page_count = 0
81
- source_paths = pages.keys
82
- source_paths.each do |source_path|
83
- # Needed for relative links to work.
84
- dirname = File.dirname(source_path)
85
- Dir.chdir(dirname) do
86
- source_page = pages[source_path]
87
- source_page.links.each_with_index do |link, i|
88
- next if link.path.nil?
89
- target_path = link.real_path
90
- if pages[target_path]
91
- target_page = pages[target_path]
92
- else
93
- target_page_count += 1
94
- target_page = Page.new(:target, target_path, onsite_only, pages: pages, counts: @counts)
95
- pages[target_path] = target_page
96
- if File.readable?(link.path)
97
- target_text = File.read(link.path)
98
- doc = Nokogiri::HTML(target_text)
99
- target_page.gather_ids(doc)
100
- elsif RDocLinkChecker.checkable?(link.path)
101
- link.exception = fetch(link.path, target_page)
102
- link.valid_p = false if link.exception
103
- else
104
- # File not readable or checkable.
105
- end
106
- end
107
- next if target_page.nil?
108
- if link.has_fragment? && target_page.ids.empty?
109
- doc || doc = Nokogiri::HTML(target_text)
110
- target_page.gather_ids(doc)
111
- end
112
- end
113
- end
114
- end
115
- @counts[:target_pages] = target_page_count
116
- end
117
-
118
- # Verify that each link target exists.
119
- def verify_links
120
- linking_pages = pages.select do |path, page|
121
- !page.links.empty?
122
- end
123
- link_count = 0
124
- broken_count = 0
125
- linking_pages.each_pair do |path, page|
126
- link_count += page.links.size
127
- page.links.each_with_index do |link, i|
128
- if link.valid_p.nil? # Don't disturb if already set to false.
129
- target_page = pages[link.real_path]
130
- if target_page
131
- target_id = link.fragment
132
- link.valid_p = target_id.nil? || target_page.ids.include?(target_id)
133
- else
134
- link.valid_p = false
135
- end
136
- end
137
- broken_count += 1 unless link.valid_p
138
- end
139
- end
140
- @counts[:links_checked] = link_count
141
- @counts[:links_broken] = broken_count
142
- end
143
-
144
- # Fetch the page from the web and gather its ids into the target page.
145
- # Returns exception or nil.
146
- def fetch(url, target_page)
147
- code = 0
148
- exception = nil
149
- begin
150
- response = Net::HTTP.get_response(URI(url))
151
- code = response.code.to_i
152
- target_page.code = code
153
- rescue => x
154
- raise unless x.class.name.match(/^(Net|SocketError|IO::TimeoutError|Errno::)/)
155
- exception = RDocLinkChecker::HttpResponseError.new(url, x)
156
- end
157
- # Don't load if bad code, or no response, or if not html.
158
- if !code_bad?(code)
159
- if content_type_html?(response)
160
- doc = Nokogiri::HTML(response.body)
161
- target_page.gather_ids(doc)
162
- end
163
- end
164
- exception
165
- end
166
-
167
- # Returns whether the code is bad (zero or >= 400).
168
- def code_bad?(code)
169
- return false if code.nil?
170
- (code == 0) || (code >= 400)
171
- end
172
-
173
- # Returns whether the response body should be HTML.
174
- def content_type_html?(response)
175
- return false unless response
176
- return false unless response['Content-Type']
177
- response['Content-Type'].match('html')
178
- end
179
-
180
- # Returns whether the path is offsite.
181
- def self.offsite?(path)
182
- path.start_with?('http')
183
- end
184
-
185
- # Returns the string fragment for the given path or URL, or +nil+ if there is none.
186
- def self.get_fragment(s)
187
- a = s.split('#', 2)
188
- a.size == 2 ? a[1] : nil
189
- end
190
-
191
- # Returns a progress string giving a fraction and percentage.
192
- def self.progress_s(i, total)
193
- fraction_s = "#{i}/#{total}"
194
- percent_i = (i*100.0/total).round
195
- "(#{fraction_s}, #{percent_i}%)"
196
- end
197
-
198
- # Returns whether the path is checkable.
199
- def self.checkable?(path)
200
- return false unless path
201
- begin
202
- uri = URI(path)
203
- return ['http', 'https', nil].include?(uri.scheme)
204
- rescue
205
- return false
206
- end
207
- end
208
-
209
- # Generate the \RDocLinkChecker report and write it to file Report.htm.
210
- def report
211
-
212
- doc = Document.new('')
213
- root = doc.add_element(Element.new('root'))
214
-
215
- head = root.add_element(Element.new('head'))
216
- title = head.add_element(Element.new('title'))
217
- title.text = 'RDocLinkChecker Report'
218
- style = head.add_element(Element.new('style'))
219
- style.text = <<EOT
220
- * { font-family: sans-serif }
221
- .data { font-family: courier }
222
- .center { text-align: center }
223
- .good { color: rgb( 0, 97, 0); background-color: rgb(198, 239, 206) } /* Greenish */
224
- .iffy { color: rgb(156, 101, 0); background-color: rgb(255, 235, 156) } /* Yellowish */
225
- .bad { color: rgb(156, 0, 6); background-color: rgb(255, 199, 206) } /* Reddish */
226
- .neutral { color: rgb( 0, 0, 0); background-color: rgb(217, 217, 214) } /* Grayish */
227
- EOT
228
-
229
- body = root.add_element(Element.new('body'))
230
- h1 = body.add_element(Element.new('h1'))
231
- h1.text = 'RDocLinkChecker Report'
232
-
233
- add_summary(body)
234
- add_broken_links(body)
235
- # add_offsite_links(body) unless onsite_only
236
- report_file_path = 'Report.htm' # _Not_ .html.
237
- doc.write(File.new(report_file_path, 'w'), 2)
238
- end
239
-
240
- def add_summary(body)
241
- h2 = body.add_element(Element.new('h2'))
242
- h2.text = 'Summary'
243
-
244
- # Parameters table.
245
- data = []
246
- [
247
- :html_dirpath,
248
- :onsite_only,
249
- :no_toc
250
- ].each do |sym|
251
- value = send(sym).inspect
252
- row = {sym => :label, value => :good}
253
- data.push(row)
254
- end
255
- table2(body, data, 'parameters', 'Parameters')
256
- body.add_element(Element.new('p'))
257
-
258
- # Times table.
259
- elapsed_time = @counts[:end_time] - @counts[:start_time]
260
- seconds = elapsed_time % 60
261
- minutes = (elapsed_time / 60) % 60
262
- hours = (elapsed_time/3600)
263
- elapsed_time_s = "%2.2d:%2.2d:%2.2d" % [hours, minutes, seconds]
264
- format = "%Y-%m-%d-%a-%H:%M:%SZ"
265
- start_time_s = @counts[:start_time].strftime(format)
266
- end_time_s = @counts[:end_time].strftime(format)
267
- data = [
268
- {'Start Time' => :label, start_time_s => :good},
269
- {'End Time' => :label, end_time_s => :good},
270
- {'Elapsed Time' => :label, elapsed_time_s => :good},
271
- ]
272
- table2(body, data, 'times', 'Times')
273
- body.add_element(Element.new('p'))
274
-
275
- # Counts.
276
- data = [
277
- {'Source Pages' => :label, @counts[:source_pages] => :good},
278
- {'Target Pages' => :label, @counts[:target_pages] => :good},
279
- {'Links Checked' => :label, @counts[:links_checked] => :good},
280
- {'Links Broken' => :label, @counts[:links_broken] => :bad},
281
- ]
282
- table2(body, data, 'counts', 'Counts')
283
- body.add_element(Element.new('p'))
284
-
285
- end
286
-
287
- def add_broken_links(body)
288
- h2 = body.add_element(Element.new('h2'))
289
- h2.text = 'Broken Links by Source Page'
290
-
291
- if @counts[:links_broken] == 0
292
- p = body.add_element('p')
293
- p.text = 'None.'
294
- return
295
- end
296
-
297
- # Legend.
298
- ul = body.add_element(Element.new('ul'))
299
- li = ul.add_element(Element.new('li'))
300
- li.text = 'Href: the href of the anchor element.'
301
- li = ul.add_element(Element.new('li'))
302
- li.text = 'Text: the text of the anchor element.'
303
- li = ul.add_element(Element.new('li'))
304
- li.text = 'Path: the URL or path of the link (not including the fragment):'
305
- ul2 = li.add_element(Element.new('ul'))
306
- li2 = ul2.add_element(Element.new('li'))
307
- li2.text = 'For an on-site link, an abbreviated path is given.'
308
- li2 = ul2.add_element(Element.new('li'))
309
- li2.text = <<EOT
310
- For an off-site link, the full URL is given.
311
- If the path is reddish, the page was not found.
312
- EOT
313
- li = ul.add_element(Element.new('li'))
314
- li.text = <<EOT
315
- Fragment: the fragment of the link.
316
- If the fragment is reddish, fragment was not found.
317
- EOT
318
-
319
- pages.each_pair do |path, page|
320
- broken_links = page.links.select {|link| !link.valid_p }
321
- next if broken_links.empty?
322
-
323
- page_div = body.add_element(Element.new('div'))
324
- page_div.add_attribute('class', 'broken_page')
325
- page_div.add_attribute('path', path)
326
- page_div.add_attribute('count', broken_links.count)
327
- h3 = page_div.add_element(Element.new('h3'))
328
- a = Element.new('a')
329
- a.text = "#{path} (#{broken_links.count})"
330
- a.add_attribute('href', path)
331
- h3.add_element(a)
332
-
333
- broken_links.each do |link|
334
- link_div = page_div.add_element(Element.new('div'))
335
- link_div.add_attribute('class', 'broken_link')
336
- data = []
337
- # Text, URL, fragment
338
- a = Element.new('a')
339
- a.text = link.href
340
- a.add_attribute('href', link.href)
341
- data.push({'Href' => :label, a => :bad})
342
- data.push({'Text' => :label, link.text => :good})
343
- fragment_p = !link.fragment.nil?
344
- class_ = fragment_p ? :good : :bad
345
- data.push({'Path' => :label, link.real_path => class_})
346
- class_ = fragment_p ? :bad : :good
347
- data.push({'Fragment' => :label, link.fragment => class_})
348
- if link.exception
349
- data.push({'Exception' => :label, link.exception.class => :bad})
350
- data.push({'Message' => :label, link.exception.message => :bad})
351
- end
352
- id = link.exception ? 'bad_url' : 'bad_fragment'
353
- table2(link_div, data, id)
354
- page_div.add_element(Element.new('p'))
355
- end
356
- end
357
-
358
- end
359
-
360
- def add_offsite_links(body)
361
- h2 = body.add_element(Element.new('h2'))
362
- h2.text = 'Off-Site Links by Source Page'
363
- none = true
364
- pages.each_pair do |path, page|
365
- offsite_links = page.links.select do |link|
366
- RDocLinkChecker.offsite?(link.href)
367
- end
368
- next if offsite_links.empty?
369
-
370
- none = false
371
- h3 = body.add_element(Element.new('h3'))
372
- a = Element.new('a')
373
- a.text = path
374
- a.add_attribute('href', path)
375
- h3.add_element(a)
376
-
377
- offsite_links.each do |link|
378
- data = []
379
- # Text, URL, fragment
380
- a = Element.new('a')
381
- a.text = link.href
382
- a.add_attribute('href', link.href)
383
- class_ = link.valid_p ? :good : :bad
384
- data.push({'Href' => :label, a => class_})
385
- data.push({'Text' => :label, link.text => :good})
386
- table2(body, data)
387
- body.add_element(Element.new('p'))
388
- end
389
- end
390
- if none
391
- p = body.add_element(Element.new('p'))
392
- p.text = 'None.'
393
- end
394
- end
395
-
396
- Classes = {
397
- label: 'label center neutral',
398
- good: 'data center good',
399
- iffy: 'data center iffy',
400
- bad: 'data center bad',
401
- }
402
-
403
- def table2(parent, data, id, title = nil)
404
- data = data.dup
405
- table = parent.add_element(Element.new('table'))
406
- table.add_attribute('id', id)
407
- if title
408
- tr = table.add_element(Element.new('tr)'))
409
- th = tr.add_element(Element.new('th'))
410
- th.add_attribute('colspan', 2)
411
- if title.kind_of?(REXML::Element)
412
- th.add_element(title)
413
- else
414
- th.text = title
415
- end
416
- end
417
- data.each do |row_h|
418
- label, label_class, value, value_class = row_h.flatten
419
- tr = table.add_element(Element.new('tr'))
420
- td = tr.add_element(Element.new('td'))
421
- td.text = label
422
- td.add_attribute('class', Classes[label_class])
423
- td = tr.add_element(Element.new('td'))
424
- if value.kind_of?(REXML::Element)
425
- td.add_element(value)
426
- else
427
- td.text = value
428
- end
429
- td.add_attribute('class', Classes[value_class])
430
- end
431
- end
432
-
433
- class Error; end
434
-
435
- class HttpResponseError < Error
436
-
437
- attr_accessor :url, :x
438
-
439
- def initialize(url, x)
440
- self.url = url
441
- self.x = x
442
- end
443
-
444
- def message
445
- <<EOT
446
- #{self.class.name}:
447
- An exception was raised when checking page availability with Net::HTTP:
448
- Url: #{url}
449
- Class: #{x.class}
450
- Message: #{x.message}
451
- EOT
452
- end
453
-
454
- end
455
-
456
- class HttpStatusCodeError < Error
457
-
458
- attr_accessor :url, :code
459
-
460
- def initialize(url, code)
461
- self.url = url
462
- self.code = code
463
- end
464
-
465
- def message
466
- <<EOT
467
- #{self.class.name}:
468
- The return code for the page was not 200:
469
- Url: #{url}
470
- Return code: #{code}
471
- EOT
472
- end
473
-
474
- end
475
-
476
- # Class to represent a page.
477
- class Page
478
-
479
- attr_accessor :path, :type, :pages, :counts, :code, :links, :ids, :dirname, :onsite_only
480
-
481
- # Returns a new \Page object:
482
- #
483
- # - +path+: a path relative to the HTML directory (if on-site)
484
- # or a URL (if off-site).
485
- # - +pages+: hash of path/page pairs.
486
- # - +counts+: hash of counts.
487
- #
488
- def initialize(type, path, onsite_only, pages: {}, counts: {})
489
- self.path = path
490
- self.type = type
491
- self.pages = pages
492
- self.counts = counts
493
- self.onsite_only = onsite_only
494
- self.code = nil
495
- self.links = []
496
- self.ids = []
497
- self.dirname = File.dirname(path)
498
- self.dirname = self.dirname == '.' ? '' : dirname
499
- end
500
-
501
- def to_h
502
- {
503
- path: path,
504
- type: type,
505
- dirname: dirname,
506
- code: code
507
- }
508
- end
509
-
510
- # Gather links for the page:
511
- #
512
- # - +doc+: Nokogiri document to be parsed for links.
513
- #
514
- def gather_links(doc)
515
- i = 0
516
- # The links are in the anchors.
517
- doc.search('a').each do |a|
518
- # Ignore pilcrow (paragraph character) and up-arrow.
519
- next if a.text == "\u00B6"
520
- next if a.text == "\u2191"
521
-
522
- href = a.attr('href')
523
- next if href.nil? or href.empty?
524
- next if RDocLinkChecker.offsite?(href) && onsite_only
525
- next unless RDocLinkChecker.checkable?(href)
526
-
527
- link = Link.new(href, a.text, dirname)
528
- next if link.path.nil? || link.path.empty?
529
-
530
- links.push(link)
531
- i += 1
532
- end
533
- end
534
-
535
- # Gather ids for the page.
536
- # +doc+ is the Nokogiri document to be parsed.
537
- def gather_ids(doc)
538
- # Don't do twice (some pages are both source and target).
539
- return unless ids.empty?
540
-
541
- # For off-site, gather all ids, regardless of element.
542
- if RDocLinkChecker.offsite?(path)
543
- doc.xpath("//*[@id]").each do |element|
544
- id = element.attr('id')
545
- ids.push(id)
546
- end
547
- return
548
- end
549
-
550
- # We're on-site, which means that the page is RDoc-generated
551
- # and we know what to expect.
552
- # In theory, an author can link to any element that has an attribute :id.
553
- # In practice, gathering all such elements is very time-consuming.
554
- # These are the elements currently linked to:
555
- #
556
- # - body
557
- # - a
558
- # - div
559
- # - dt
560
- # - h*
561
- #
562
- # We can add more as needed (i.e., if/when we have actual broken links).
563
-
564
- # body element has 'top', which is a link target.
565
- body = doc.at('//body')
566
- id = body.attribute('id')
567
- ids.push(id) if id
568
-
569
- # Some ids are in the as (anchors).
570
- body.search('a').each do |a|
571
- id = a.attr(id)
572
- ids.push(id) if id
573
- end
574
-
575
- # Method ids are in divs, but gather only method-detail divs.
576
- body.search('div').each do |div|
577
- class_ = div.attr('class')
578
- next if class_.nil?
579
- next unless class_.match('method-')
580
- id = div.attr('id')
581
- ids.push(id) if id
582
- end
583
-
584
- # Constant ids are in dts.
585
- body.search('dt').each do |dt|
586
- id = dt.attr('id')
587
- ids.push(id) if id
588
- end
589
-
590
- # Label ids are in headings.
591
- %w[h1 h2 h3 h4 h5 h6].each do |tag|
592
- body.search(tag).each do |h|
593
- id = h.attr('id')
594
- ids.push(id) if id
595
- end
596
- end
597
- end
598
-
599
- end
600
-
601
- # Class to represent a link.
602
- class Link
603
-
604
- attr_accessor :href, :text, :dirname, :path, :fragment, :valid_p, :real_path, :exception
605
-
606
- # Returns a new \Link object:
607
- #
608
- # - +href+: attribute href from anchor element.
609
- # - +text+: attribute text from anchor element.
610
- # - +dirname+: directory path of the linking page.
611
- #
612
- def initialize(href, text, dirname)
613
- self.href = href
614
- self.text = text
615
- self.dirname = dirname
616
- path, fragment = href.split('#', 2)
617
- self.path = path
618
- self.fragment = fragment
619
- self.valid_p = nil
620
- self.real_path = make_real_path(dirname, path)
621
- self.exception = nil
622
- end
623
-
624
- def to_h
625
- {
626
- href: href,
627
- text: text,
628
- }
629
- end
630
-
631
- # Return the real (not relative) path of the link.
632
- def make_real_path(dirname, path)
633
- # Trim single dot.
634
- return path.sub('./', '') if path.start_with?('./')
635
- return path if dirname.nil? || dirname.empty?
636
-
637
- # May have one or more leading '../'.
638
- up_dir = '../'
639
- levels = path.scan(/(?=#{up_dir})/).count
640
- dirs = dirname.split('/')
641
- if levels == 0
642
- dirs.empty? ? path : File.join(dirname, path)
643
- else
644
- # Remove leading '../' elements.
645
- path = path.gsub(%r[\.\./], '')
646
- # Remove the corresponding parts of dirname.
647
- dirs.pop(levels)
648
- return path if dirs.empty?
649
- dirname = dirs.join('/')
650
- File.join(dirname, path)
651
- end
652
- end
653
-
654
- # Returns whether the link has a fragment.
655
- def has_fragment?
656
- fragment ? true : false
657
- end
658
-
659
- # Puts link info onto $stdout.
660
- def puts(i)
661
- $stdout.puts <<EOT
662
- Link #{i}:
663
- Href: #{href}
664
- Text: #{text}
665
- Path: #{path}
666
- Fragment: #{fragment}
667
- Valid: #{valid_p}
668
- Real path: #{real_path}
669
- Dirname: #{dirname}
670
- EOT
671
- end
672
- end
673
-
674
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'rubygems'
4
+ require 'nokogiri'
5
+ require 'rexml/document'
6
+ require 'find'
7
+ require 'net/http'
8
+ require 'json'
9
+
10
+ require_relative 'rdoc_link_checker/version'
11
+
12
+ class RDocLinkChecker
13
+
14
+ include REXML
15
+
16
+ attr_accessor :html_dirpath, :config_filepath, :onsite_only, :no_toc,
17
+ :source_file_omits
18
+
19
+ attr_accessor :source_paths, :pages
20
+
21
+ def initialize(
22
+ html_dirpath,
23
+ config_filepath: nil,
24
+ onsite_only: false,
25
+ no_toc: false
26
+ )
27
+ self.html_dirpath = html_dirpath
28
+ self.config_filepath = config_filepath
29
+ self.onsite_only = onsite_only
30
+ self.no_toc = no_toc
31
+ self.source_file_omits = []
32
+ if config_filepath
33
+ config = JSON.parse(File.read(config_filepath))
34
+ options = config['options']
35
+ if options
36
+ val = options['onsite_only']
37
+ self.onsite_only = val if val
38
+ val = options['no_toc']
39
+ self.no_toc = val if val
40
+ end
41
+ regexp_sources = config['source_file_omits']
42
+ if regexp_sources
43
+ regexp_sources.each do |regexp_source|
44
+ self.source_file_omits.push(Regexp.new(regexp_source))
45
+ end
46
+ end
47
+ end
48
+ self.pages = {}
49
+ @counts = {
50
+ source_pages: 0,
51
+ target_pages: 0,
52
+ links_checked: 0,
53
+ links_broken: 0,
54
+ }
55
+ end
56
+
57
+ def check
58
+ # All work is done in the HTML directory,
59
+ # and that is where Report.htm will be put.
60
+ Dir.chdir(html_dirpath) do |dir|
61
+ @counts[:start_time] = Time.new
62
+ gather_source_paths
63
+ create_source_pages
64
+ create_target_pages
65
+ verify_links
66
+ @counts[:end_time] = Time.new
67
+ report
68
+ end
69
+ end
70
+
71
+ # Gather paths to source HTML pages.
72
+ def gather_source_paths
73
+ paths = []
74
+ paths = Find.find('.').select {|path| path.end_with?('.html') }
75
+ # Remove leading './'.
76
+ self.source_paths = paths.map{|path| path.sub(%r[^\./], '')}
77
+ source_file_omits.each do |re|
78
+ self.source_paths.delete_if do |source_path|
79
+ source_path.match(re)
80
+ end
81
+ end
82
+ @counts[:source_pages] = source_paths.size
83
+ end
84
+
85
+ # Create a source \Page object for each source path.
86
+ # Gather its links and ids.
87
+ def create_source_pages
88
+ source_paths.sort.each_with_index do |source_path, i|
89
+ source_page = Page.new(:source, source_path, onsite_only, pages: pages, counts: @counts)
90
+ pages[source_path] = source_page
91
+ source_page.content_type = 'text/html'
92
+ source_text = File.read(source_path)
93
+ doc = Nokogiri::HTML(source_text)
94
+ if source_path == 'table_of_contents.html'
95
+ source_page.gather_links(doc) unless no_toc
96
+ else
97
+ source_page.gather_links(doc)
98
+ end
99
+ source_page.gather_link_targets(doc)
100
+ end
101
+ end
102
+
103
+ # Create a target \Page object for each link
104
+ # (unless already created as a source page).
105
+ def create_target_pages
106
+ doc = nil
107
+ target_page_count = 0
108
+ source_paths = pages.keys
109
+ source_paths.each do |source_path|
110
+ # Needed for relative links to work.
111
+ dirname = File.dirname(source_path)
112
+ Dir.chdir(dirname) do
113
+ source_page = pages[source_path]
114
+ source_page.links.each_with_index do |link, i|
115
+ next if link.path.nil?
116
+ target_path = link.real_path
117
+ if pages[target_path]
118
+ target_page = pages[target_path]
119
+ else
120
+ target_page_count += 1
121
+ target_page = Page.new(:target, target_path, onsite_only, pages: pages, counts: @counts)
122
+ pages[target_path] = target_page
123
+ if File.readable?(link.path)
124
+ target_text = File.read(link.path)
125
+ doc = Nokogiri::HTML(target_text)
126
+ target_page.gather_link_targets(doc)
127
+ elsif RDocLinkChecker.checkable?(link.path)
128
+ link.exception = fetch(link.path, target_page)
129
+ link.valid_p = false if link.exception
130
+ else
131
+ # File not readable or checkable.
132
+ end
133
+ end
134
+ next if target_page.nil?
135
+ if link.has_fragment? && target_page.ids.empty?
136
+ doc || doc = Nokogiri::HTML(target_text)
137
+ target_page.gather_link_targets(doc)
138
+ end
139
+ end
140
+ end
141
+ end
142
+ @counts[:target_pages] = target_page_count
143
+ end
144
+
145
+ # Verify that each link target exists.
146
+ def verify_links
147
+ linking_pages = pages.select do |path, page|
148
+ !page.links.empty?
149
+ end
150
+ link_count = 0
151
+ broken_count = 0
152
+ linking_pages.each_pair do |path, page|
153
+ link_count += page.links.size
154
+ page.links.each_with_index do |link, i|
155
+ if link.valid_p.nil? # Don't disturb if already set to false.
156
+ target_page = pages[link.real_path]
157
+ if target_page
158
+ target_id = link.fragment
159
+ link.valid_p = target_id.nil? ||
160
+ target_page.ids.include?(target_id) ||
161
+ !target_page.content_type&.match('html')
162
+ else
163
+ link.valid_p = false
164
+ end
165
+ end
166
+ broken_count += 1 unless link.valid_p
167
+ end
168
+ end
169
+ @counts[:links_checked] = link_count
170
+ @counts[:links_broken] = broken_count
171
+ end
172
+
173
+ # Fetch the page from the web and gather its ids into the target page.
174
+ # Returns exception or nil.
175
+ def fetch(url, target_page)
176
+ code = 0
177
+ exception = nil
178
+ begin
179
+ response = Net::HTTP.get_response(URI(url))
180
+ code = response.code.to_i
181
+ target_page.code = code
182
+ target_page.content_type = response['Content-Type']
183
+ rescue => x
184
+ raise unless x.class.name.match(/^(Net|Socket|IO::TimeoutError|Errno::)/)
185
+ exception = RDocLinkChecker::HttpResponseError.new(url, x)
186
+ end
187
+ # Don't load if bad code, or no response, or if not html.
188
+ if !code_bad?(code)
189
+ if content_type_html?(response)
190
+ doc = Nokogiri::HTML(response.body)
191
+ target_page.gather_link_targets(doc)
192
+ end
193
+ end
194
+ exception
195
+ end
196
+
197
+ # Returns whether the code is bad (zero or >= 400).
198
+ def code_bad?(code)
199
+ return false if code.nil?
200
+ (code == 0) || (code >= 400)
201
+ end
202
+
203
+ # Returns whether the response body should be HTML.
204
+ def content_type_html?(response)
205
+ return false unless response
206
+ return false unless response['Content-Type']
207
+ response['Content-Type'].match('html')
208
+ end
209
+
210
+ # Returns whether the path is offsite.
211
+ def self.offsite?(path)
212
+ path.start_with?('http')
213
+ end
214
+
215
+ # Returns the string fragment for the given path or URL, or +nil+ if there is none.
216
+ def self.get_fragment(s)
217
+ a = s.split('#', 2)
218
+ a.size == 2 ? a[1] : nil
219
+ end
220
+
221
+ # Returns whether the path is checkable.
222
+ def self.checkable?(path)
223
+ return false unless path
224
+ begin
225
+ uri = URI(path)
226
+ return ['http', 'https', nil].include?(uri.scheme)
227
+ rescue
228
+ return false
229
+ end
230
+ end
231
+
232
+ # Generate the \RDocLinkChecker report and write it to file Report.htm.
233
+ def report
234
+
235
+ doc = Document.new('')
236
+ root = doc.add_element(Element.new('root'))
237
+
238
+ head = root.add_element(Element.new('head'))
239
+ title = head.add_element(Element.new('title'))
240
+ title.text = 'RDocLinkChecker Report'
241
+ style = head.add_element(Element.new('style'))
242
+ style.text = <<EOT
243
+ * { font-family: sans-serif }
244
+ .data { font-family: courier }
245
+ .center { text-align: center }
246
+ .good { color: rgb( 0, 97, 0); background-color: rgb(198, 239, 206) } /* Greenish */
247
+ .iffy { color: rgb(156, 101, 0); background-color: rgb(255, 235, 156) } /* Yellowish */
248
+ .bad { color: rgb(156, 0, 6); background-color: rgb(255, 199, 206) } /* Reddish */
249
+ .neutral { color: rgb( 0, 0, 0); background-color: rgb(217, 217, 214) } /* Grayish */
250
+ EOT
251
+
252
+ body = root.add_element(Element.new('body'))
253
+ h1 = body.add_element(Element.new('h1'))
254
+ h1.text = 'RDocLinkChecker Report'
255
+
256
+ add_summary(body)
257
+ add_broken_links(body)
258
+ add_offsite_links(body) unless onsite_only
259
+ report_file_path = 'Report.htm' # _Not_ .html.
260
+ doc.write(File.new(report_file_path, 'w'), 2)
261
+ end
262
+
263
+ def add_summary(body)
264
+ h2 = body.add_element(Element.new('h2'))
265
+ h2.text = 'Summary'
266
+
267
+ # Parameters table.
268
+ data = []
269
+ [
270
+ :html_dirpath,
271
+ :onsite_only,
272
+ :no_toc
273
+ ].each do |sym|
274
+ value = send(sym).inspect
275
+ row = {sym => :label, value => :good}
276
+ data.push(row)
277
+ end
278
+ table2(body, data, 'parameters', 'Parameters')
279
+ body.add_element(Element.new('p'))
280
+
281
+ # Times table.
282
+ elapsed_time = @counts[:end_time] - @counts[:start_time]
283
+ seconds = elapsed_time % 60
284
+ minutes = (elapsed_time / 60) % 60
285
+ hours = (elapsed_time/3600)
286
+ elapsed_time_s = "%2.2d:%2.2d:%2.2d" % [hours, minutes, seconds]
287
+ format = "%Y-%m-%d-%a-%H:%M:%SZ"
288
+ start_time_s = @counts[:start_time].strftime(format)
289
+ end_time_s = @counts[:end_time].strftime(format)
290
+ data = [
291
+ {'Start Time' => :label, start_time_s => :good},
292
+ {'End Time' => :label, end_time_s => :good},
293
+ {'Elapsed Time' => :label, elapsed_time_s => :good},
294
+ ]
295
+ table2(body, data, 'times', 'Times')
296
+ body.add_element(Element.new('p'))
297
+
298
+ # Counts.
299
+ data = [
300
+ {'Source Pages' => :label, @counts[:source_pages] => :good},
301
+ {'Target Pages' => :label, @counts[:target_pages] => :good},
302
+ {'Links Checked' => :label, @counts[:links_checked] => :good},
303
+ {'Links Broken' => :label, @counts[:links_broken] => :bad},
304
+ ]
305
+ table2(body, data, 'counts', 'Counts')
306
+ body.add_element(Element.new('p'))
307
+
308
+ end
309
+
310
+ def add_broken_links(body)
311
+ h2 = body.add_element(Element.new('h2'))
312
+ h2.text = 'Broken Links by Source Page'
313
+
314
+ if @counts[:links_broken] == 0
315
+ p = body.add_element('p')
316
+ p.text = 'None.'
317
+ return
318
+ end
319
+
320
+ # Legend.
321
+ ul = body.add_element(Element.new('ul'))
322
+ li = ul.add_element(Element.new('li'))
323
+ li.text = 'Href: the href of the anchor element.'
324
+ li = ul.add_element(Element.new('li'))
325
+ li.text = 'Text: the text of the anchor element.'
326
+ li = ul.add_element(Element.new('li'))
327
+ li.text = 'Path: the URL or path of the link (not including the fragment):'
328
+ ul2 = li.add_element(Element.new('ul'))
329
+ li2 = ul2.add_element(Element.new('li'))
330
+ li2.text = 'For an on-site link, an abbreviated path is given.'
331
+ li2 = ul2.add_element(Element.new('li'))
332
+ li2.text = <<EOT
333
+ For an off-site link, the full URL is given.
334
+ If the path is reddish, the page was not found.
335
+ EOT
336
+ li = ul.add_element(Element.new('li'))
337
+ li.text = <<EOT
338
+ Fragment: the fragment of the link.
339
+ If the fragment is reddish, fragment was not found.
340
+ EOT
341
+
342
+ pages.each_pair do |path, page|
343
+ broken_links = page.links.select {|link| !link.valid_p }
344
+ next if broken_links.empty?
345
+
346
+ page_div = body.add_element(Element.new('div'))
347
+ page_div.add_attribute('class', 'broken_page')
348
+ page_div.add_attribute('path', path)
349
+ page_div.add_attribute('count', broken_links.count)
350
+ h3 = page_div.add_element(Element.new('h3'))
351
+ a = Element.new('a')
352
+ a.text = "#{path} (#{broken_links.count})"
353
+ a.add_attribute('href', path)
354
+ h3.add_element(a)
355
+
356
+ broken_links.each do |link|
357
+ link_div = page_div.add_element(Element.new('div'))
358
+ link_div.add_attribute('class', 'broken_link')
359
+ data = []
360
+ # Text, URL, fragment
361
+ a = Element.new('a')
362
+ a.text = link.href
363
+ a.add_attribute('href', link.href)
364
+ data.push({'Href' => :label, a => :bad})
365
+ data.push({'Text' => :label, link.text => :good})
366
+ fragment_p = !link.fragment.nil?
367
+ class_ = fragment_p ? :good : :bad
368
+ data.push({'Path' => :label, link.real_path => class_})
369
+ class_ = fragment_p ? :bad : :good
370
+ data.push({'Fragment' => :label, link.fragment => class_})
371
+ if link.exception
372
+ data.push({'Exception' => :label, link.exception.class => :bad})
373
+ data.push({'Message' => :label, link.exception.message => :bad})
374
+ end
375
+ id = link.exception ? 'bad_url' : 'bad_fragment'
376
+ table2(link_div, data, id)
377
+ page_div.add_element(Element.new('p'))
378
+ end
379
+ end
380
+
381
+ end
382
+
383
# Appends the off-site links section of the report to +body+.
#
# - +body+: the element that receives the section.
#
# For each page in +pages+ having off-site links (other than the
# boilerplate links listed below), adds an h3 that links to the page,
# then one two-row table (href and text) per link.
# The h2 text is assigned last because it includes the total link count.
# A 'None.' paragraph is added when there are no such links.
def add_offsite_links(body)
  heading = body.add_element(Element.new('h2'))
  # Links excluded from this section.
  boilerplate = %w[
    https://validator.w3.org/check/referer
    https://ruby.github.io/rdoc/
    http://deveiate.org/projects/Darkfish-RDoc/
    http://deveiate.org
  ]
  total = 0

  pages.each_pair do |path, page|
    reportable = page.links.select do |link|
      href = link.href
      RDocLinkChecker.offsite?(href) && !boilerplate.include?(href)
    end
    next if reportable.empty?

    total += reportable.size

    # Page heading: a link to the source page, with its link count.
    page_heading = body.add_element(Element.new('h3'))
    page_anchor = Element.new('a')
    page_anchor.add_attribute('href', path)
    page_anchor.text = "#{path} (#{reportable.size})"
    page_heading.add_element(page_anchor)

    reportable.each do |link|
      link_anchor = Element.new('a')
      link_anchor.add_attribute('href', link.href)
      link_anchor.text = link.href
      href_class = link.valid_p ? :good : :bad
      rows = [
        {'Href' => :label, link_anchor => href_class},
        {'Text' => :label, link.text => :good},
      ]
      table2(body, rows)
      # Empty paragraph separates consecutive tables.
      body.add_element(Element.new('p'))
    end
  end

  heading.text = "Off-Site Links by Source Page (#{total})"
  return unless total.zero?

  body.add_element(Element.new('p')).text = 'None.'
end
426
+
427
# Values for the HTML +class+ attribute of report table cells,
# keyed by the cell kind given in table row data
# (+:label+, +:good+, +:iffy+, or +:bad+).
# Frozen so that the shared lookup table cannot be modified by accident.
Classes = {
  label: 'label center neutral',
  good: 'data center good',
  iffy: 'data center iffy',
  bad: 'data center bad',
}.freeze
433
+
434
# Adds a two-column table to +parent+:
#
# - +parent+: the element that receives the table.
# - +data+: array of rows; each row is a hash with two pairs,
#   <tt>{label => label_class, value => value_class}</tt>,
#   where each class is a key of +Classes+.
#   A value that is a REXML::Element is added as a child of its cell;
#   any other value becomes the cell's text.
# - +id+: if given, the value of the table's +id+ attribute.
# - +title+: if given, a heading row spanning both columns is added;
#   the title may be text or a REXML::Element.
#
# NOTE: because a row is a Hash, a row whose value equals its label
# collapses to a single pair, so that row loses its second cell's content.
def table2(parent, data, id = nil, title = nil)
  table = parent.add_element(Element.new('table'))
  table.add_attribute('id', id) if id
  if title
    # The tag name must be exactly 'tr'; a malformed name yields an invalid row.
    header_row = table.add_element(Element.new('tr'))
    th = header_row.add_element(Element.new('th'))
    th.add_attribute('colspan', 2)
    if title.kind_of?(REXML::Element)
      th.add_element(title)
    else
      th.text = title
    end
  end
  data.each do |row_h|
    # Hash#flatten gives [label, label_class, value, value_class].
    label, label_class, value, value_class = row_h.flatten
    tr = table.add_element(Element.new('tr'))

    label_cell = tr.add_element(Element.new('td'))
    label_cell.text = label
    label_cell.add_attribute('class', Classes[label_class])

    value_cell = tr.add_element(Element.new('td'))
    if value.kind_of?(REXML::Element)
      value_cell.add_element(value)
    else
      value_cell.text = value
    end
    value_cell.add_attribute('class', Classes[value_class])
  end
end
463
+
464
# Base class for the link-checking errors defined below.
# Derives from StandardError so that instances behave as ordinary
# Ruby exceptions (they can be raised and rescued as well as stored).
class Error < StandardError; end
465
+
466
# Describes an exception that was raised while checking
# the availability of a page with Net::HTTP.
class HttpResponseError < Error

  attr_accessor :url, :x

  # - +url+: the URL reported in the message.
  # - +x+: the exception that was raised; its class and message are reported.
  def initialize(url, x)
    @url = url
    @x = x
  end

  # Returns a multi-line description of the failure.
  def message
    lines = [
      "#{self.class.name}:",
      'An exception was raised when checking page availability with Net::HTTP:',
      "Url: #{url}",
      "Class: #{x.class}",
      "Message: #{x.message}",
    ]
    lines.map { |line| "#{line}\n" }.join
  end

end
486
+
487
# Describes a page whose return code was not 200.
class HttpStatusCodeError < Error

  attr_accessor :url, :code

  # - +url+: the URL reported in the message.
  # - +code+: the return code reported in the message.
  def initialize(url, code)
    @url = url
    @code = code
  end

  # Returns a multi-line description of the failure.
  def message
    lines = [
      "#{self.class.name}:",
      'The return code for the page was not 200:',
      "Url: #{url}",
      "Return code: #{code}",
    ]
    lines.map { |line| "#{line}\n" }.join
  end

end
506
+
507
# Class to represent a page.
class Page

  attr_accessor :path, :type, :pages, :counts, :code,
                :links, :ids, :dirname, :onsite_only, :content_type

  # Returns a new \Page object:
  #
  # - +type+: the kind of page (e.g., +:source+).
  # - +path+: a path relative to the HTML directory (if on-site)
  #   or a URL (if off-site).
  # - +onsite_only+: whether off-site links are skipped when gathering links.
  # - +pages+: hash of path/page pairs.
  # - +counts+: hash of counts.
  #
  def initialize(type, path, onsite_only, pages: {}, counts: {})
    self.path = path
    self.type = type
    self.pages = pages
    self.counts = counts
    self.onsite_only = onsite_only
    self.code = nil
    self.links = []
    self.ids = []
    # A page in the top directory has an empty dirname, not '.'.
    dir = File.dirname(path)
    self.dirname = dir == '.' ? '' : dir
  end

  # Returns a hash of the page's identifying attributes.
  def to_h
    {
      path: path,
      type: type,
      dirname: dirname,
      code: code
    }
  end

  # Gather links for the page:
  #
  # - +doc+: Nokogiri document to be parsed for links.
  #
  # An anchor is skipped if its text is a pilcrow or an up-arrow,
  # if it has no href, if it is off-site and +onsite_only+ is set,
  # if its href is not checkable, or if its href has no path part.
  def gather_links(doc)
    # The links are in the anchors.
    doc.search('a').each do |a|
      # Ignore pilcrow (paragraph character) and up-arrow.
      next if a.text == "\u00B6"
      next if a.text == "\u2191"

      href = a.attr('href')
      next if href.nil? || href.empty?
      next if RDocLinkChecker.offsite?(href) && onsite_only
      next unless RDocLinkChecker.checkable?(href)

      link = Link.new(href, a.text, dirname)
      next if link.path.nil? || link.path.empty?

      links.push(link)
    end
  end

  # Gather link targets for the page.
  # +doc+ is the Nokogiri document to be parsed.
  def gather_link_targets(doc)
    # Don't do twice (some pages are both source and target).
    return unless ids.empty?

    if RDocLinkChecker.offsite?(path)
      gather_offsite_targets(doc)
    else
      gather_onsite_targets(doc)
    end
  end

  private

  # For off-site, gather all ids, regardless of element:
  # every +id+ attribute, every +name+ attribute,
  # and the fragment of every same-page anchor href.
  def gather_offsite_targets(doc)
    doc.xpath("//*[@id]").each do |element|
      ids.push(element.attr('id'))
    end
    doc.xpath("//*[@name]").each do |element|
      ids.push(element.attr('name'))
    end
    doc.xpath("//a[@href]").each do |element|
      href = element.attr('href')
      next unless href.start_with?('#')
      ids.push(href.sub('#', ''))
    end
  end

  # We're on-site, which means that the page is RDoc-generated
  # and we know what to expect.
  # In theory, an author can link to any element that has an attribute :id.
  # In practice, gathering all such elements is very time-consuming.
  # These are the elements currently linked to:
  #
  # - body
  # - a
  # - div
  # - dt
  # - h*
  #
  # We can add more as needed (i.e., if/when we have actual broken links).
  def gather_onsite_targets(doc)
    body = doc.at('//body')
    # A document without a body has no link targets.
    return if body.nil?

    # body element has 'top', which is a link target.
    # Store the attribute's string value, as for every other element here.
    add_target(body.attr('id'))

    # Some ids are in the as (anchors).
    body.search('a').each do |a|
      add_target(a.attr('id'))
    end

    # Method ids are in divs, but gather only method-detail divs.
    body.search('div').each do |div|
      class_ = div.attr('class')
      next if class_.nil?
      next unless class_.include?('method-')
      add_target(div.attr('id'))
    end

    # Constant ids are in dts.
    body.search('dt').each do |dt|
      add_target(dt.attr('id'))
    end

    # Label ids are in headings.
    %w[h1 h2 h3 h4 h5 h6].each do |tag|
      body.search(tag).each do |h|
        add_target(h.attr('id'))
      end
    end
  end

  # Records +id+ as a link target, unless it is nil.
  def add_target(id)
    ids.push(id) if id
  end

end
642
+
643
# Class to represent a link.
class Link

  attr_accessor :href, :text, :dirname, :path, :fragment, :valid_p, :real_path, :exception

  # Returns a new \Link object:
  #
  # - +href+: attribute href from anchor element.
  # - +text+: attribute text from anchor element.
  # - +dirname+: directory path of the linking page.
  #
  # The href is split at its first '#' into +path+ and +fragment+;
  # +fragment+ is nil when the href has no '#',
  # and +path+ is nil when the href is empty.
  def initialize(href, text, dirname)
    self.href = href
    self.text = text
    self.dirname = dirname
    self.path, self.fragment = href.split('#', 2)
    self.valid_p = nil
    self.real_path = make_real_path(dirname, path)
    self.exception = nil
  end

  # Returns a hash of the link's href and text.
  def to_h
    {
      href: href,
      text: text,
    }
  end

  # Return the real (not relative) path of the link:
  #
  # - +dirname+: directory path of the linking page.
  # - +path+: path part of the link's href.
  #
  # Returns nil for a nil path, and returns an off-site URL unchanged.
  # Otherwise each leading '../' removes one trailing directory from
  # +dirname+, and the rest of the path is joined to what remains.
  def make_real_path(dirname, path)
    return path if path.nil?
    # Trim single dot.
    # NOTE(review): the result is not joined to dirname, even when dirname
    # is non-empty; kept as-is -- confirm that is intended.
    return path.sub('./', '') if path.start_with?('./')
    return path if dirname.nil? || dirname.empty?
    # An off-site URL is never relative to the linking page.
    return path if RDocLinkChecker.offsite?(path)

    # May have one or more leading '../'.
    # Only literal, leading '../' segments count; the dots must be escaped,
    # else ordinary names such as 'ab/' would be taken for '../'.
    up_dirs = path[%r{\A(?:\.\./)+}] || ''
    levels = up_dirs.length / '../'.length
    remainder = path[up_dirs.length..-1]

    # Remove the corresponding parts of dirname.
    dirs = dirname.split('/')
    dirs.pop(levels)
    return remainder if dirs.empty?

    File.join(dirs.join('/'), remainder)
  end

  # Returns whether the link has a fragment.
  def has_fragment?
    !fragment.nil?
  end

  # Puts link info onto $stdout.
  def puts(i)
    $stdout.puts <<~EOT
      Link #{i}:
      Href: #{href}
      Text: #{text}
      Path: #{path}
      Fragment: #{fragment}
      Valid: #{valid_p}
      Real path: #{real_path}
      Dirname: #{dirname}
    EOT
  end
end
715
+
716
+ end