rdoc_link_checker 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,674 +1,722 @@
1
- # frozen_string_literal: true
2
-
3
- require 'nokogiri'
4
- require 'rexml/document'
5
- require 'find'
6
- require 'net/http'
7
-
8
- require_relative 'rdoc_link_checker/version'
9
-
10
- class RDocLinkChecker
11
-
12
- include REXML
13
-
14
- attr_accessor :html_dirpath, :onsite_only, :no_toc
15
-
16
- attr_accessor :source_paths, :pages
17
-
18
- def initialize(
19
- html_dirpath,
20
- onsite_only: false,
21
- no_toc: false
22
- )
23
- self.html_dirpath = html_dirpath
24
- self.onsite_only = onsite_only
25
- self.no_toc = no_toc
26
- self.pages = {}
27
- @counts = {
28
- source_pages: 0,
29
- target_pages: 0,
30
- links_checked: 0,
31
- links_broken: 0,
32
- }
33
- end
34
-
35
- def check
36
- # All work is done in the HTML directory,
37
- # and that is where Report.htm will be put.
38
- Dir.chdir(html_dirpath) do |dir|
39
- @counts[:start_time] = Time.new
40
- gather_source_paths
41
- create_source_pages
42
- create_target_pages
43
- verify_links
44
- @counts[:end_time] = Time.new
45
- report
46
- end
47
- end
48
-
49
- # Gather paths to source HTML pages.
50
- def gather_source_paths
51
- paths = []
52
- paths = Find.find('.').select {|path| path.end_with?('.html') }
53
- # Remove leading './'.
54
- self.source_paths = paths.map{|path| path.sub(%r[^\./], '')}
55
- @counts[:source_pages] = source_paths.size
56
- end
57
-
58
- # Create a source \Page object for each source path.
59
- # Gather its links and ids.
60
- def create_source_pages
61
- source_paths.sort.each_with_index do |source_path, i|
62
- progress_s = RDocLinkChecker.progress_s(i + 1, source_paths.size)
63
- source_page = Page.new(:source, source_path, onsite_only, pages: pages, counts: @counts)
64
- pages[source_path] = source_page
65
- source_text = File.read(source_path)
66
- doc = Nokogiri::HTML(source_text)
67
- if source_path == 'table_of_contents.html'
68
- source_page.gather_links(doc) unless no_toc
69
- else
70
- source_page.gather_links(doc)
71
- end
72
- source_page.gather_ids(doc)
73
- end
74
- end
75
-
76
- # Create a target \Page object for each link
77
- # (unless already created as a source page).
78
- def create_target_pages
79
- doc = nil
80
- target_page_count = 0
81
- source_paths = pages.keys
82
- source_paths.each do |source_path|
83
- # Needed for relative links to work.
84
- dirname = File.dirname(source_path)
85
- Dir.chdir(dirname) do
86
- source_page = pages[source_path]
87
- source_page.links.each_with_index do |link, i|
88
- next if link.path.nil?
89
- target_path = link.real_path
90
- if pages[target_path]
91
- target_page = pages[target_path]
92
- else
93
- target_page_count += 1
94
- target_page = Page.new(:target, target_path, onsite_only, pages: pages, counts: @counts)
95
- pages[target_path] = target_page
96
- if File.readable?(link.path)
97
- target_text = File.read(link.path)
98
- doc = Nokogiri::HTML(target_text)
99
- target_page.gather_ids(doc)
100
- elsif RDocLinkChecker.checkable?(link.path)
101
- link.exception = fetch(link.path, target_page)
102
- link.valid_p = false if link.exception
103
- else
104
- # File not readable or checkable.
105
- end
106
- end
107
- next if target_page.nil?
108
- if link.has_fragment? && target_page.ids.empty?
109
- doc || doc = Nokogiri::HTML(target_text)
110
- target_page.gather_ids(doc)
111
- end
112
- end
113
- end
114
- end
115
- @counts[:target_pages] = target_page_count
116
- end
117
-
118
- # Verify that each link target exists.
119
- def verify_links
120
- linking_pages = pages.select do |path, page|
121
- !page.links.empty?
122
- end
123
- link_count = 0
124
- broken_count = 0
125
- linking_pages.each_pair do |path, page|
126
- link_count += page.links.size
127
- page.links.each_with_index do |link, i|
128
- if link.valid_p.nil? # Don't disturb if already set to false.
129
- target_page = pages[link.real_path]
130
- if target_page
131
- target_id = link.fragment
132
- link.valid_p = target_id.nil? || target_page.ids.include?(target_id)
133
- else
134
- link.valid_p = false
135
- end
136
- end
137
- broken_count += 1 unless link.valid_p
138
- end
139
- end
140
- @counts[:links_checked] = link_count
141
- @counts[:links_broken] = broken_count
142
- end
143
-
144
- # Fetch the page from the web and gather its ids into the target page.
145
- # Returns exception or nil.
146
- def fetch(url, target_page)
147
- code = 0
148
- exception = nil
149
- begin
150
- response = Net::HTTP.get_response(URI(url))
151
- code = response.code.to_i
152
- target_page.code = code
153
- rescue => x
154
- raise unless x.class.name.match(/^(Net|SocketError|IO::TimeoutError|Errno::)/)
155
- exception = RDocLinkChecker::HttpResponseError.new(url, x)
156
- end
157
- # Don't load if bad code, or no response, or if not html.
158
- if !code_bad?(code)
159
- if content_type_html?(response)
160
- doc = Nokogiri::HTML(response.body)
161
- target_page.gather_ids(doc)
162
- end
163
- end
164
- exception
165
- end
166
-
167
- # Returns whether the code is bad (zero or >= 400).
168
- def code_bad?(code)
169
- return false if code.nil?
170
- (code == 0) || (code >= 400)
171
- end
172
-
173
- # Returns whether the response body should be HTML.
174
- def content_type_html?(response)
175
- return false unless response
176
- return false unless response['Content-Type']
177
- response['Content-Type'].match('html')
178
- end
179
-
180
- # Returns whether the path is offsite.
181
- def self.offsite?(path)
182
- path.start_with?('http')
183
- end
184
-
185
- # Returns the string fragment for the given path or URL, or +nil+ if there is no fragment.
186
- def self.get_fragment(s)
187
- a = s.split('#', 2)
188
- a.size == 2 ? a[1] : nil
189
- end
190
-
191
- # Returns a progress string giving a fraction and percentage.
192
- def self.progress_s(i, total)
193
- fraction_s = "#{i}/#{total}"
194
- percent_i = (i*100.0/total).round
195
- "(#{fraction_s}, #{percent_i}%)"
196
- end
197
-
198
- # Returns whether the path is checkable.
199
- def self.checkable?(path)
200
- return false unless path
201
- begin
202
- uri = URI(path)
203
- return ['http', 'https', nil].include?(uri.scheme)
204
- rescue
205
- return false
206
- end
207
- end
208
-
209
- # Generate the report and write it to Report.htm.
210
- def report
211
-
212
- doc = Document.new('')
213
- root = doc.add_element(Element.new('root'))
214
-
215
- head = root.add_element(Element.new('head'))
216
- title = head.add_element(Element.new('title'))
217
- title.text = 'RDocLinkChecker Report'
218
- style = head.add_element(Element.new('style'))
219
- style.text = <<EOT
220
- * { font-family: sans-serif }
221
- .data { font-family: courier }
222
- .center { text-align: center }
223
- .good { color: rgb( 0, 97, 0); background-color: rgb(198, 239, 206) } /* Greenish */
224
- .iffy { color: rgb(156, 101, 0); background-color: rgb(255, 235, 156) } /* Yellowish */
225
- .bad { color: rgb(156, 0, 6); background-color: rgb(255, 199, 206) } /* Reddish */
226
- .neutral { color: rgb( 0, 0, 0); background-color: rgb(217, 217, 214) } /* Grayish */
227
- EOT
228
-
229
- body = root.add_element(Element.new('body'))
230
- h1 = body.add_element(Element.new('h1'))
231
- h1.text = 'RDocLinkChecker Report'
232
-
233
- add_summary(body)
234
- add_broken_links(body)
235
- # add_offsite_links(body) unless onsite_only
236
- report_file_path = 'Report.htm' # _Not_ .html.
237
- doc.write(File.new(report_file_path, 'w'), 2)
238
- end
239
-
240
- def add_summary(body)
241
- h2 = body.add_element(Element.new('h2'))
242
- h2.text = 'Summary'
243
-
244
- # Parameters table.
245
- data = []
246
- [
247
- :html_dirpath,
248
- :onsite_only,
249
- :no_toc
250
- ].each do |sym|
251
- value = send(sym).inspect
252
- row = {sym => :label, value => :good}
253
- data.push(row)
254
- end
255
- table2(body, data, 'parameters', 'Parameters')
256
- body.add_element(Element.new('p'))
257
-
258
- # Times table.
259
- elapsed_time = @counts[:end_time] - @counts[:start_time]
260
- seconds = elapsed_time % 60
261
- minutes = (elapsed_time / 60) % 60
262
- hours = (elapsed_time/3600)
263
- elapsed_time_s = "%2.2d:%2.2d:%2.2d" % [hours, minutes, seconds]
264
- format = "%Y-%m-%d-%a-%H:%M:%SZ"
265
- start_time_s = @counts[:start_time].strftime(format)
266
- end_time_s = @counts[:end_time].strftime(format)
267
- data = [
268
- {'Start Time' => :label, start_time_s => :good},
269
- {'End Time' => :label, end_time_s => :good},
270
- {'Elapsed Time' => :label, elapsed_time_s => :good},
271
- ]
272
- table2(body, data, 'times', 'Times')
273
- body.add_element(Element.new('p'))
274
-
275
- # Counts.
276
- data = [
277
- {'Source Pages' => :label, @counts[:source_pages] => :good},
278
- {'Target Pages' => :label, @counts[:target_pages] => :good},
279
- {'Links Checked' => :label, @counts[:links_checked] => :good},
280
- {'Links Broken' => :label, @counts[:links_broken] => :bad},
281
- ]
282
- table2(body, data, 'counts', 'Counts')
283
- body.add_element(Element.new('p'))
284
-
285
- end
286
-
287
- def add_broken_links(body)
288
- h2 = body.add_element(Element.new('h2'))
289
- h2.text = 'Broken Links by Source Page'
290
-
291
- if @counts[:links_broken] == 0
292
- p = body.add_element('p')
293
- p.text = 'None.'
294
- return
295
- end
296
-
297
- # Legend.
298
- ul = body.add_element(Element.new('ul'))
299
- li = ul.add_element(Element.new('li'))
300
- li.text = 'Href: the href of the anchor element.'
301
- li = ul.add_element(Element.new('li'))
302
- li.text = 'Text: the text of the anchor element.'
303
- li = ul.add_element(Element.new('li'))
304
- li.text = 'Path: the URL or path of the link (not including the fragment):'
305
- ul2 = li.add_element(Element.new('ul'))
306
- li2 = ul2.add_element(Element.new('li'))
307
- li2.text = 'For an on-site link, an abbreviated path is given.'
308
- li2 = ul2.add_element(Element.new('li'))
309
- li2.text = <<EOT
310
- For an off-site link, the full URL is given.
311
- If the path is reddish, the page was not found.
312
- EOT
313
- li = ul.add_element(Element.new('li'))
314
- li.text = <<EOT
315
- Fragment: the fragment of the link.
316
- If the fragment is reddish, fragment was not found.
317
- EOT
318
-
319
- pages.each_pair do |path, page|
320
- broken_links = page.links.select {|link| !link.valid_p }
321
- next if broken_links.empty?
322
-
323
- page_div = body.add_element(Element.new('div'))
324
- page_div.add_attribute('class', 'broken_page')
325
- page_div.add_attribute('path', path)
326
- page_div.add_attribute('count', broken_links.count)
327
- h3 = page_div.add_element(Element.new('h3'))
328
- a = Element.new('a')
329
- a.text = "#{path} (#{broken_links.count})"
330
- a.add_attribute('href', path)
331
- h3.add_element(a)
332
-
333
- broken_links.each do |link|
334
- link_div = page_div.add_element(Element.new('div'))
335
- link_div.add_attribute('class', 'broken_link')
336
- data = []
337
- # Text, URL, fragment
338
- a = Element.new('a')
339
- a.text = link.href
340
- a.add_attribute('href', link.href)
341
- data.push({'Href' => :label, a => :bad})
342
- data.push({'Text' => :label, link.text => :good})
343
- fragment_p = !link.fragment.nil?
344
- class_ = fragment_p ? :good : :bad
345
- data.push({'Path' => :label, link.real_path => class_})
346
- class_ = fragment_p ? :bad : :good
347
- data.push({'Fragment' => :label, link.fragment => class_})
348
- if link.exception
349
- data.push({'Exception' => :label, link.exception.class => :bad})
350
- data.push({'Message' => :label, link.exception.message => :bad})
351
- end
352
- id = link.exception ? 'bad_url' : 'bad_fragment'
353
- table2(link_div, data, id)
354
- page_div.add_element(Element.new('p'))
355
- end
356
- end
357
-
358
- end
359
-
360
- def add_offsite_links(body)
361
- h2 = body.add_element(Element.new('h2'))
362
- h2.text = 'Off-Site Links by Source Page'
363
- none = true
364
- pages.each_pair do |path, page|
365
- offsite_links = page.links.select do |link|
366
- RDocLinkChecker.offsite?(link.href)
367
- end
368
- next if offsite_links.empty?
369
-
370
- none = false
371
- h3 = body.add_element(Element.new('h3'))
372
- a = Element.new('a')
373
- a.text = path
374
- a.add_attribute('href', path)
375
- h3.add_element(a)
376
-
377
- offsite_links.each do |link|
378
- data = []
379
- # Text, URL, fragment
380
- a = Element.new('a')
381
- a.text = link.href
382
- a.add_attribute('href', link.href)
383
- class_ = link.valid_p ? :good : :bad
384
- data.push({'Href' => :label, a => class_})
385
- data.push({'Text' => :label, link.text => :good})
386
- table2(body, data)
387
- body.add_element(Element.new('p'))
388
- end
389
- end
390
- if none
391
- p = body.add_element(Element.new('p'))
392
- p.text = 'None.'
393
- end
394
- end
395
-
396
- Classes = {
397
- label: 'label center neutral',
398
- good: 'data center good',
399
- iffy: 'data center iffy',
400
- bad: 'data center bad',
401
- }
402
-
403
- def table2(parent, data, id, title = nil)
404
- data = data.dup
405
- table = parent.add_element(Element.new('table'))
406
- table.add_attribute('id', id)
407
- if title
408
- tr = table.add_element(Element.new('tr)'))
409
- th = tr.add_element(Element.new('th'))
410
- th.add_attribute('colspan', 2)
411
- if title.kind_of?(REXML::Element)
412
- th.add_element(title)
413
- else
414
- th.text = title
415
- end
416
- end
417
- data.each do |row_h|
418
- label, label_class, value, value_class = row_h.flatten
419
- tr = table.add_element(Element.new('tr'))
420
- td = tr.add_element(Element.new('td'))
421
- td.text = label
422
- td.add_attribute('class', Classes[label_class])
423
- td = tr.add_element(Element.new('td'))
424
- if value.kind_of?(REXML::Element)
425
- td.add_element(value)
426
- else
427
- td.text = value
428
- end
429
- td.add_attribute('class', Classes[value_class])
430
- end
431
- end
432
-
433
- class Error; end
434
-
435
- class HttpResponseError < Error
436
-
437
- attr_accessor :url, :x
438
-
439
- def initialize(url, x)
440
- self.url = url
441
- self.x = x
442
- end
443
-
444
- def message
445
- <<EOT
446
- #{self.class.name}:
447
- An exception was raised when checking page availability with Net::HTTP:
448
- Url: #{url}
449
- Class: #{x.class}
450
- Message: #{x.message}
451
- EOT
452
- end
453
-
454
- end
455
-
456
- class HttpStatusCodeError < Error
457
-
458
- attr_accessor :url, :code
459
-
460
- def initialize(url, code)
461
- self.url = url
462
- self.code = code
463
- end
464
-
465
- def message
466
- <<EOT
467
- #{self.class.name}:
468
- The return code for the page was not 200:
469
- Url: #{url}
470
- Return code: #{code}
471
- EOT
472
- end
473
-
474
- end
475
-
476
- # Class to represent a page.
477
- class Page
478
-
479
- attr_accessor :path, :type, :pages, :counts, :code, :links, :ids, :dirname, :onsite_only
480
-
481
- # Returns a new \Page object:
482
- #
483
- # - +path+: a path relative to the HTML directory (if on-site)
484
- # or a URL (if off-site).
485
- # - +pages+: hash of path/page pairs.
486
- # - +counts+: hash of counts.
487
- #
488
- def initialize(type, path, onsite_only, pages: {}, counts: {})
489
- self.path = path
490
- self.type = type
491
- self.pages = pages
492
- self.counts = counts
493
- self.onsite_only = onsite_only
494
- self.code = nil
495
- self.links = []
496
- self.ids = []
497
- self.dirname = File.dirname(path)
498
- self.dirname = self.dirname == '.' ? '' : dirname
499
- end
500
-
501
- def to_h
502
- {
503
- path: path,
504
- type: type,
505
- dirname: dirname,
506
- code: code
507
- }
508
- end
509
-
510
- # Gather links for the page:
511
- #
512
- # - +doc+: Nokogiri document to be parsed for links.
513
- #
514
- def gather_links(doc)
515
- i = 0
516
- # The links are in the anchors.
517
- doc.search('a').each do |a|
518
- # Ignore pilcrow (paragraph character) and up-arrow.
519
- next if a.text == "\u00B6"
520
- next if a.text == "\u2191"
521
-
522
- href = a.attr('href')
523
- next if href.nil? or href.empty?
524
- next if RDocLinkChecker.offsite?(href) && onsite_only
525
- next unless RDocLinkChecker.checkable?(href)
526
-
527
- link = Link.new(href, a.text, dirname)
528
- next if link.path.nil? || link.path.empty?
529
-
530
- links.push(link)
531
- i += 1
532
- end
533
- end
534
-
535
- # Gather ids for the page.
536
- # +doc+ is the Nokogiri document to be parsed.
537
- def gather_ids(doc)
538
- # Don't do twice (some pages are both source and target).
539
- return unless ids.empty?
540
-
541
- # For off-site, gather all ids, regardless of element.
542
- if RDocLinkChecker.offsite?(path)
543
- doc.xpath("//*[@id]").each do |element|
544
- id = element.attr('id')
545
- ids.push(id)
546
- end
547
- return
548
- end
549
-
550
- # We're on-site, which means that the page is RDoc-generated
551
- # and we know what to expect.
552
- # In theory, an author can link to any element that has an attribute :id.
553
- # In practice, gathering all such elements is very time-consuming.
554
- # These are the elements currently linked to:
555
- #
556
- # - body
557
- # - a
558
- # - div
559
- # - dt
560
- # - h*
561
- #
562
- # We can add more as needed (i.e., if/when we have actual broken links).
563
-
564
- # body element has 'top', which is a link target.
565
- body = doc.at('//body')
566
- id = body.attribute('id')
567
- ids.push(id) if id
568
-
569
- # Some ids are in the a (anchor) elements.
570
- body.search('a').each do |a|
571
- id = a.attr(id)
572
- ids.push(id) if id
573
- end
574
-
575
- # Method ids are in divs, but gather only method-detail divs.
576
- body.search('div').each do |div|
577
- class_ = div.attr('class')
578
- next if class_.nil?
579
- next unless class_.match('method-')
580
- id = div.attr('id')
581
- ids.push(id) if id
582
- end
583
-
584
- # Constant ids are in dts.
585
- body.search('dt').each do |dt|
586
- id = dt.attr('id')
587
- ids.push(id) if id
588
- end
589
-
590
- # Label ids are in headings.
591
- %w[h1 h2 h3 h4 h5 h6].each do |tag|
592
- body.search(tag).each do |h|
593
- id = h.attr('id')
594
- ids.push(id) if id
595
- end
596
- end
597
- end
598
-
599
- end
600
-
601
- # Class to represent a link.
602
- class Link
603
-
604
- attr_accessor :href, :text, :dirname, :path, :fragment, :valid_p, :real_path, :exception
605
-
606
- # Returns a new \Link object:
607
- #
608
- # - +href+: attribute href from anchor element.
609
- # - +text+: attribute text from anchor element.
610
- # - +dirname+: directory path of the linking page.
611
- #
612
- def initialize(href, text, dirname)
613
- self.href = href
614
- self.text = text
615
- self.dirname = dirname
616
- path, fragment = href.split('#', 2)
617
- self.path = path
618
- self.fragment = fragment
619
- self.valid_p = nil
620
- self.real_path = make_real_path(dirname, path)
621
- self.exception = nil
622
- end
623
-
624
- def to_h
625
- {
626
- href: href,
627
- text: text,
628
- }
629
- end
630
-
631
- # Return the real (not relative) path of the link.
632
- def make_real_path(dirname, path)
633
- # Trim single dot.
634
- return path.sub('./', '') if path.start_with?('./')
635
- return path if dirname.nil? || dirname.empty?
636
-
637
- # May have one or more leading '../'.
638
- up_dir = '../'
639
- levels = path.scan(/(?=#{up_dir})/).count
640
- dirs = dirname.split('/')
641
- if levels == 0
642
- dirs.empty? ? path : File.join(dirname, path)
643
- else
644
- # Remove leading '../' elements.
645
- path = path.gsub(%r[\.\./], '')
646
- # Remove the corresponding parts of dirname.
647
- dirs.pop(levels)
648
- return path if dirs.empty?
649
- dirname = dirs.join('/')
650
- File.join(dirname, path)
651
- end
652
- end
653
-
654
- # Returns whether the link has a fragment.
655
- def has_fragment?
656
- fragment ? true : false
657
- end
658
-
659
- # Puts link info onto $stdout.
660
- def puts(i)
661
- $stdout.puts <<EOT
662
- Link #{i}:
663
- Href: #{href}
664
- Text: #{text}
665
- Path: #{path}
666
- Fragment: #{fragment}
667
- Valid: #{valid_p}
668
- Real path: #{real_path}
669
- Dirname: #{dirname}
670
- EOT
671
- end
672
- end
673
-
674
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'rubygems'
4
+ require 'nokogiri'
5
+ require 'rexml/document'
6
+ require 'find'
7
+ require 'net/http'
8
+ require 'json'
9
+
10
+ require_relative 'rdoc_link_checker/version'
11
+
12
+ class RDocLinkChecker
13
+
14
+ include REXML
15
+
16
+ attr_accessor :html_dirpath, :ruby_core, :config_filepath, :onsite_only, :no_toc,
17
+ :source_file_omits
18
+
19
+ attr_accessor :source_paths, :pages
20
+
21
+ RUBY_CORE_OMITS = [
22
+ "^LEGAL",
23
+ "^NEWS"
24
+ ]
25
+
26
+ def initialize(
27
+ html_dirpath,
28
+ ruby_core: true,
29
+ config_filepath: nil,
30
+ onsite_only: false,
31
+ no_toc: false
32
+ )
33
+ self.html_dirpath = html_dirpath
34
+ self.config_filepath = config_filepath
35
+ self.onsite_only = onsite_only
36
+ self.no_toc = no_toc
37
+ self.source_file_omits = ruby_core ? RUBY_CORE_OMITS : []
38
+ if config_filepath
39
+ config = JSON.parse(File.read(config_filepath))
40
+ options = config['options']
41
+ if options
42
+ val = options['onsite_only']
43
+ self.onsite_only = val if val
44
+ val = options['no_toc']
45
+ self.no_toc = val if val
46
+ end
47
+ regexp_sources = config['source_file_omits']
48
+ if regexp_sources
49
+ regexp_sources.each do |regexp_source|
50
+ self.source_file_omits.push(Regexp.new(regexp_source))
51
+ end
52
+ end
53
+ end
54
+ self.pages = {}
55
+ @counts = {
56
+ source_pages: 0,
57
+ target_pages: 0,
58
+ links_checked: 0,
59
+ links_broken: 0,
60
+ }
61
+ end
62
+
63
+ def check
64
+ # All work is done in the HTML directory,
65
+ # and that is where Report.htm will be put.
66
+ Dir.chdir(html_dirpath) do |dir|
67
+ @counts[:start_time] = Time.new
68
+ gather_source_paths
69
+ create_source_pages
70
+ create_target_pages
71
+ verify_links
72
+ @counts[:end_time] = Time.new
73
+ report
74
+ end
75
+ end
76
+
77
+ # Gather paths to source HTML pages.
78
+ def gather_source_paths
79
+ paths = []
80
+ paths = Find.find('.').select {|path| path.end_with?('.html') }
81
+ # Remove leading './'.
82
+ self.source_paths = paths.map{|path| path.sub(%r[^\./], '')}
83
+ source_file_omits.each do |re|
84
+ self.source_paths.delete_if do |source_path|
85
+ source_path.match(re)
86
+ end
87
+ end
88
+ @counts[:source_pages] = source_paths.size
89
+ end
90
+
91
+ # Create a source \Page object for each source path.
92
+ # Gather its links and ids.
93
+ def create_source_pages
94
+ source_paths.sort.each_with_index do |source_path, i|
95
+ source_page = Page.new(:source, source_path, onsite_only, pages: pages, counts: @counts)
96
+ pages[source_path] = source_page
97
+ source_page.content_type = 'text/html'
98
+ source_text = File.read(source_path)
99
+ doc = Nokogiri::HTML(source_text)
100
+ if source_path == 'table_of_contents.html'
101
+ source_page.gather_links(doc) unless no_toc
102
+ else
103
+ source_page.gather_links(doc)
104
+ end
105
+ source_page.gather_link_targets(doc)
106
+ end
107
+ end
108
+
109
+ # Create a target \Page object for each link
110
+ # (unless already created as a source page).
111
+ def create_target_pages
112
+ doc = nil
113
+ target_page_count = 0
114
+ source_paths = pages.keys
115
+ source_paths.each do |source_path|
116
+ # Needed for relative links to work.
117
+ dirname = File.dirname(source_path)
118
+ Dir.chdir(dirname) do
119
+ source_page = pages[source_path]
120
+ source_page.links.each_with_index do |link, i|
121
+ next if link.path.nil?
122
+ target_path = link.real_path
123
+ if pages[target_path]
124
+ target_page = pages[target_path]
125
+ else
126
+ target_page_count += 1
127
+ target_page = Page.new(:target, target_path, onsite_only, pages: pages, counts: @counts)
128
+ pages[target_path] = target_page
129
+ if File.readable?(link.path)
130
+ target_text = File.read(link.path)
131
+ doc = Nokogiri::HTML(target_text)
132
+ target_page.gather_link_targets(doc)
133
+ elsif RDocLinkChecker.checkable?(link.path)
134
+ link.exception = fetch(link.path, target_page)
135
+ link.valid_p = false if link.exception
136
+ else
137
+ # File not readable or checkable.
138
+ end
139
+ end
140
+ next if target_page.nil?
141
+ if link.has_fragment? && target_page.ids.empty?
142
+ doc || doc = Nokogiri::HTML(target_text)
143
+ target_page.gather_link_targets(doc)
144
+ end
145
+ end
146
+ end
147
+ end
148
+ @counts[:target_pages] = target_page_count
149
+ end
150
+
151
+ # Verify that each link target exists.
152
+ def verify_links
153
+ linking_pages = pages.select do |path, page|
154
+ !page.links.empty?
155
+ end
156
+ link_count = 0
157
+ broken_count = 0
158
+ linking_pages.each_pair do |path, page|
159
+ link_count += page.links.size
160
+ page.links.each_with_index do |link, i|
161
+ if link.valid_p.nil? # Don't disturb if already set to false.
162
+ target_page = pages[link.real_path]
163
+ if target_page
164
+ target_id = link.fragment
165
+ link.valid_p = target_id.nil? ||
166
+ target_page.ids.include?(target_id) ||
167
+ !target_page.content_type&.match('html')
168
+ else
169
+ link.valid_p = false
170
+ end
171
+ end
172
+ broken_count += 1 unless link.valid_p
173
+ end
174
+ end
175
+ @counts[:links_checked] = link_count
176
+ @counts[:links_broken] = broken_count
177
+ end
178
+
179
+ # Fetch the page from the web and gather its ids into the target page.
180
+ # Returns exception or nil.
181
+ def fetch(url, target_page)
182
+ code = 0
183
+ exception = nil
184
+ begin
185
+ response = Net::HTTP.get_response(URI(url))
186
+ code = response.code.to_i
187
+ target_page.code = code
188
+ target_page.content_type = response['Content-Type']
189
+ rescue => x
190
+ raise unless x.class.name.match(/^(Net|Socket|IO::TimeoutError|Errno::)/)
191
+ exception = RDocLinkChecker::HttpResponseError.new(url, x)
192
+ end
193
+ # Don't load if bad code, or no response, or if not html.
194
+ if !code_bad?(code)
195
+ if content_type_html?(response)
196
+ doc = Nokogiri::HTML(response.body)
197
+ target_page.gather_link_targets(doc)
198
+ end
199
+ end
200
+ exception
201
+ end
202
+
203
+ # Returns whether the code is bad (zero or >= 400).
204
+ def code_bad?(code)
205
+ return false if code.nil?
206
+ (code == 0) || (code >= 400)
207
+ end
208
+
209
+ # Returns whether the response body should be HTML.
210
+ def content_type_html?(response)
211
+ return false unless response
212
+ return false unless response['Content-Type']
213
+ response['Content-Type'].match('html')
214
+ end
215
+
216
+ # Returns whether the path is offsite.
217
+ def self.offsite?(path)
218
+ path.start_with?('http')
219
+ end
220
+
221
+ # Returns the string fragment for the given path or URL, or +nil+ if there is no fragment.
222
+ def self.get_fragment(s)
223
+ a = s.split('#', 2)
224
+ a.size == 2 ? a[1] : nil
225
+ end
226
+
227
+ # Returns whether the path is checkable.
228
+ def self.checkable?(path)
229
+ return false unless path
230
+ begin
231
+ uri = URI(path)
232
+ return ['http', 'https', nil].include?(uri.scheme)
233
+ rescue
234
+ return false
235
+ end
236
+ end
237
+
238
+ # Generate the report and write it to Report.htm.
239
+ def report
240
+
241
+ doc = Document.new('')
242
+ root = doc.add_element(Element.new('root'))
243
+
244
+ head = root.add_element(Element.new('head'))
245
+ title = head.add_element(Element.new('title'))
246
+ title.text = 'RDocLinkChecker Report'
247
+ style = head.add_element(Element.new('style'))
248
+ style.text = <<EOT
249
+ * { font-family: sans-serif }
250
+ .data { font-family: courier }
251
+ .center { text-align: center }
252
+ .good { color: rgb( 0, 97, 0); background-color: rgb(198, 239, 206) } /* Greenish */
253
+ .iffy { color: rgb(156, 101, 0); background-color: rgb(255, 235, 156) } /* Yellowish */
254
+ .bad { color: rgb(156, 0, 6); background-color: rgb(255, 199, 206) } /* Reddish */
255
+ .neutral { color: rgb( 0, 0, 0); background-color: rgb(217, 217, 214) } /* Grayish */
256
+ EOT
257
+
258
+ body = root.add_element(Element.new('body'))
259
+ h1 = body.add_element(Element.new('h1'))
260
+ h1.text = 'RDocLinkChecker Report'
261
+
262
+ add_summary(body)
263
+ add_broken_links(body)
264
+ add_offsite_links(body) unless onsite_only
265
+ report_file_path = 'Report.htm' # _Not_ .html.
266
+ doc.write(File.new(report_file_path, 'w'), 2)
267
+ end
268
+
269
+ def add_summary(body)
270
+ h2 = body.add_element(Element.new('h2'))
271
+ h2.text = 'Summary'
272
+
273
+ # Parameters table.
274
+ data = []
275
+ [
276
+ :html_dirpath,
277
+ :onsite_only,
278
+ :no_toc
279
+ ].each do |sym|
280
+ value = send(sym).inspect
281
+ row = {sym => :label, value => :good}
282
+ data.push(row)
283
+ end
284
+ table2(body, data, 'parameters', 'Parameters')
285
+ body.add_element(Element.new('p'))
286
+
287
+ # Times table.
288
+ elapsed_time = @counts[:end_time] - @counts[:start_time]
289
+ seconds = elapsed_time % 60
290
+ minutes = (elapsed_time / 60) % 60
291
+ hours = (elapsed_time/3600)
292
+ elapsed_time_s = "%2.2d:%2.2d:%2.2d" % [hours, minutes, seconds]
293
+ format = "%Y-%m-%d-%a-%H:%M:%SZ"
294
+ start_time_s = @counts[:start_time].strftime(format)
295
+ end_time_s = @counts[:end_time].strftime(format)
296
+ data = [
297
+ {'Start Time' => :label, start_time_s => :good},
298
+ {'End Time' => :label, end_time_s => :good},
299
+ {'Elapsed Time' => :label, elapsed_time_s => :good},
300
+ ]
301
+ table2(body, data, 'times', 'Times')
302
+ body.add_element(Element.new('p'))
303
+
304
+ # Counts.
305
+ data = [
306
+ {'Source Pages' => :label, @counts[:source_pages] => :good},
307
+ {'Target Pages' => :label, @counts[:target_pages] => :good},
308
+ {'Links Checked' => :label, @counts[:links_checked] => :good},
309
+ {'Links Broken' => :label, @counts[:links_broken] => :bad},
310
+ ]
311
+ table2(body, data, 'counts', 'Counts')
312
+ body.add_element(Element.new('p'))
313
+
314
+ end
315
+
316
# Adds the "Broken Links by Source Page" section to +body+:
#
# - +body+: the REXML element that receives the section.
#
# Writes the heading, then either the paragraph 'None.'
# (when <tt>@counts[:links_broken]</tt> is zero) or a legend followed by
# one div per page that has at least one link whose +valid_p+ is falsy.
# Each broken link is rendered as a two-column table by #table2.
def add_broken_links(body)
  h2 = body.add_element(Element.new('h2'))
  h2.text = 'Broken Links by Source Page'

  if @counts[:links_broken] == 0
    para = body.add_element('p')
    para.text = 'None.'
    return
  end

  # Legend.
  ul = body.add_element(Element.new('ul'))
  li = ul.add_element(Element.new('li'))
  li.text = 'Href: the href of the anchor element.'
  li = ul.add_element(Element.new('li'))
  li.text = 'Text: the text of the anchor element.'
  li = ul.add_element(Element.new('li'))
  li.text = 'Path: the URL or path of the link (not including the fragment):'
  ul2 = li.add_element(Element.new('ul'))
  li2 = ul2.add_element(Element.new('li'))
  li2.text = 'For an on-site link, an abbreviated path is given.'
  li2 = ul2.add_element(Element.new('li'))
  li2.text = <<~EOT
    For an off-site link, the full URL is given.
    If the path is reddish, the page was not found.
  EOT
  li = ul.add_element(Element.new('li'))
  li.text = <<~EOT
    Fragment: the fragment of the link.
    If the fragment is reddish, fragment was not found.
  EOT

  pages.each_pair do |path, page|
    broken_links = page.links.reject(&:valid_p)
    next if broken_links.empty?

    page_div = body.add_element(Element.new('div'))
    page_div.add_attribute('class', 'broken_page')
    page_div.add_attribute('path', path)
    page_div.add_attribute('count', broken_links.size.to_s)
    h3 = page_div.add_element(Element.new('h3'))
    a = Element.new('a')
    a.text = "#{path} (#{broken_links.size})"
    a.add_attribute('href', path)
    h3.add_element(a)

    broken_links.each do |link|
      link_div = page_div.add_element(Element.new('div'))
      link_div.add_attribute('class', 'broken_link')

      a = Element.new('a')
      a.text = link.href
      a.add_attribute('href', link.href)

      # A link with a fragment is reported with the fragment marked bad;
      # a link without one is reported with the path marked bad.
      fragment_p = !link.fragment.nil?

      # Each row is [label, label class, value, value class].
      # Arrays are used rather than two-pair hashes because a hash loses
      # the value whenever it equals the label (for example, a link whose
      # text is 'Text'); #table2 flattens either form the same way.
      rows = [
        ['Href', :label, a, :bad],
        ['Text', :label, link.text, :good],
        ['Path', :label, link.real_path, fragment_p ? :good : :bad],
        ['Fragment', :label, link.fragment, fragment_p ? :bad : :good],
      ]
      if link.exception
        rows.push(['Exception', :label, link.exception.class, :bad])
        rows.push(['Message', :label, link.exception.message, :bad])
      end

      id = link.exception ? 'bad_url' : 'bad_fragment'
      table2(link_div, rows, id)
      page_div.add_element(Element.new('p'))
    end
  end
end
388
+
389
# Adds the "Off-Site Links by Source Page" section to +body+:
#
# - +body+: the REXML element that receives the section.
#
# For each page having off-site links (other than the boilerplate URLs
# listed below), writes a heading linking to the page and one two-column
# table per link via #table2. The section heading carries the total
# number of such links; when there are none, the paragraph 'None.' is added.
def add_offsite_links(body)
  # The heading element is added first so that it precedes the per-page
  # content; its text is set last, once the total is known.
  h2 = body.add_element(Element.new('h2'))

  # Links that are not reported.
  boilerplate = %w[
    https://validator.w3.org/check/referer
    https://ruby.github.io/rdoc/
    http://deveiate.org/projects/Darkfish-RDoc/
    http://deveiate.org
  ]

  count = 0
  pages.each_pair do |path, page|
    offsite_links = page.links.select do |link|
      RDocLinkChecker.offsite?(link.href) && !boilerplate.include?(link.href)
    end
    next if offsite_links.empty?

    count += offsite_links.size

    h3 = body.add_element(Element.new('h3'))
    a = Element.new('a')
    a.text = "#{path} (#{offsite_links.size})"
    a.add_attribute('href', path)
    h3.add_element(a)

    offsite_links.each do |link|
      a = Element.new('a')
      a.text = link.href
      a.add_attribute('href', link.href)

      # Each row is [label, label class, value, value class].
      # Arrays are used rather than two-pair hashes because a hash loses
      # the value whenever it equals the label (for example, a link whose
      # text is 'Text'); #table2 flattens either form the same way.
      rows = [
        ['Href', :label, a, link.valid_p ? :good : :bad],
        ['Text', :label, link.text, :good],
      ]
      table2(body, rows)
      body.add_element(Element.new('p'))
    end
  end

  h2.text = "Off-Site Links by Source Page (#{count})"
  return unless count.zero?

  para = body.add_element(Element.new('p'))
  para.text = 'None.'
end
432
+
433
# Values for the 'class' attribute of the table cells built by #table2,
# keyed by the cell-kind symbols that appear in its row data.
# Frozen so that the shared constant cannot be modified by accident.
Classes = {
  label: 'label center neutral',
  good: 'data center good',
  iffy: 'data center iffy',
  bad: 'data center bad',
}.freeze
439
+
440
# Adds a two-column table to +parent+:
#
# - +parent+: the REXML element that receives the table.
# - +data+: array of rows; each row flattens to
#   <tt>[label, label_class, value, value_class]</tt>, so a row may be a
#   4-element array or a two-pair hash (<tt>{label => label_class, value => value_class}</tt>).
#   The class entries are keys of Classes; +value+ may be a REXML::Element,
#   which is added as a child of the cell instead of being set as its text.
# - +id+: optional value for the table's 'id' attribute.
# - +title+: optional title (text or REXML::Element), written in a header
#   row that spans both columns.
def table2(parent, data, id = nil, title = nil)
  table = parent.add_element(REXML::Element.new('table'))
  table.add_attribute('id', id) if id

  if title
    tr = table.add_element(REXML::Element.new('tr'))
    th = tr.add_element(REXML::Element.new('th'))
    th.add_attribute('colspan', '2')
    if title.kind_of?(REXML::Element)
      th.add_element(title)
    else
      th.text = title
    end
  end

  data.each do |row|
    label, label_class, value, value_class = row.flatten
    tr = table.add_element(REXML::Element.new('tr'))

    td = tr.add_element(REXML::Element.new('td'))
    td.text = label
    td.add_attribute('class', Classes[label_class])

    td = tr.add_element(REXML::Element.new('td'))
    if value.kind_of?(REXML::Element)
      td.add_element(value)
    else
      td.text = value
    end
    td.add_attribute('class', Classes[value_class])
  end
end
469
+
470
# Base class for the checker's error classes.
# Derives from StandardError so that instances can be raised and rescued.
class Error < StandardError; end

# Describes an exception that was raised while checking
# page availability with Net::HTTP.
class HttpResponseError < Error

  attr_accessor :url, :x

  # Returns a new \HttpResponseError object:
  #
  # - +url+: the URL that was being checked.
  # - +x+: the exception that was raised; must respond to +message+.
  #
  def initialize(url, x)
    super()
    self.url = url
    self.x = x
  end

  # Returns a multi-line description giving the URL
  # and the class and message of the underlying exception.
  def message
    <<~EOT
      #{self.class.name}:
      An exception was raised when checking page availability with Net::HTTP:
      Url: #{url}
      Class: #{x.class}
      Message: #{x.message}
    EOT
  end

end

# Describes a page whose return code was not 200.
class HttpStatusCodeError < Error

  attr_accessor :url, :code

  # Returns a new \HttpStatusCodeError object:
  #
  # - +url+: the URL that was being checked.
  # - +code+: the return code received for the page.
  #
  def initialize(url, code)
    super()
    self.url = url
    self.code = code
  end

  # Returns a multi-line description giving the URL and the return code.
  def message
    <<~EOT
      #{self.class.name}:
      The return code for the page was not 200:
      Url: #{url}
      Return code: #{code}
    EOT
  end

end
512
+
513
# Class to represent a page.
class Page

  attr_accessor :path, :type, :pages, :counts, :code,
                :links, :ids, :dirname, :onsite_only, :content_type

  # Returns a new \Page object:
  #
  # - +type+: the type of the page; stored as given.
  # - +path+: a path relative to the HTML directory (if on-site)
  #   or a URL (if off-site).
  # - +onsite_only+: whether off-site links are to be ignored by #gather_links.
  # - +pages+: hash of path/page pairs.
  # - +counts+: hash of counts.
  #
  def initialize(type, path, onsite_only, pages: {}, counts: {})
    self.path = path
    self.type = type
    self.pages = pages
    self.counts = counts
    self.onsite_only = onsite_only
    self.code = nil
    self.links = []
    self.ids = []
    # A page in the top directory has an empty dirname, not '.'.
    dir = File.dirname(path)
    self.dirname = dir == '.' ? '' : dir
  end

  # Returns a hash of the page's path, type, dirname, and code.
  def to_h
    {
      path: path,
      type: type,
      dirname: dirname,
      code: code
    }
  end

  # Gather links for the page:
  #
  # - +doc+: Nokogiri document to be parsed for links.
  #
  # A Link is added to +links+ for each anchor that has a non-empty href,
  # is checkable, and has a non-empty path part; off-site links are
  # skipped when +onsite_only+ is true.
  def gather_links(doc)
    # The links are in the anchors.
    doc.search('a').each do |a|
      # Ignore pilcrow (paragraph character) and up-arrow.
      next if a.text == "\u00B6"
      next if a.text == "\u2191"

      href = a.attr('href')
      next if href.nil? || href.empty?
      next if RDocLinkChecker.offsite?(href) && onsite_only
      next unless RDocLinkChecker.checkable?(href)

      link = Link.new(href, a.text, dirname)
      # A fragment-only href has an empty path.
      next if link.path.nil? || link.path.empty?

      links.push(link)
    end
  end

  # Gather link targets for the page.
  # +doc+ is the Nokogiri document to be parsed.
  # The targets are pushed onto +ids+ as strings.
  def gather_link_targets(doc)
    # Don't do twice (some pages are both source and target).
    return unless ids.empty?

    # For off-site, gather all ids, regardless of element.
    if RDocLinkChecker.offsite?(path)
      doc.xpath("//*[@id]").each do |element|
        ids.push(element.attr('id'))
      end
      doc.xpath("//*[@name]").each do |element|
        ids.push(element.attr('name'))
      end
      # A same-page href ('#foo') is also taken as a target ('foo').
      doc.xpath("//a[@href]").each do |element|
        href = element.attr('href')
        next unless href.start_with?('#')
        ids.push(href.sub('#', ''))
      end
      return
    end

    # We're on-site, which means that the page is RDoc-generated
    # and we know what to expect.
    # In theory, an author can link to any element that has an attribute :id.
    # In practice, gathering all such elements is very time-consuming.
    # These are the elements currently linked to:
    #
    # - body
    # - a
    # - div
    # - dt
    # - h*
    #
    # We can add more as needed (i.e., if/when we have actual broken links).

    body = doc.at('//body')
    # Nothing to gather from a document that has no body.
    return if body.nil?

    # body element has 'top', which is a link target.
    # Use attr (the String value), not attribute (the attribute node),
    # so that the id can be compared with a link's fragment.
    id = body.attr('id')
    ids.push(id) if id

    # Some ids are in the as (anchors).
    body.search('a').each do |a|
      id = a.attr('id')
      ids.push(id) if id
    end

    # Method ids are in divs, but gather only method-detail divs.
    body.search('div').each do |div|
      class_ = div.attr('class')
      next if class_.nil?
      next unless class_.match('method-')
      id = div.attr('id')
      ids.push(id) if id
    end

    # Constant ids are in dts.
    body.search('dt').each do |dt|
      id = dt.attr('id')
      ids.push(id) if id
    end

    # Label ids are in headings.
    %w[h1 h2 h3 h4 h5 h6].each do |tag|
      body.search(tag).each do |h|
        id = h.attr('id')
        ids.push(id) if id
      end
    end
  end

end
648
+
649
# Class to represent a link.
class Link

  attr_accessor :href, :text, :dirname, :path, :fragment, :valid_p, :real_path, :exception

  # Returns a new \Link object:
  #
  # - +href+: attribute href from anchor element.
  # - +text+: attribute text from anchor element.
  # - +dirname+: directory path of the linking page.
  #
  # The href is split at its first '#' into +path+ and +fragment+
  # (+fragment+ is +nil+ when the href has no '#');
  # +valid_p+ and +exception+ start out as +nil+.
  def initialize(href, text, dirname)
    self.href = href
    self.text = text
    self.dirname = dirname
    self.path, self.fragment = href.split('#', 2)
    self.valid_p = nil
    self.real_path = make_real_path(dirname, path)
    self.exception = nil
  end

  # Returns a hash of the link's href and text.
  def to_h
    {
      href: href,
      text: text,
    }
  end

  # Return the real (not relative) path of the link:
  #
  # - +dirname+: directory path of the linking page (+nil+ or empty for the top directory).
  # - +path+: the path part of the href.
  #
  # A URL with a scheme (such as 'https://') is returned unchanged.
  # Otherwise a leading './' is trimmed, each leading '../' removes one
  # trailing component of +dirname+, and the remainder is joined to
  # what is left of +dirname+.
  def make_real_path(dirname, path)
    return path if path.nil?
    # Off-site: the full URL is the real path.
    return path if path.match?(%r{\A[a-z][a-z0-9+.-]*://}i)

    # Trim single dot.
    path = path.sub('./', '') if path.start_with?('./')
    return path if dirname.nil? || dirname.empty?

    # May have one or more leading '../'.
    # Count them literally; in a regexp, the dots of '../' would match any character.
    up_dir = '../'
    prefix = path[%r{\A(?:\.\./)+}].to_s
    levels = prefix.length / up_dir.length
    dirs = dirname.split('/')
    if levels == 0
      dirs.empty? ? path : File.join(dirname, path)
    else
      # Remove leading '../' elements.
      remainder = path[prefix.length..-1]
      # Remove the corresponding parts of dirname.
      dirs.pop(levels)
      return remainder if dirs.empty?
      File.join(dirs.join('/'), remainder)
    end
  end

  # Returns whether the link has a fragment.
  def has_fragment?
    fragment ? true : false
  end

  # Puts link info onto $stdout.
  # +i+ is the number shown in the first line of the output.
  def puts(i)
    $stdout.puts <<~EOT
      Link #{i}:
      Href: #{href}
      Text: #{text}
      Path: #{path}
      Fragment: #{fragment}
      Valid: #{valid_p}
      Real path: #{real_path}
      Dirname: #{dirname}
    EOT
  end
end
721
+
722
+ end