rdoc_link_checker 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,670 +1,716 @@
1
- # frozen_string_literal: true
2
-
3
- require 'nokogiri'
4
- require 'rexml/document'
5
- require 'find'
6
- require 'net/http'
7
-
8
- require_relative 'rdoc_link_checker/version'
9
-
10
- class RDocLinkChecker
11
-
12
- include REXML
13
-
14
- attr_accessor :html_dirpath, :onsite_only, :no_toc
15
-
16
- attr_accessor :source_paths, :pages
17
-
18
- def initialize(
19
- html_dirpath,
20
- onsite_only: false,
21
- no_toc: false
22
- )
23
- self.html_dirpath = html_dirpath
24
- self.onsite_only = onsite_only
25
- self.no_toc = no_toc
26
- self.pages = {}
27
- @counts = {
28
- source_pages: 0,
29
- target_pages: 0,
30
- links_checked: 0,
31
- links_broken: 0,
32
- }
33
- end
34
-
35
- def check
36
- # All work is done in the HTML directory,
37
- # and that is where Report.htm will be put.
38
- Dir.chdir(html_dirpath) do |dir|
39
- @counts[:start_time] = Time.new
40
- gather_source_paths
41
- create_source_pages
42
- create_target_pages
43
- verify_links
44
- @counts[:end_time] = Time.new
45
- report
46
- end
47
- end
48
-
49
- # Gather paths to source HTML pages.
50
- def gather_source_paths
51
- paths = []
52
- paths = Find.find('.').select {|path| path.end_with?('.html') }
53
- # Remove leading './'.
54
- self.source_paths = paths.map{|path| path.sub(%r[^\./], '')}
55
- @counts[:source_pages] = source_paths.size
56
- end
57
-
58
- # Create a source \Page object for each source path.
59
- # Gather its links and ids.
60
- def create_source_pages
61
- source_paths.sort.each_with_index do |source_path, i|
62
- progress_s = RDocLinkChecker.progress_s(i + 1, source_paths.size)
63
- source_page = Page.new(:source, source_path, onsite_only, pages: pages, counts: @counts)
64
- pages[source_path] = source_page
65
- source_text = File.read(source_path)
66
- doc = Nokogiri::HTML(source_text)
67
- source_page.gather_links(doc) unless no_toc
68
- source_page.gather_ids(doc)
69
- end
70
- end
71
-
72
- # Create a target \Page object for each link
73
- # (unless already created as a source page).
74
- def create_target_pages
75
- doc = nil
76
- target_page_count = 0
77
- source_paths = pages.keys
78
- source_paths.each do |source_path|
79
- # Need for relative links to work.
80
- dirname = File.dirname(source_path)
81
- Dir.chdir(dirname) do
82
- source_page = pages[source_path]
83
- source_page.links.each_with_index do |link, i|
84
- next if link.path.nil?
85
- target_path = link.real_path
86
- if pages[target_path]
87
- target_page = pages[target_path]
88
- else
89
- target_page_count += 1
90
- target_page = Page.new(:target, target_path, onsite_only, pages: pages, counts: @counts)
91
- pages[target_path] = target_page
92
- if File.readable?(link.path)
93
- target_text = File.read(link.path)
94
- doc = Nokogiri::HTML(target_text)
95
- target_page.gather_ids(doc)
96
- elsif RDocLinkChecker.checkable?(link.path)
97
- link.exception = fetch(link.path, target_page)
98
- link.valid_p = false if link.exception
99
- else
100
- # File not readable or checkable.
101
- end
102
- end
103
- next if target_page.nil?
104
- if link.has_fragment? && target_page.ids.empty?
105
- doc || doc = Nokogiri::HTML(target_text)
106
- target_page.gather_ids(doc)
107
- end
108
- end
109
- end
110
- end
111
- @counts[:target_pages] = target_page_count
112
- end
113
-
114
- # Verify that each link target exists.
115
- def verify_links
116
- linking_pages = pages.select do |path, page|
117
- !page.links.empty?
118
- end
119
- link_count = 0
120
- broken_count = 0
121
- linking_pages.each_pair do |path, page|
122
- link_count += page.links.size
123
- page.links.each_with_index do |link, i|
124
- if link.valid_p.nil? # Don't disturb if already set to false.
125
- target_page = pages[link.real_path]
126
- if target_page
127
- target_id = link.fragment
128
- link.valid_p = target_id.nil? || target_page.ids.include?(target_id)
129
- else
130
- link.valid_p = false
131
- end
132
- end
133
- broken_count += 1 unless link.valid_p
134
- end
135
- end
136
- @counts[:links_checked] = link_count
137
- @counts[:links_broken] = broken_count
138
- end
139
-
140
- # Fetch the page from the web and gather its ids into the target page.
141
- # Returns exception or nil.
142
- def fetch(url, target_page)
143
- code = 0
144
- exception = nil
145
- begin
146
- response = Net::HTTP.get_response(URI(url))
147
- code = response.code.to_i
148
- target_page.code = code
149
- rescue => x
150
- raise unless x.class.name.match(/^(Net|SocketError|IO::TimeoutError|Errno::)/)
151
- exception = RDocLinkChecker::HttpResponseError.new(url, x)
152
- end
153
- # Don't load if bad code, or no response, or if not html.
154
- if !code_bad?(code)
155
- if content_type_html?(response)
156
- doc = Nokogiri::HTML(response.body)
157
- target_page.gather_ids(doc)
158
- end
159
- end
160
- exception
161
- end
162
-
163
- # Returns whether the code is bad (zero or >= 400).
164
- def code_bad?(code)
165
- return false if code.nil?
166
- (code == 0) || (code >= 400)
167
- end
168
-
169
- # Returns whether the response body should be HTML.
170
- def content_type_html?(response)
171
- return false unless response
172
- return false unless response['Content-Type']
173
- response['Content-Type'].match('html')
174
- end
175
-
176
- # Returns whether the path is offsite.
177
- def self.offsite?(path)
178
- path.start_with?('http')
179
- end
180
-
181
- # Returns the string fragment for the given path or ULR, or +nil+
182
- def self.get_fragment(s)
183
- a = s.split('#', 2)
184
- a.size == 2 ? a[1] : nil
185
- end
186
-
187
- # Returns a progress string giving a fraction and percentage.
188
- def self.progress_s(i, total)
189
- fraction_s = "#{i}/#{total}"
190
- percent_i = (i*100.0/total).round
191
- "(#{fraction_s}, #{percent_i}%)"
192
- end
193
-
194
- # Returns whether the path is checkable.
195
- def self.checkable?(path)
196
- return false unless path
197
- begin
198
- uri = URI(path)
199
- return ['http', 'https', nil].include?(uri.scheme)
200
- rescue
201
- return false
202
- end
203
- end
204
-
205
- # Generate the report; +checker+ is the \RDocLinkChecker object.
206
- def report
207
-
208
- doc = Document.new('')
209
- root = doc.add_element(Element.new('root'))
210
-
211
- head = root.add_element(Element.new('head'))
212
- title = head.add_element(Element.new('title'))
213
- title.text = 'RDocLinkChecker Report'
214
- style = head.add_element(Element.new('style'))
215
- style.text = <<EOT
216
- * { font-family: sans-serif }
217
- .data { font-family: courier }
218
- .center { text-align: center }
219
- .good { color: rgb( 0, 97, 0); background-color: rgb(198, 239, 206) } /* Greenish */
220
- .iffy { color: rgb(156, 101, 0); background-color: rgb(255, 235, 156) } /* Yellowish */
221
- .bad { color: rgb(156, 0, 6); background-color: rgb(255, 199, 206) } /* Reddish */
222
- .neutral { color: rgb( 0, 0, 0); background-color: rgb(217, 217, 214) } /* Grayish */
223
- EOT
224
-
225
- body = root.add_element(Element.new('body'))
226
- h1 = body.add_element(Element.new('h1'))
227
- h1.text = 'RDocLinkChecker Report'
228
-
229
- add_summary(body)
230
- add_broken_links(body)
231
- # add_offsite_links(body) unless onsite_only
232
- report_file_path = 'Report.htm' # _Not_ .html.
233
- doc.write(File.new(report_file_path, 'w'), 2)
234
- end
235
-
236
- def add_summary(body)
237
- h2 = body.add_element(Element.new('h2'))
238
- h2.text = 'Summary'
239
-
240
- # Parameters table.
241
- data = []
242
- [
243
- :html_dirpath,
244
- :onsite_only,
245
- :no_toc
246
- ].each do |sym|
247
- value = send(sym).inspect
248
- row = {sym => :label, value => :good}
249
- data.push(row)
250
- end
251
- table2(body, data, 'parameters', 'Parameters')
252
- body.add_element(Element.new('p'))
253
-
254
- # Times table.
255
- elapsed_time = @counts[:end_time] - @counts[:start_time]
256
- seconds = elapsed_time % 60
257
- minutes = (elapsed_time / 60) % 60
258
- hours = (elapsed_time/3600)
259
- elapsed_time_s = "%2.2d:%2.2d:%2.2d" % [hours, minutes, seconds]
260
- format = "%Y-%m-%d-%a-%H:%M:%SZ"
261
- start_time_s = @counts[:start_time].strftime(format)
262
- end_time_s = @counts[:end_time].strftime(format)
263
- data = [
264
- {'Start Time' => :label, start_time_s => :good},
265
- {'End Time' => :label, end_time_s => :good},
266
- {'Elapsed Time' => :label, elapsed_time_s => :good},
267
- ]
268
- table2(body, data, 'times', 'Times')
269
- body.add_element(Element.new('p'))
270
-
271
- # Counts.
272
- data = [
273
- {'Source Pages' => :label, @counts[:source_pages] => :good},
274
- {'Target Pages' => :label, @counts[:target_pages] => :good},
275
- {'Links Checked' => :label, @counts[:links_checked] => :good},
276
- {'Links Broken' => :label, @counts[:links_broken] => :bad},
277
- ]
278
- table2(body, data, 'counts', 'Counts')
279
- body.add_element(Element.new('p'))
280
-
281
- end
282
-
283
- def add_broken_links(body)
284
- h2 = body.add_element(Element.new('h2'))
285
- h2.text = 'Broken Links by Source Page'
286
-
287
- if @counts[:links_broken] == 0
288
- p = body.add_element('p')
289
- p.text = 'None.'
290
- return
291
- end
292
-
293
- # Legend.
294
- ul = body.add_element(Element.new('ul'))
295
- li = ul.add_element(Element.new('li'))
296
- li.text = 'Href: the href of the anchor element.'
297
- li = ul.add_element(Element.new('li'))
298
- li.text = 'Text: the text of the anchor element.'
299
- li = ul.add_element(Element.new('li'))
300
- li.text = 'Path: the URL or path of the link (not including the fragment):'
301
- ul2 = li.add_element(Element.new('ul'))
302
- li2 = ul2.add_element(Element.new('li'))
303
- li2.text = 'For an on-site link, an abbreviated path is given.'
304
- li2 = ul2.add_element(Element.new('li'))
305
- li2.text = <<EOT
306
- For an off-site link, the full URL is given.
307
- If the path is reddish, the page was not found.
308
- EOT
309
- li = ul.add_element(Element.new('li'))
310
- li.text = <<EOT
311
- Fragment: the fragment of the link.
312
- If the fragment is reddish, fragment was not found.
313
- EOT
314
-
315
- pages.each_pair do |path, page|
316
- broken_links = page.links.select {|link| !link.valid_p }
317
- next if broken_links.empty?
318
-
319
- page_div = body.add_element(Element.new('div'))
320
- page_div.add_attribute('class', 'broken_page')
321
- page_div.add_attribute('path', path)
322
- page_div.add_attribute('count', broken_links.count)
323
- h3 = page_div.add_element(Element.new('h3'))
324
- a = Element.new('a')
325
- a.text = "#{path} (#{broken_links.count})"
326
- a.add_attribute('href', path)
327
- h3.add_element(a)
328
-
329
- broken_links.each do |link|
330
- link_div = page_div.add_element(Element.new('div'))
331
- link_div.add_attribute('class', 'broken_link')
332
- data = []
333
- # Text, URL, fragment
334
- a = Element.new('a')
335
- a.text = link.href
336
- a.add_attribute('href', link.href)
337
- data.push({'Href' => :label, a => :bad})
338
- data.push({'Text' => :label, link.text => :good})
339
- fragment_p = !link.fragment.nil?
340
- class_ = fragment_p ? :good : :bad
341
- data.push({'Path' => :label, link.real_path => class_})
342
- class_ = fragment_p ? :bad : :good
343
- data.push({'Fragment' => :label, link.fragment => class_})
344
- if link.exception
345
- data.push({'Exception' => :label, link.exception.class => :bad})
346
- data.push({'Message' => :label, link.exception.message => :bad})
347
- end
348
- id = link.exception ? 'bad_url' : 'bad_fragment'
349
- table2(link_div, data, id)
350
- page_div.add_element(Element.new('p'))
351
- end
352
- end
353
-
354
- end
355
-
356
- def add_offsite_links(body)
357
- h2 = body.add_element(Element.new('h2'))
358
- h2.text = 'Off-Site Links by Source Page'
359
- none = true
360
- pages.each_pair do |path, page|
361
- offsite_links = page.links.select do |link|
362
- RDocLinkChecker.offsite?(link.href)
363
- end
364
- next if offsite_links.empty?
365
-
366
- none = false
367
- h3 = body.add_element(Element.new('h3'))
368
- a = Element.new('a')
369
- a.text = path
370
- a.add_attribute('href', path)
371
- h3.add_element(a)
372
-
373
- offsite_links.each do |link|
374
- data = []
375
- # Text, URL, fragment
376
- a = Element.new('a')
377
- a.text = link.href
378
- a.add_attribute('href', link.href)
379
- class_ = link.valid_p ? :good : :bad
380
- data.push({'Href' => :label, a => class_})
381
- data.push({'Text' => :label, link.text => :good})
382
- table2(body, data)
383
- body.add_element(Element.new('p'))
384
- end
385
- end
386
- if none
387
- p = body.add_element(Element.new('p'))
388
- p.text = 'None.'
389
- end
390
- end
391
-
392
- Classes = {
393
- label: 'label center neutral',
394
- good: 'data center good',
395
- iffy: 'data center iffy',
396
- bad: 'data center bad',
397
- }
398
-
399
- def table2(parent, data, id, title = nil)
400
- data = data.dup
401
- table = parent.add_element(Element.new('table'))
402
- table.add_attribute('id', id)
403
- if title
404
- tr = table.add_element(Element.new('tr)'))
405
- th = tr.add_element(Element.new('th'))
406
- th.add_attribute('colspan', 2)
407
- if title.kind_of?(REXML::Element)
408
- th.add_element(title)
409
- else
410
- th.text = title
411
- end
412
- end
413
- data.each do |row_h|
414
- label, label_class, value, value_class = row_h.flatten
415
- tr = table.add_element(Element.new('tr'))
416
- td = tr.add_element(Element.new('td'))
417
- td.text = label
418
- td.add_attribute('class', Classes[label_class])
419
- td = tr.add_element(Element.new('td'))
420
- if value.kind_of?(REXML::Element)
421
- td.add_element(value)
422
- else
423
- td.text = value
424
- end
425
- td.add_attribute('class', Classes[value_class])
426
- end
427
- end
428
-
429
- class Error; end
430
-
431
- class HttpResponseError < Error
432
-
433
- attr_accessor :url, :x
434
-
435
- def initialize(url, x)
436
- self.url = url
437
- self.x = x
438
- end
439
-
440
- def message
441
- <<EOT
442
- #{self.class.name}:
443
- An exception was raised when checking page availability with Net::HTTP:
444
- Url: #{url}
445
- Class: #{x.class}
446
- Message: #{x.message}
447
- EOT
448
- end
449
-
450
- end
451
-
452
- class HttpStatusCodeError < Error
453
-
454
- attr_accessor :url, :code
455
-
456
- def initialize(url, code)
457
- self.url = url
458
- self.code = code
459
- end
460
-
461
- def message
462
- <<EOT
463
- #{self.class.name}:
464
- The return code for the page was not 200:
465
- Url: #{url}
466
- Return code: #{code}
467
- EOT
468
- end
469
-
470
- end
471
-
472
- # Class to represent a page.
473
- class Page
474
-
475
- attr_accessor :path, :type, :pages, :counts, :code, :links, :ids, :dirname, :onsite_only
476
-
477
- # Returns a new \Page object:
478
- #
479
- # - +path+: a path relative to the HTML directory (if on-site)
480
- # or a URL (if off-site).
481
- # - +pages+: hash of path/page pairs.
482
- # - +counts+: hash of counts.
483
- #
484
- def initialize(type, path, onsite_only, pages: {}, counts: {})
485
- self.path = path
486
- self.type = type
487
- self.pages = pages
488
- self.counts = counts
489
- self.onsite_only = onsite_only
490
- self.code = nil
491
- self.links = []
492
- self.ids = []
493
- self.dirname = File.dirname(path)
494
- self.dirname = self.dirname == '.' ? '' : dirname
495
- end
496
-
497
- def to_h
498
- {
499
- path: path,
500
- type: type,
501
- dirname: dirname,
502
- code: code
503
- }
504
- end
505
-
506
- # Gather links for the page:
507
- #
508
- # - +doc+: Nokogiri document to be parsed for links.
509
- #
510
- def gather_links(doc)
511
- i = 0
512
- # The links are in the anchors.
513
- doc.search('a').each do |a|
514
- # Ignore pilcrow (paragraph character) and up-arrow.
515
- next if a.text == "\u00B6"
516
- next if a.text == "\u2191"
517
-
518
- href = a.attr('href')
519
- next if href.nil? or href.empty?
520
- next if RDocLinkChecker.offsite?(href) && onsite_only
521
- next unless RDocLinkChecker.checkable?(href)
522
-
523
- link = Link.new(href, a.text, dirname)
524
- next if link.path.nil? || link.path.empty?
525
-
526
- links.push(link)
527
- i += 1
528
- end
529
- end
530
-
531
- # Gather ids for the page.
532
- # +doc+ is the Nokogiri document to be parsed.
533
- def gather_ids(doc)
534
- # Don't do twice (some pages are both source and target).
535
- return unless ids.empty?
536
-
537
- # For off-site, gather all ids, regardless of element.
538
- if RDocLinkChecker.offsite?(path)
539
- doc.xpath("//*[@id]").each do |element|
540
- id = element.attr('id')
541
- ids.push(id)
542
- end
543
- return
544
- end
545
-
546
- # We're on-site, which means that the page is RDoc-generated
547
- # and we know what to expect.
548
- # In theory, an author can link to any element that has an attribute :id.
549
- # In practice, gathering all such elements is very time-consuming.
550
- # These are the elements currently linked to:
551
- #
552
- # - body
553
- # - a
554
- # - div
555
- # - dt
556
- # - h*
557
- #
558
- # We can add more as needed (i.e., if/when we have actual broken links).
559
-
560
- # body element has 'top', which is a link target.
561
- body = doc.at('//body')
562
- id = body.attribute('id')
563
- ids.push(id) if id
564
-
565
- # Some ids are in the as (anchors).
566
- body.search('a').each do |a|
567
- id = a.attr(id)
568
- ids.push(id) if id
569
- end
570
-
571
- # Method ids are in divs, but gather only method-detail divs.
572
- body.search('div').each do |div|
573
- class_ = div.attr('class')
574
- next if class_.nil?
575
- next unless class_.match('method-')
576
- id = div.attr('id')
577
- ids.push(id) if id
578
- end
579
-
580
- # Constant ids are in dts.
581
- body.search('dt').each do |dt|
582
- id = dt.attr('id')
583
- ids.push(id) if id
584
- end
585
-
586
- # Label ids are in headings.
587
- %w[h1 h2 h3 h4 h5 h6].each do |tag|
588
- body.search(tag).each do |h|
589
- id = h.attr('id')
590
- ids.push(id) if id
591
- end
592
- end
593
- end
594
-
595
- end
596
-
597
- # Class to represent a link.
598
- class Link
599
-
600
- attr_accessor :href, :text, :dirname, :path, :fragment, :valid_p, :real_path, :exception
601
-
602
- # Returns a new \Link object:
603
- #
604
- # - +href+: attribute href from anchor element.
605
- # - +text+: attribute text from anchor element.
606
- # - +dirname+: directory path of the linking page.
607
- #
608
- def initialize(href, text, dirname)
609
- self.href = href
610
- self.text = text
611
- self.dirname = dirname
612
- path, fragment = href.split('#', 2)
613
- self.path = path
614
- self.fragment = fragment
615
- self.valid_p = nil
616
- self.real_path = make_real_path(dirname, path)
617
- self.exception = nil
618
- end
619
-
620
- def to_h
621
- {
622
- href: href,
623
- text: text,
624
- }
625
- end
626
-
627
- # Return the real (not relative) path of the link.
628
- def make_real_path(dirname, path)
629
- # Trim single dot.
630
- return path.sub('./', '') if path.start_with?('./')
631
- return path if dirname.nil? || dirname.empty?
632
-
633
- # May have one or more leading '../'.
634
- up_dir = '../'
635
- levels = path.scan(/(?=#{up_dir})/).count
636
- dirs = dirname.split('/')
637
- if levels == 0
638
- dirs.empty? ? path : File.join(dirname, path)
639
- else
640
- # Remove leading '../' elements.
641
- path = path.gsub(%r[\.\./], '')
642
- # Remove the corresponding parts of dirname.
643
- dirs.pop(levels)
644
- return path if dirs.empty?
645
- dirname = dirs.join('/')
646
- File.join(dirname, path)
647
- end
648
- end
649
-
650
- # Returns whether the link has a fragment.
651
- def has_fragment?
652
- fragment ? true : false
653
- end
654
-
655
- # Puts link info onto $stdout.
656
- def puts(i)
657
- $stdout.puts <<EOT
658
- Link #{i}:
659
- Href: #{href}
660
- Text: #{text}
661
- Path: #{path}
662
- Fragment: #{fragment}
663
- Valid: #{valid_p}
664
- Real path: #{real_path}
665
- Dirname: #{dirname}
666
- EOT
667
- end
668
- end
669
-
670
- end
1
+ # frozen_string_literal: true
2
+
3
+ require 'rubygems'
4
+ require 'nokogiri'
5
+ require 'rexml/document'
6
+ require 'find'
7
+ require 'net/http'
8
+ require 'json'
9
+
10
+ require_relative 'rdoc_link_checker/version'
11
+
12
+ class RDocLinkChecker
13
+
14
+ include REXML
15
+
16
+ attr_accessor :html_dirpath, :config_filepath, :onsite_only, :no_toc,
17
+ :source_file_omits
18
+
19
+ attr_accessor :source_paths, :pages
20
+
21
+ def initialize(
22
+ html_dirpath,
23
+ config_filepath: nil,
24
+ onsite_only: false,
25
+ no_toc: false
26
+ )
27
+ self.html_dirpath = html_dirpath
28
+ self.config_filepath = config_filepath
29
+ self.onsite_only = onsite_only
30
+ self.no_toc = no_toc
31
+ self.source_file_omits = []
32
+ if config_filepath
33
+ config = JSON.parse(File.read(config_filepath))
34
+ options = config['options']
35
+ if options
36
+ val = options['onsite_only']
37
+ self.onsite_only = val if val
38
+ val = options['no_toc']
39
+ self.no_toc = val if val
40
+ end
41
+ regexp_sources = config['source_file_omits']
42
+ if regexp_sources
43
+ regexp_sources.each do |regexp_source|
44
+ self.source_file_omits.push(Regexp.new(regexp_source))
45
+ end
46
+ end
47
+ end
48
+ self.pages = {}
49
+ @counts = {
50
+ source_pages: 0,
51
+ target_pages: 0,
52
+ links_checked: 0,
53
+ links_broken: 0,
54
+ }
55
+ end
56
+
57
+ def check
58
+ # All work is done in the HTML directory,
59
+ # and that is where Report.htm will be put.
60
+ Dir.chdir(html_dirpath) do |dir|
61
+ @counts[:start_time] = Time.new
62
+ gather_source_paths
63
+ create_source_pages
64
+ create_target_pages
65
+ verify_links
66
+ @counts[:end_time] = Time.new
67
+ report
68
+ end
69
+ end
70
+
71
+ # Gather paths to source HTML pages.
72
+ def gather_source_paths
73
+ paths = []
74
+ paths = Find.find('.').select {|path| path.end_with?('.html') }
75
+ # Remove leading './'.
76
+ self.source_paths = paths.map{|path| path.sub(%r[^\./], '')}
77
+ source_file_omits.each do |re|
78
+ self.source_paths.delete_if do |source_path|
79
+ source_path.match(re)
80
+ end
81
+ end
82
+ @counts[:source_pages] = source_paths.size
83
+ end
84
+
85
+ # Create a source \Page object for each source path.
86
+ # Gather its links and ids.
87
+ def create_source_pages
88
+ source_paths.sort.each_with_index do |source_path, i|
89
+ source_page = Page.new(:source, source_path, onsite_only, pages: pages, counts: @counts)
90
+ pages[source_path] = source_page
91
+ source_page.content_type = 'text/html'
92
+ source_text = File.read(source_path)
93
+ doc = Nokogiri::HTML(source_text)
94
+ if source_path == 'table_of_contents.html'
95
+ source_page.gather_links(doc) unless no_toc
96
+ else
97
+ source_page.gather_links(doc)
98
+ end
99
+ source_page.gather_link_targets(doc)
100
+ end
101
+ end
102
+
103
+ # Create a target \Page object for each link
104
+ # (unless already created as a source page).
105
+ def create_target_pages
106
+ doc = nil
107
+ target_page_count = 0
108
+ source_paths = pages.keys
109
+ source_paths.each do |source_path|
110
+ # Need for relative links to work.
111
+ dirname = File.dirname(source_path)
112
+ Dir.chdir(dirname) do
113
+ source_page = pages[source_path]
114
+ source_page.links.each_with_index do |link, i|
115
+ next if link.path.nil?
116
+ target_path = link.real_path
117
+ if pages[target_path]
118
+ target_page = pages[target_path]
119
+ else
120
+ target_page_count += 1
121
+ target_page = Page.new(:target, target_path, onsite_only, pages: pages, counts: @counts)
122
+ pages[target_path] = target_page
123
+ if File.readable?(link.path)
124
+ target_text = File.read(link.path)
125
+ doc = Nokogiri::HTML(target_text)
126
+ target_page.gather_link_targets(doc)
127
+ elsif RDocLinkChecker.checkable?(link.path)
128
+ link.exception = fetch(link.path, target_page)
129
+ link.valid_p = false if link.exception
130
+ else
131
+ # File not readable or checkable.
132
+ end
133
+ end
134
+ next if target_page.nil?
135
+ if link.has_fragment? && target_page.ids.empty?
136
+ doc || doc = Nokogiri::HTML(target_text)
137
+ target_page.gather_link_targets(doc)
138
+ end
139
+ end
140
+ end
141
+ end
142
+ @counts[:target_pages] = target_page_count
143
+ end
144
+
145
+ # Verify that each link target exists.
146
+ def verify_links
147
+ linking_pages = pages.select do |path, page|
148
+ !page.links.empty?
149
+ end
150
+ link_count = 0
151
+ broken_count = 0
152
+ linking_pages.each_pair do |path, page|
153
+ link_count += page.links.size
154
+ page.links.each_with_index do |link, i|
155
+ if link.valid_p.nil? # Don't disturb if already set to false.
156
+ target_page = pages[link.real_path]
157
+ if target_page
158
+ target_id = link.fragment
159
+ link.valid_p = target_id.nil? ||
160
+ target_page.ids.include?(target_id) ||
161
+ !target_page.content_type&.match('html')
162
+ else
163
+ link.valid_p = false
164
+ end
165
+ end
166
+ broken_count += 1 unless link.valid_p
167
+ end
168
+ end
169
+ @counts[:links_checked] = link_count
170
+ @counts[:links_broken] = broken_count
171
+ end
172
+
173
+ # Fetch the page from the web and gather its ids into the target page.
174
+ # Returns exception or nil.
175
+ def fetch(url, target_page)
176
+ code = 0
177
+ exception = nil
178
+ begin
179
+ response = Net::HTTP.get_response(URI(url))
180
+ code = response.code.to_i
181
+ target_page.code = code
182
+ target_page.content_type = response['Content-Type']
183
+ rescue => x
184
+ raise unless x.class.name.match(/^(Net|Socket|IO::TimeoutError|Errno::)/)
185
+ exception = RDocLinkChecker::HttpResponseError.new(url, x)
186
+ end
187
+ # Don't load if bad code, or no response, or if not html.
188
+ if !code_bad?(code)
189
+ if content_type_html?(response)
190
+ doc = Nokogiri::HTML(response.body)
191
+ target_page.gather_link_targets(doc)
192
+ end
193
+ end
194
+ exception
195
+ end
196
+
197
+ # Returns whether the code is bad (zero or >= 400).
198
+ def code_bad?(code)
199
+ return false if code.nil?
200
+ (code == 0) || (code >= 400)
201
+ end
202
+
203
+ # Returns whether the response body should be HTML.
204
+ def content_type_html?(response)
205
+ return false unless response
206
+ return false unless response['Content-Type']
207
+ response['Content-Type'].match('html')
208
+ end
209
+
210
+ # Returns whether the path is offsite.
211
+ def self.offsite?(path)
212
+ path.start_with?('http')
213
+ end
214
+
215
+ # Returns the string fragment for the given path or ULR, or +nil+
216
+ def self.get_fragment(s)
217
+ a = s.split('#', 2)
218
+ a.size == 2 ? a[1] : nil
219
+ end
220
+
221
+ # Returns whether the path is checkable.
222
+ def self.checkable?(path)
223
+ return false unless path
224
+ begin
225
+ uri = URI(path)
226
+ return ['http', 'https', nil].include?(uri.scheme)
227
+ rescue
228
+ return false
229
+ end
230
+ end
231
+
232
+ # Generate the report; +checker+ is the \RDocLinkChecker object.
233
+ def report
234
+
235
+ doc = Document.new('')
236
+ root = doc.add_element(Element.new('root'))
237
+
238
+ head = root.add_element(Element.new('head'))
239
+ title = head.add_element(Element.new('title'))
240
+ title.text = 'RDocLinkChecker Report'
241
+ style = head.add_element(Element.new('style'))
242
+ style.text = <<EOT
243
+ * { font-family: sans-serif }
244
+ .data { font-family: courier }
245
+ .center { text-align: center }
246
+ .good { color: rgb( 0, 97, 0); background-color: rgb(198, 239, 206) } /* Greenish */
247
+ .iffy { color: rgb(156, 101, 0); background-color: rgb(255, 235, 156) } /* Yellowish */
248
+ .bad { color: rgb(156, 0, 6); background-color: rgb(255, 199, 206) } /* Reddish */
249
+ .neutral { color: rgb( 0, 0, 0); background-color: rgb(217, 217, 214) } /* Grayish */
250
+ EOT
251
+
252
+ body = root.add_element(Element.new('body'))
253
+ h1 = body.add_element(Element.new('h1'))
254
+ h1.text = 'RDocLinkChecker Report'
255
+
256
+ add_summary(body)
257
+ add_broken_links(body)
258
+ add_offsite_links(body) unless onsite_only
259
+ report_file_path = 'Report.htm' # _Not_ .html.
260
+ doc.write(File.new(report_file_path, 'w'), 2)
261
+ end
262
+
263
+ def add_summary(body)
264
+ h2 = body.add_element(Element.new('h2'))
265
+ h2.text = 'Summary'
266
+
267
+ # Parameters table.
268
+ data = []
269
+ [
270
+ :html_dirpath,
271
+ :onsite_only,
272
+ :no_toc
273
+ ].each do |sym|
274
+ value = send(sym).inspect
275
+ row = {sym => :label, value => :good}
276
+ data.push(row)
277
+ end
278
+ table2(body, data, 'parameters', 'Parameters')
279
+ body.add_element(Element.new('p'))
280
+
281
+ # Times table.
282
+ elapsed_time = @counts[:end_time] - @counts[:start_time]
283
+ seconds = elapsed_time % 60
284
+ minutes = (elapsed_time / 60) % 60
285
+ hours = (elapsed_time/3600)
286
+ elapsed_time_s = "%2.2d:%2.2d:%2.2d" % [hours, minutes, seconds]
287
+ format = "%Y-%m-%d-%a-%H:%M:%SZ"
288
+ start_time_s = @counts[:start_time].strftime(format)
289
+ end_time_s = @counts[:end_time].strftime(format)
290
+ data = [
291
+ {'Start Time' => :label, start_time_s => :good},
292
+ {'End Time' => :label, end_time_s => :good},
293
+ {'Elapsed Time' => :label, elapsed_time_s => :good},
294
+ ]
295
+ table2(body, data, 'times', 'Times')
296
+ body.add_element(Element.new('p'))
297
+
298
+ # Counts.
299
+ data = [
300
+ {'Source Pages' => :label, @counts[:source_pages] => :good},
301
+ {'Target Pages' => :label, @counts[:target_pages] => :good},
302
+ {'Links Checked' => :label, @counts[:links_checked] => :good},
303
+ {'Links Broken' => :label, @counts[:links_broken] => :bad},
304
+ ]
305
+ table2(body, data, 'counts', 'Counts')
306
+ body.add_element(Element.new('p'))
307
+
308
+ end
309
+
310
+ def add_broken_links(body)
311
+ h2 = body.add_element(Element.new('h2'))
312
+ h2.text = 'Broken Links by Source Page'
313
+
314
+ if @counts[:links_broken] == 0
315
+ p = body.add_element('p')
316
+ p.text = 'None.'
317
+ return
318
+ end
319
+
320
+ # Legend.
321
+ ul = body.add_element(Element.new('ul'))
322
+ li = ul.add_element(Element.new('li'))
323
+ li.text = 'Href: the href of the anchor element.'
324
+ li = ul.add_element(Element.new('li'))
325
+ li.text = 'Text: the text of the anchor element.'
326
+ li = ul.add_element(Element.new('li'))
327
+ li.text = 'Path: the URL or path of the link (not including the fragment):'
328
+ ul2 = li.add_element(Element.new('ul'))
329
+ li2 = ul2.add_element(Element.new('li'))
330
+ li2.text = 'For an on-site link, an abbreviated path is given.'
331
+ li2 = ul2.add_element(Element.new('li'))
332
+ li2.text = <<EOT
333
+ For an off-site link, the full URL is given.
334
+ If the path is reddish, the page was not found.
335
+ EOT
336
+ li = ul.add_element(Element.new('li'))
337
+ li.text = <<EOT
338
+ Fragment: the fragment of the link.
339
+ If the fragment is reddish, fragment was not found.
340
+ EOT
341
+
342
+ pages.each_pair do |path, page|
343
+ broken_links = page.links.select {|link| !link.valid_p }
344
+ next if broken_links.empty?
345
+
346
+ page_div = body.add_element(Element.new('div'))
347
+ page_div.add_attribute('class', 'broken_page')
348
+ page_div.add_attribute('path', path)
349
+ page_div.add_attribute('count', broken_links.count)
350
+ h3 = page_div.add_element(Element.new('h3'))
351
+ a = Element.new('a')
352
+ a.text = "#{path} (#{broken_links.count})"
353
+ a.add_attribute('href', path)
354
+ h3.add_element(a)
355
+
356
+ broken_links.each do |link|
357
+ link_div = page_div.add_element(Element.new('div'))
358
+ link_div.add_attribute('class', 'broken_link')
359
+ data = []
360
+ # Text, URL, fragment
361
+ a = Element.new('a')
362
+ a.text = link.href
363
+ a.add_attribute('href', link.href)
364
+ data.push({'Href' => :label, a => :bad})
365
+ data.push({'Text' => :label, link.text => :good})
366
+ fragment_p = !link.fragment.nil?
367
+ class_ = fragment_p ? :good : :bad
368
+ data.push({'Path' => :label, link.real_path => class_})
369
+ class_ = fragment_p ? :bad : :good
370
+ data.push({'Fragment' => :label, link.fragment => class_})
371
+ if link.exception
372
+ data.push({'Exception' => :label, link.exception.class => :bad})
373
+ data.push({'Message' => :label, link.exception.message => :bad})
374
+ end
375
+ id = link.exception ? 'bad_url' : 'bad_fragment'
376
+ table2(link_div, data, id)
377
+ page_div.add_element(Element.new('p'))
378
+ end
379
+ end
380
+
381
+ end
382
+
383
# Adds the "Off-Site Links by Source Page" section to +body+.
#
# An <h2> is appended first and its text (which includes the total
# number of links listed) is filled in after all pages have been visited.
# For each page having at least one off-site link that is not a
# boilerplate link, an <h3> linking to the page is appended,
# followed by one two-row table (Href, Text) per link.
# If no page has such links, a paragraph 'None.' is appended.
def add_offsite_links(body)
  heading = body.add_element(Element.new('h2'))
  # Hrefs that are excluded from this section.
  ignored_hrefs = %w[
    https://validator.w3.org/check/referer
    https://ruby.github.io/rdoc/
    http://deveiate.org/projects/Darkfish-RDoc/
    http://deveiate.org
  ]
  total = 0
  pages.each_pair do |page_path, page|
    external_links = page.links.select do |link|
      next false unless RDocLinkChecker.offsite?(link.href)
      !ignored_hrefs.include?(link.href)
    end
    next if external_links.empty?

    total += external_links.size

    # Page heading, linked to the page itself.
    page_heading = body.add_element(Element.new('h3'))
    page_anchor = Element.new('a')
    page_anchor.text = "#{page_path} (#{external_links.size})"
    page_anchor.add_attribute('href', page_path)
    page_heading.add_element(page_anchor)

    external_links.each do |link|
      link_anchor = Element.new('a')
      link_anchor.text = link.href
      link_anchor.add_attribute('href', link.href)
      href_class = link.valid_p ? :good : :bad
      rows = [
        {'Href' => :label, link_anchor => href_class},
        {'Text' => :label, link.text => :good},
      ]
      table2(body, rows)
      body.add_element(Element.new('p'))
    end
  end
  heading.text = "Off-Site Links by Source Page (#{total})"
  if total.zero?
    para = body.add_element(Element.new('p'))
    para.text = 'None.'
  end
end
426
+
427
# Maps the symbolic cell kinds used in table row data
# to the value of the HTML class attribute given to the cell.
# Frozen so the shared mapping cannot be modified by accident.
Classes = {
  label: 'label center neutral',
  good: 'data center good',
  iffy: 'data center iffy',
  bad: 'data center bad',
}.freeze
433
+
434
# Appends a two-column <table> element to +parent+:
#
# - +parent+: REXML element that receives the table.
# - +data+: array of hashes, one per row, each of the form
#   <tt>{label => label_class, value => value_class}</tt>;
#   the classes are keys of Classes.
#   A value that is a REXML::Element is added as a child of its cell;
#   any other value becomes the cell's text.
# - +id+: if given, the value of the table's id attribute.
# - +title+: if given, a header row spanning both columns is added first;
#   the title may be a REXML::Element or text.
#
def table2(parent, data, id = nil, title = nil)
  table = parent.add_element(REXML::Element.new('table'))
  table.add_attribute('id', id) if id
  if title
    # The row element must be named exactly 'tr' to be valid HTML.
    header_row = table.add_element(REXML::Element.new('tr'))
    th = header_row.add_element(REXML::Element.new('th'))
    th.add_attribute('colspan', '2')
    if title.kind_of?(REXML::Element)
      th.add_element(title)
    else
      th.text = title
    end
  end
  data.each do |row_h|
    # A row hash flattens to [label, label_class, value, value_class].
    label, label_class, value, value_class = row_h.flatten
    tr = table.add_element(REXML::Element.new('tr'))
    label_td = tr.add_element(REXML::Element.new('td'))
    label_td.text = label
    label_td.add_attribute('class', Classes[label_class])
    value_td = tr.add_element(REXML::Element.new('td'))
    if value.kind_of?(REXML::Element)
      value_td.add_element(value)
    else
      value_td.text = value
    end
    value_td.add_attribute('class', Classes[value_class])
  end
end
463
+
464
# Base class for the errors defined here.
# Inherits from StandardError so that instances can be raised
# and rescued like any other Ruby error.
class Error < StandardError; end

# Records an exception that occurred while requesting a URL.
class HttpResponseError < Error

  # +url+ is the URL that was requested;
  # +x+ is the exception that was raised.
  attr_accessor :url, :x

  def initialize(url, x)
    super()
    self.url = url
    self.x = x
  end

  # Returns a multi-line description that includes the URL
  # and the class and message of the underlying exception.
  def message
    <<EOT
#{self.class.name}:
An exception was raised when checking page availability with Net::HTTP:
Url: #{url}
Class: #{x.class}
Message: #{x.message}
EOT
  end

end

# Records a response whose status code was not 200.
class HttpStatusCodeError < Error

  # +url+ is the URL that was requested;
  # +code+ is the status code that was returned.
  attr_accessor :url, :code

  def initialize(url, code)
    super()
    self.url = url
    self.code = code
  end

  # Returns a multi-line description that includes the URL and the code.
  def message
    <<EOT
#{self.class.name}:
The return code for the page was not 200:
Url: #{url}
Return code: #{code}
EOT
  end

end
506
+
507
# Class to represent a page.
class Page

  attr_accessor :path, :type, :pages, :counts, :code,
                :links, :ids, :dirname, :onsite_only, :content_type

  # Returns a new \Page object:
  #
  # - +type+: kind of page (e.g., +:source+).
  # - +path+: a path relative to the HTML directory (if on-site)
  #   or a URL (if off-site).
  # - +onsite_only+: whether off-site links are to be ignored
  #   when gathering links.
  # - +pages+: hash of path/page pairs.
  # - +counts+: hash of counts.
  #
  def initialize(type, path, onsite_only, pages: {}, counts: {})
    self.path = path
    self.type = type
    self.pages = pages
    self.counts = counts
    self.onsite_only = onsite_only
    self.code = nil
    self.links = []
    self.ids = []
    # A page in the top-level directory has an empty dirname.
    page_dir = File.dirname(path)
    self.dirname = page_dir == '.' ? '' : page_dir
  end

  # Returns a hash of selected attributes of the page.
  def to_h
    {
      path: path,
      type: type,
      dirname: dirname,
      code: code
    }
  end

  # Gather links for the page:
  #
  # - +doc+: Nokogiri document to be parsed for links.
  #
  def gather_links(doc)
    # The links are in the anchors.
    doc.search('a').each do |a|
      # Ignore pilcrow (paragraph character) and up-arrow.
      next if a.text == "\u00B6"
      next if a.text == "\u2191"

      href = a.attr('href')
      next if href.nil? || href.empty?
      next if RDocLinkChecker.offsite?(href) && onsite_only
      next unless RDocLinkChecker.checkable?(href)

      link = Link.new(href, a.text, dirname)
      # A link with no path part is not recorded.
      next if link.path.nil? || link.path.empty?

      links.push(link)
    end
  end

  # Gather link targets for the page.
  # +doc+ is the Nokogiri document to be parsed.
  def gather_link_targets(doc)
    # Don't do twice (some pages are both source and target).
    return unless ids.empty?

    # For off-site, gather all ids, regardless of element.
    if RDocLinkChecker.offsite?(path)
      doc.xpath("//*[@id]").each do |element|
        ids.push(element.attr('id'))
      end
      doc.xpath("//*[@name]").each do |element|
        ids.push(element.attr('name'))
      end
      # A same-page href ('#fragment') is also accepted as a target.
      doc.xpath("//a[@href]").each do |element|
        href = element.attr('href')
        next unless href.start_with?('#')
        ids.push(href.sub('#', ''))
      end
      return
    end

    # We're on-site, which means that the page is RDoc-generated
    # and we know what to expect.
    # In theory, an author can link to any element that has an attribute :id.
    # In practice, gathering all such elements is very time-consuming.
    # These are the elements currently linked to:
    #
    # - body
    # - a
    # - div
    # - dt
    # - h*
    #
    # We can add more as needed (i.e., if/when we have actual broken links).

    body = doc.at('//body')
    # Nothing to gather if the document has no body element.
    return if body.nil?

    # body element has 'top', which is a link target.
    # Use attr (the attribute's string value), so that ids holds strings.
    body_id = body.attr('id')
    ids.push(body_id) if body_id

    # Some ids are in the as (anchors).
    body.search('a').each do |a|
      id = a.attr('id')
      ids.push(id) if id
    end

    # Method ids are in divs, but gather only method-detail divs.
    body.search('div').each do |div|
      class_ = div.attr('class')
      next if class_.nil?
      next unless class_.include?('method-')
      id = div.attr('id')
      ids.push(id) if id
    end

    # Constant ids are in dts.
    body.search('dt').each do |dt|
      id = dt.attr('id')
      ids.push(id) if id
    end

    # Label ids are in headings.
    %w[h1 h2 h3 h4 h5 h6].each do |tag|
      body.search(tag).each do |h|
        id = h.attr('id')
        ids.push(id) if id
      end
    end
  end

end
642
+
643
# Class to represent a link.
class Link

  attr_accessor :href, :text, :dirname, :path, :fragment, :valid_p, :real_path, :exception

  # Returns a new \Link object:
  #
  # - +href+: attribute href from anchor element.
  # - +text+: attribute text from anchor element.
  # - +dirname+: directory path of the linking page.
  #
  # The href is split at its first '#' into +path+ and +fragment+;
  # +fragment+ is +nil+ if the href has no '#'.
  #
  def initialize(href, text, dirname)
    self.href = href
    self.text = text
    self.dirname = dirname
    path, fragment = href.split('#', 2)
    self.path = path
    self.fragment = fragment
    self.valid_p = nil
    self.real_path = make_real_path(dirname, path)
    self.exception = nil
  end

  # Returns a hash containing the href and text of the link.
  def to_h
    {
      href: href,
      text: text,
    }
  end

  # Return the real (not relative) path of the link:
  #
  # - +dirname+: directory path of the linking page
  #   (+nil+ or empty for a page in the top-level directory).
  # - +path+: path part of the href.
  #
  # A URL (a path that begins with a scheme and '://') is returned unchanged.
  # A leading './' is removed, and each leading '../' removes
  # one trailing directory from +dirname+.
  def make_real_path(dirname, path)
    return path if path.nil?
    # A URL is not relative to the linking page.
    return path if path.match?(%r{\A[a-z][a-z0-9+.-]*://}i)

    # Trim single dot.
    path = path.sub('./', '') if path.start_with?('./')
    return path if dirname.nil? || dirname.empty?

    dirs = dirname.split('/')
    # May have one or more leading '../'.
    # The dots must be literal; only the leading run is counted.
    up_dirs = path[%r{\A(?:\.\./)+}]
    if up_dirs.nil?
      return dirs.empty? ? path : File.join(dirname, path)
    end

    levels = up_dirs.length / 3
    remainder = path[up_dirs.length..-1]
    # Remove the corresponding parts of dirname.
    dirs.pop(levels)
    return remainder if dirs.empty?
    File.join(dirs.join('/'), remainder)
  end

  # Returns whether the link has a fragment.
  def has_fragment?
    fragment ? true : false
  end

  # Puts link info onto $stdout.
  def puts(i)
    $stdout.puts <<EOT
Link #{i}:
Href: #{href}
Text: #{text}
Path: #{path}
Fragment: #{fragment}
Valid: #{valid_p}
Real path: #{real_path}
Dirname: #{dirname}
EOT
  end
end
715
+
716
+ end