rolfedh-doc-utils 0.1.37__py3-none-any.whl → 0.1.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1083 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Published Documentation Link Checker
4
+
5
+ Uses linkchecker to validate links on published HTML documentation pages with special handling for:
6
+ - Misresolved image paths (automatically corrected and verified via configurable URL rewriting)
7
+ - Known false positives (host:port placeholders, Maven Central 403)
8
+ - Timeout detection and reporting
9
+ - Custom ignore patterns via CLI or configuration file
10
+
11
+ Supports both single URL and bulk validation modes.
12
+
13
+ Usage:
14
+ # Single URL
15
+ ./check_published_links.py <URL> [--timeout SECONDS]
16
+
17
+ # Bulk validation from file
18
+ ./check_published_links.py --file <URL-LIST-FILE> [--timeout SECONDS]
19
+
20
+ # With URL rewriting for misresolved paths
21
+ ./check_published_links.py <URL> --rewrite-pattern "/docs/en/product/" --rewrite-replacement "/docs/en/PRODUCT_CODE_1.0/"
22
+
23
+ # With custom ignore patterns
24
+ ./check_published_links.py <URL> --ignore-pattern "^https?://internal\\.example\\.com"
25
+
26
+ # Using a configuration file
27
+ ./check_published_links.py <URL> --config linkcheck.conf
28
+
29
+ Examples:
30
+ # Single URL
31
+ ./check_published_links.py https://docs.example.com/guide/index.html
32
+ ./check_published_links.py https://docs.example.com/guide/index.html --timeout 90
33
+
34
+ # Bulk validation
35
+ ./check_published_links.py --file urls-to-check.txt
36
+ ./check_published_links.py --file urls-to-check.txt --timeout 90
37
+
38
+ # With URL rewriting for documentation platforms that misresolve relative paths
39
+ ./check_published_links.py https://docs.example.com/product/guide \\
40
+ --rewrite-pattern "/docs/en/product/" \\
41
+ --rewrite-replacement "/docs/en/PRODUCT_V1.0/"
42
+
43
+ # With custom ignore patterns
44
+ ./check_published_links.py https://docs.example.com/guide/ \\
45
+ --ignore-pattern "^https?://internal\\.example\\.com" \\
46
+ --ignore-pattern "^https?://staging\\."
47
+
48
+ Configuration File:
49
+ Create a file (default: .check-published-links.conf) with options:
50
+
51
+ # General settings
52
+ [settings]
53
+ timeout = 30
54
+ reports-dir = ./build/link-reports/
55
+
56
+ # Ignore patterns (one regex per line)
57
+ [ignore-patterns]
58
+ ^https?://internal\\.example\\.com
59
+ ^https?://staging\\.
60
+ ^https?://private-api\\.
61
+
62
+ # Rewrite rules (pattern = replacement)
63
+ [rewrite-rules]
64
+ /docs/en/product/ = /docs/en/PRODUCT_V1.0/
65
+ /docs/en/product/images/ = /docs/en/PRODUCT_V1.0/images/
66
+ """
67
+
68
+ import subprocess
69
+ import sys
70
+ import re
71
+ import argparse
72
+ from datetime import datetime
73
+ from pathlib import Path
74
+ from dataclasses import dataclass, field
75
+ import urllib.request
76
+ import urllib.error
77
+
78
# Configuration
# Default timeout in seconds (presumably applied when neither the CLI nor the
# config file supplies one -- confirm against the argument parser).
DEFAULT_TIMEOUT = 15
# Fallback output directory for report files when no reports dir is given.
REPORTS_DIR = Path("reports")
# Default configuration file name, as documented in the module docstring.
DEFAULT_CONFIG_FILE = Path(".check-published-links.conf")
82
+
83
# ANSI colors
class Colors:
    """ANSI escape sequences used to colorize terminal output."""
    RED = '\033[0;31m'
    GREEN = '\033[0;32m'
    YELLOW = '\033[1;33m'
    BLUE = '\033[0;34m'
    CYAN = '\033[0;36m'
    NC = '\033[0m' # No Color (resets attributes after a colored span)
91
+
92
+
93
# Default ignore URL patterns for linkchecker
# Each entry is a regex handed to linkchecker as one --ignore-url option.
DEFAULT_IGNORE_PATTERNS = [
    # localhost, with optional port and path
    r"^https?://localhost(:[0-9]+)?(/.*)?$",
    # IPv4 loopback, with optional port and path
    r"^https?://127\.0\.0\.1(:[0-9]+)?(/.*)?$",
    # example.com / example.org, optionally with one subdomain label
    r"^https?://([a-zA-Z0-9-]+\.)?example\.(com|org)(/.*)?$",
    # application.com, optionally with one subdomain label
    r"^https?://([a-zA-Z0-9-]+\.)?application\.com(/.*)?$",
    # literal "host:port" placeholder URLs
    r"^https?://host:port",
    # anything containing a comma (comma-separated URL lists in examples)
    r".*,.*",
    # anything containing a percent-encoded comma
    r".*%2C.*",
]
103
+
104
+
105
+ # =============================================================================
106
+ # Configuration file parsing
107
+ # =============================================================================
108
+
109
@dataclass
class Config:
    """Configuration loaded from file."""
    # None means the setting was absent (or invalid) in the file.
    timeout: int | None = None
    reports_dir: Path | None = None
    # One regex string per non-comment line of the [ignore-patterns] section.
    ignore_patterns: list[str] = field(default_factory=list)
    # (pattern, replacement) tuples from the [rewrite-rules] section.
    rewrite_rules: list[tuple[str, str]] = field(default_factory=list)


def load_config_file(config_path: Path) -> Config:
    """
    Load configuration from file.

    The file is a simple sectioned format: ``[settings]`` holds
    ``key = value`` pairs (``timeout``, ``reports-dir``), ``[ignore-patterns]``
    holds one regex per line, and ``[rewrite-rules]`` holds
    ``pattern = replacement`` lines. Blank lines and lines starting with ``#``
    are skipped. Section names and setting keys are case-insensitive.

    Args:
        config_path: Path of the configuration file. A missing file is not an
            error; an empty Config is returned.

    Returns:
        Config object with all settings
    """
    config = Config()

    if not config_path.exists():
        return config

    current_section = None

    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()

            # Skip empty lines and comments
            if not line or line.startswith('#'):
                continue

            # Check for section headers
            if line.startswith('[') and line.endswith(']'):
                current_section = line[1:-1].lower()
                continue

            # Parse based on current section
            if current_section == 'settings':
                if '=' not in line:
                    continue
                key, value = line.split('=', 1)
                key = key.strip().lower()
                value = value.strip()
                if key == 'timeout':
                    try:
                        config.timeout = int(value)
                    except ValueError:
                        # Keep going with the default, but tell the user the
                        # setting was dropped instead of ignoring it silently.
                        print(
                            f"Warning: ignoring invalid timeout value {value!r} in {config_path}",
                            file=sys.stderr,
                        )
                elif key == 'reports-dir':
                    config.reports_dir = Path(value)
            elif current_section == 'ignore-patterns':
                config.ignore_patterns.append(line)
            elif current_section == 'rewrite-rules':
                # Only the first '=' separates pattern from replacement.
                if '=' in line:
                    pattern, replacement = line.split('=', 1)
                    config.rewrite_rules.append((pattern.strip(), replacement.strip()))

    return config
166
+
167
+
168
@dataclass
class RewriteRule:
    """Plain-substring rewrite used to correct a misresolved URL path."""
    pattern: str
    replacement: str

    def matches(self, url: str) -> bool:
        """Return True when the rule's pattern occurs anywhere in ``url``."""
        needle = self.pattern
        return needle in url

    def apply(self, url: str) -> str:
        """Return ``url`` with every occurrence of the pattern replaced."""
        old, new = self.pattern, self.replacement
        return url.replace(old, new)
181
+
182
+
183
@dataclass
class LinkError:
    """Represents a single link error from linkchecker."""
    # Each field holds the text that follows the same-named label in a
    # linkchecker output entry ("URL", "Name", "Parent URL", "Real URL",
    # "Check time", "Result"); empty string when the label was absent.
    url: str = ""
    name: str = ""
    parent_url: str = ""
    real_url: str = ""
    check_time: str = ""
    result: str = ""
192
+
193
+
194
@dataclass
class CheckResult:
    """Results from link checking a single URL."""
    url: str = ""
    guide_name: str = ""
    # Counts parsed from linkchecker's output text.
    total_errors: int = 0
    total_links: int = 0
    # Occurrences of the "host:port ... invalid port" message.
    known_issues: int = 0
    # Occurrences of "Timeout" / "ReadTimeout" in the output.
    timeout_errors: int = 0
    # Mentions of search.maven.org when a 403 Forbidden is also present.
    maven_403: int = 0
    # Errors whose rewritten URL was / was not reachable.
    rewritten_valid: int = 0
    rewritten_not_found: int = 0
    # LinkError entries parsed from the output.
    errors: list = field(default_factory=list)
    passed: bool = True
    # Combined stdout + stderr of the linkchecker run.
    raw_output: str = ""
209
+
210
+
211
@dataclass
class BulkResult:
    """Results from bulk link checking."""
    total_guides: int = 0
    passed_count: int = 0
    failed_count: int = 0
    # Presumably sums of the matching per-guide CheckResult counters --
    # confirm against run_bulk_mode, which fills these in.
    total_known_issues: int = 0
    total_timeout_errors: int = 0
    total_rewritten_valid: int = 0
    total_rewritten_not_found: int = 0
    # Per-guide results; the bulk report reads CheckResult attributes from each.
    guide_results: list = field(default_factory=list)
222
+
223
+
224
+ # =============================================================================
225
+ # Output helpers
226
+ # =============================================================================
227
+
228
def info(msg: str):
    """Print ``msg`` to stdout behind a blue info marker."""
    marker = f"{Colors.BLUE}ℹ{Colors.NC}"
    print(f"{marker} {msg}")
230
+
231
+
232
def success(msg: str):
    """Print ``msg`` to stdout behind a green check mark."""
    marker = f"{Colors.GREEN}✓{Colors.NC}"
    print(f"{marker} {msg}")
234
+
235
+
236
def error(msg: str):
    """Print ``msg`` to stderr behind a red cross."""
    marker = f"{Colors.RED}✗{Colors.NC}"
    print(f"{marker} {msg}", file=sys.stderr)
238
+
239
+
240
def warning(msg: str):
    """Print ``msg`` to stdout behind a yellow warning sign."""
    marker = f"{Colors.YELLOW}⚠{Colors.NC}"
    print(f"{marker} {msg}")
242
+
243
+
244
+ # =============================================================================
245
+ # Core link checking functions
246
+ # =============================================================================
247
+
248
def check_linkchecker_installed() -> bool:
    """Return True when ``linkchecker --version`` runs and exits successfully."""
    probe = ["linkchecker", "--version"]
    try:
        subprocess.run(probe, capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Non-zero exit, or the executable is not on PATH.
        return False
    else:
        return True
255
+
256
+
257
def is_rewritable_path(url: str, rewrite_rules: list[RewriteRule]) -> bool:
    """Return True as soon as one rewrite rule matches ``url``."""
    for candidate in rewrite_rules:
        if candidate.matches(url):
            return True
    return False
260
+
261
+
262
def get_rewritten_url(url: str, rewrite_rules: list[RewriteRule]) -> str:
    """Rewrite ``url`` with the first rule that matches it.

    Returns ``url`` unchanged when no rule matches.
    """
    first_hit = next((rule for rule in rewrite_rules if rule.matches(url)), None)
    if first_hit is None:
        return url
    return first_hit.apply(url)
268
+
269
+
270
def check_rewritten_url(wrong_url: str, rewrite_rules: list[RewriteRule]) -> tuple[bool, str]:
    """Verify resource exists at corrected path.

    Rewrites ``wrong_url`` with the first matching rule and issues a HEAD
    request (10 second timeout) against the result.

    Args:
        wrong_url: URL as reported by linkchecker.
        rewrite_rules: Rules used to compute the corrected URL.

    Returns:
        ``(exists, correct_url)``. ``exists`` is True for a 200/301/302
        response, False for any other status or any failure to connect.
    """
    correct_url = get_rewritten_url(wrong_url, rewrite_rules)
    try:
        req = urllib.request.Request(correct_url, method='HEAD')
        req.add_header('User-Agent', 'Mozilla/5.0 (compatible; LinkChecker)')
        with urllib.request.urlopen(req, timeout=10) as response:
            return response.status in (200, 302, 301), correct_url
    except urllib.error.HTTPError as e:
        # Must come before the OSError clause: HTTPError derives from URLError,
        # which derives from OSError.
        return e.code in (302, 301), correct_url
    except (OSError, ValueError):
        # OSError covers URLError, timeouts and connection resets; ValueError
        # is raised for a malformed URL (e.g. missing scheme). One unreachable
        # or invalid link must not abort the whole check run.
        return False, correct_url
284
+
285
+
286
def run_linkchecker(url: str, timeout: int, ignore_patterns: list[str] = None) -> tuple[int, str]:
    """Invoke linkchecker on ``url``.

    Returns a ``(exit_code, output)`` pair where ``output`` is stdout followed
    by stderr. When ``ignore_patterns`` is None the module defaults are used.
    """
    patterns = DEFAULT_IGNORE_PATTERNS if ignore_patterns is None else ignore_patterns

    command = [
        "linkchecker",
        "--check-extern",
        "--no-follow-url=.*",
        "--no-warnings",
        f"--timeout={timeout}",
        # one --ignore-url option per regex
        *(f"--ignore-url={regex}" for regex in patterns),
        url,
    ]

    completed = subprocess.run(command, capture_output=True, text=True)
    combined_output = completed.stdout + completed.stderr
    return completed.returncode, combined_output
306
+
307
+
308
def extract_guide_name(url: str) -> str:
    """Extract a readable guide name from URL.

    Tries, in order: ``/html-single/<name>/``, ``?topic=<name>``,
    ``/<name>/index.html`` or ``/<name>.html``, and finally the last non-empty
    path segment. Falls back to the URL itself only when no segment exists.
    """
    # Try common documentation URL patterns
    # Pattern: /html-single/guide-name/
    match = re.search(r'/html-single/([^/]+)/', url)
    if match:
        return match.group(1).replace('_', ' ')

    # Pattern: ?topic=guide-name
    match = re.search(r'\?topic=(.+)$', url)
    if match:
        return match.group(1)

    # Pattern: /guide-name/index.html or /guide-name.html
    match = re.search(r'/([^/]+?)(?:/index)?\.html?$', url)
    if match:
        return match.group(1).replace('-', ' ').replace('_', ' ')

    # Fallback: strip trailing slashes first so ".../guide/" yields "guide"
    # rather than an empty last segment (which made the whole URL the name).
    return url.rstrip('/').split('/')[-1] or url
328
+
329
+
330
def parse_linkchecker_output(output: str) -> CheckResult:
    """Turn raw linkchecker text into a CheckResult.

    Fills the summary counters from the whole text, then walks the lines to
    build one LinkError per "URL ..." entry.
    """
    result = CheckResult(raw_output=output)

    # Summary counters (first "<n> error" / "<n> link" occurrence wins)
    errors_found = re.search(r'(\d+)\s+error', output)
    if errors_found is not None:
        result.total_errors = int(errors_found.group(1))

    links_found = re.search(r'(\d+)\s+link', output)
    if links_found is not None:
        result.total_links = int(links_found.group(1))

    # Known false positives and timeouts
    result.known_issues = output.count("URL host 'host:port' has invalid port")
    result.timeout_errors = len(re.findall(r'ReadTimeout|Timeout', output))

    # Maven Central 403
    if 'search.maven.org' in output and '403 Forbidden' in output:
        result.maven_403 = output.count('search.maven.org')

    # Labels whose value is simply the stripped remainder of the line.
    plain_labels = (
        ('Parent URL', 'parent_url'),
        ('Real URL', 'real_url'),
        ('Check time', 'check_time'),
        ('Result', 'result'),
    )

    entry = LinkError()
    for line in output.split('\n'):
        if line.startswith('URL ') and not line.startswith('URL lengths'):
            # A new entry begins; flush the previous one if it had a URL.
            if entry.url:
                result.errors.append(entry)
                entry = LinkError()
            entry.url = line[4:].strip().strip('`\'')
            continue

        if line.startswith('Name '):
            entry.name = line[5:].strip().strip('`\'')
            continue

        for label, attribute in plain_labels:
            if line.startswith(label):
                setattr(entry, attribute, line[len(label):].strip())
                break

    # Flush the trailing entry
    if entry.url:
        result.errors.append(entry)

    return result
377
+
378
+
379
def verify_rewritten_paths(result: CheckResult, rewrite_rules: list[RewriteRule]) -> CheckResult:
    """Check misresolved paths at corrected paths using rewrite rules.

    For every error whose real URL matches a rewrite rule, the rewritten URL
    is probed. Reachable ones increment ``rewritten_valid`` and are removed
    from ``result.errors``; unreachable ones increment ``rewritten_not_found``.

    Args:
        result: Parsed check result; mutated in place.
        rewrite_rules: Rules to apply. An empty list makes this a no-op.

    Returns:
        The same ``result`` object.
    """
    if not rewrite_rules:
        return result

    # real_url -> reachable at the corrected location. Cached so a resource
    # referenced by many errors costs one request and gets one verdict.
    probed: dict[str, bool] = {}

    for err in result.errors:
        real_url = err.real_url
        if not is_rewritable_path(real_url, rewrite_rules):
            continue
        if real_url not in probed:
            exists, _ = check_rewritten_url(real_url, rewrite_rules)
            probed[real_url] = exists
        if probed[real_url]:
            result.rewritten_valid += 1
        else:
            result.rewritten_not_found += 1

    # Filter out verified paths from errors list
    result.errors = [e for e in result.errors if not probed.get(e.real_url, False)]

    return result
400
+
401
+
402
def check_single_url(url: str, timeout: int, rewrite_rules: list[RewriteRule] = None,
                     ignore_patterns: list[str] = None) -> CheckResult:
    """Check a single URL and return results.

    Args:
        url: Page to check.
        timeout: Per-link timeout in seconds passed to linkchecker.
        rewrite_rules: Optional rules for verifying misresolved paths.
        ignore_patterns: Optional ignore regexes; None uses the defaults.

    Returns:
        A CheckResult. ``passed`` is True when linkchecker exits 0, or when
        every reported error is a verified rewrite or a known host:port issue.
    """
    rewrite_rules = rewrite_rules or []
    result = CheckResult(url=url, guide_name=extract_guide_name(url))

    exit_code, output = run_linkchecker(url, timeout, ignore_patterns)
    result.raw_output = output

    if exit_code == 0:
        # Success
        link_match = re.search(r'(\d+)\s+link', output)
        result.total_links = int(link_match.group(1)) if link_match else 0
        result.passed = True
        return result

    # Parse errors
    parsed = parse_linkchecker_output(output)
    result.total_errors = parsed.total_errors
    result.total_links = parsed.total_links
    result.known_issues = parsed.known_issues
    result.timeout_errors = parsed.timeout_errors
    result.maven_403 = parsed.maven_403
    result.errors = parsed.errors

    # A non-zero exit with no parsable error count (e.g. linkchecker itself
    # failed) must not be reported as a pass: with total_errors == 0 the
    # arithmetic below would yield real_errors <= 0.
    if result.total_errors == 0:
        if not result.errors:
            result.errors.append(LinkError(
                url=url,
                result=f"linkchecker exited with code {exit_code} without reporting any link errors",
            ))
        result.total_errors = len(result.errors)

    # Verify rewritten paths
    result = verify_rewritten_paths(result, rewrite_rules)

    # Determine if passed (all errors were false positives)
    adjusted_errors = result.total_errors - result.rewritten_valid
    real_errors = adjusted_errors - result.known_issues
    result.passed = real_errors <= 0

    return result
436
+
437
+
438
+ # =============================================================================
439
+ # Helper functions for bulk mode
440
+ # =============================================================================
441
+
442
def load_urls(url_list_file: Path) -> list[str]:
    """Load URLs from file, skipping comments and empty lines.

    Args:
        url_list_file: Text file with one URL per line. Lines starting with
            ``#`` (after stripping whitespace) are comments.

    Returns:
        The stripped, non-empty, non-comment lines in file order.
    """
    # utf-8-sig reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise be glued onto the first URL or hide a leading
    # comment marker.
    with open(url_list_file, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line and not line.startswith('#')]
451
+
452
+
453
+ # =============================================================================
454
+ # Report generation
455
+ # =============================================================================
456
+
457
def generate_single_report(url: str, guide_name: str, timeout: int, result: CheckResult,
                           rewrite_rules: list[RewriteRule] = None) -> str:
    """Generate report for single URL check.

    Args:
        url: The checked URL.
        guide_name: Display name of the guide.
        timeout: Timeout in seconds that was used for the run.
        result: Outcome of the check.
        rewrite_rules: Rules in effect; only decides whether the known-issues
            section mentions rewritten paths.

    Returns:
        The full report text.
    """
    # datetime.now() is naive, so %Z would render as an empty string;
    # astimezone() attaches the local zone so the zone name is printed.
    timestamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %Z")

    # Calculate adjusted error count
    adjusted_errors = result.total_errors - result.rewritten_valid
    real_errors = adjusted_errors - result.known_issues

    if real_errors <= 0:
        status = "PASSED (all errors were false positives)"
        status_icon = "✓"
    else:
        status = "FAILED"
        status_icon = "✗"

    report = f"""================================================================================
Documentation Link Check Report
================================================================================
Date: {timestamp}
URL: {url}
Guide: {guide_name}
Timeout: {timeout} seconds

================================================================================
SUMMARY
================================================================================

{status_icon} {guide_name} ({adjusted_errors} errors"""

    if result.rewritten_valid > 0:
        report += f", {result.rewritten_valid} rewritten paths OK"
    if result.known_issues > 0:
        report += f", {result.known_issues} known"
    if result.timeout_errors > 0:
        report += f", {result.timeout_errors} TIMEOUT"
    report += ")\n"

    report += f"""
================================================================================
STATISTICS
================================================================================
Total Errors Reported: {result.total_errors}
Rewritten Paths Verified OK: {result.rewritten_valid}
Rewritten Paths NOT Found: {result.rewritten_not_found}
Adjusted Error Count: {adjusted_errors}
Known Issues Found: {result.known_issues} (safe to ignore)
Timeout Errors: {result.timeout_errors}
Status: {status}
"""

    report += _generate_known_issues_section(rewrite_rules)

    if result.timeout_errors > 0:
        report += f"""
================================================================================
*** TIMEOUT LIMIT REACHED ***
================================================================================

{result.timeout_errors} links exceeded the timeout limit of {timeout} seconds.

This indicates slow server responses or network issues, not broken links.

RECOMMENDED ACTION:
Re-run with increased timeout: --timeout {timeout + 30}

Timeout errors should be investigated separately from broken links.
"""

    # Add detailed error information for whatever remains in result.errors
    if result.errors:
        report += f"""
================================================================================
DETAILED ERROR INFORMATION
================================================================================

The following section provides detailed error information.
This allows you to trace specific failures to exact URLs and error messages.

════════════════════════════════════════════════════════════════
FAILED GUIDE: {guide_name}
════════════════════════════════════════════════════════════════
URL: {url}
Total Errors: {len(result.errors)}
"""
        if result.rewritten_valid > 0:
            report += f"Rewritten Paths Verified OK: {result.rewritten_valid}\n"
        if result.rewritten_not_found > 0:
            report += f"Rewritten Paths NOT Found: {result.rewritten_not_found}\n"

        report += _generate_error_details(result.errors)

    return report
550
+
551
+
552
def generate_bulk_report(url_list_file: Path, timeout: int, bulk_result: BulkResult,
                         rewrite_rules: list[RewriteRule] = None) -> str:
    """Generate report for bulk URL check.

    Args:
        url_list_file: File the URLs were read from (shown in the report).
        timeout: Timeout in seconds that was used for the run.
        bulk_result: Aggregated outcome, including per-guide results.
        rewrite_rules: Rules in effect; only decides whether the known-issues
            section mentions rewritten paths.

    Returns:
        The full report text.
    """
    # datetime.now() is naive, so %Z would render as an empty string;
    # astimezone() attaches the local zone so the zone name is printed.
    timestamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %Z")

    report = f"""================================================================================
Documentation Link Check Report
================================================================================
Date: {timestamp}
Total Guides Checked: {bulk_result.total_guides}
URL List: {url_list_file}
Timeout: {timeout} seconds

================================================================================
SUMMARY
================================================================================

"""

    # Add summary line for each guide
    for guide in bulk_result.guide_results:
        if guide.passed:
            line = f"✓ {guide.guide_name}"
            if guide.total_links:
                line += f" ({guide.total_links} links)"
            if guide.rewritten_valid > 0:
                line += f" [{guide.rewritten_valid} rewritten OK]"
        else:
            adjusted = guide.total_errors - guide.rewritten_valid
            line = f"✗ {guide.guide_name} ({adjusted} errors"
            if guide.rewritten_valid > 0:
                line += f", {guide.rewritten_valid} rewritten OK"
            if guide.known_issues > 0:
                line += f", {guide.known_issues} known"
            if guide.timeout_errors > 0:
                line += f", {guide.timeout_errors} TIMEOUT"
            line += ")"
        report += line + "\n"

    # Statistics (guard against division by zero when nothing was checked)
    success_rate = (bulk_result.passed_count / bulk_result.total_guides * 100) if bulk_result.total_guides > 0 else 0

    report += f"""
================================================================================
STATISTICS
================================================================================
Total Guides Checked: {bulk_result.total_guides}
Passed: {bulk_result.passed_count}
Failed: {bulk_result.failed_count}
Rewritten Paths Verified OK: {bulk_result.total_rewritten_valid}
Rewritten Paths NOT Found: {bulk_result.total_rewritten_not_found}
Known Issues Found: {bulk_result.total_known_issues} (safe to ignore)
Timeout Errors: {bulk_result.total_timeout_errors}
Success Rate: {success_rate:.1f}%
"""

    report += _generate_known_issues_section(rewrite_rules)

    # Timeout warning if any
    if bulk_result.total_timeout_errors > 0:
        report += f"""
================================================================================
*** TIMEOUT LIMIT REACHED ***
================================================================================

{bulk_result.total_timeout_errors} links exceeded the timeout limit of {timeout} seconds.

This indicates slow server responses or network issues, not broken links.

RECOMMENDED ACTION:
Re-run with increased timeout: --file {url_list_file} --timeout {timeout + 30}

Timeout errors should be investigated separately from broken links.
"""

    # Detailed errors for failed guides
    failed_guides = [g for g in bulk_result.guide_results if not g.passed]
    if failed_guides:
        report += """
================================================================================
DETAILED ERROR INFORMATION BY GUIDE
================================================================================

The following section provides detailed error information for each failed guide.
This allows you to trace specific failures to exact URLs and error messages.
"""

        for guide in failed_guides:
            report += f"""
════════════════════════════════════════════════════════════════
FAILED GUIDE: {guide.guide_name}
════════════════════════════════════════════════════════════════
URL: {guide.url}
Total Errors: {len(guide.errors)}
"""
            if guide.rewritten_valid > 0:
                report += f"Rewritten Paths Verified OK: {guide.rewritten_valid}\n"
            if guide.rewritten_not_found > 0:
                report += f"Rewritten Paths NOT Found: {guide.rewritten_not_found}\n"
            if guide.known_issues > 0:
                report += f"Known Issues: {guide.known_issues} (host:port errors - safe to ignore)\n"
            if guide.timeout_errors > 0:
                report += f"\n*** TIMEOUT LIMIT REACHED: {guide.timeout_errors} links ***\n"
                report += f"Consider increasing timeout: --file {url_list_file} --timeout {timeout + 30}\n"

            report += _generate_error_details(guide.errors)

    return report
660
+
661
+
662
def _generate_known_issues_section(rewrite_rules: list[RewriteRule] = None) -> str:
    """Build the "KNOWN ISSUES" report section.

    Item 4 (misresolved paths) is included only when rewrite rules are given.
    """
    intro = """
================================================================================
KNOWN ISSUES (Safe to Ignore)
================================================================================

The following errors are expected due to LinkChecker limitations:

1. "URL host 'host:port' has invalid port"
- URLs like https://host:port/auth or https://host:port/realms/{realm}
- These are documentation placeholders using literal "port" text
- LinkChecker cannot skip syntax-invalid URLs
- Safe to ignore - not real broken links

2. Comma-separated URL lists
- URLs like http://www.example.com,http://localhost:3000
- These are examples showing configuration format
- Already filtered by ignore patterns
- Should not appear in error logs

3. Maven Central 403 Forbidden errors
- URLs like https://search.maven.org/artifact/...
- Maven Central blocks automated bots/scrapers with 403 Forbidden
- These links work fine for humans in a web browser
- Verify manually if needed - not broken documentation
"""

    rewritten_paths_item = """
4. Misresolved image/resource paths (AUTOMATICALLY VERIFIED)
- Some documentation platforms use URL routing that causes LinkChecker
to resolve relative paths against an incorrect base URL
- This script automatically verifies resources at the corrected path
- If verified OK, the error is NOT counted; if not found, it IS a real error
"""

    closing = """
If you see these errors, they do NOT indicate broken documentation.
All other errors should be investigated and fixed.
"""

    pieces = [intro]
    if rewrite_rules:
        pieces.append(rewritten_paths_item)
    pieces.append(closing)
    return "".join(pieces)
704
+
705
+
706
def _generate_error_details(errors: list) -> str:
    """Render the "Error Details" listing, one blank-line-separated entry per error.

    The URL line is always written; the other fields only when non-empty.
    """
    header = """
Error Details:
────────────────────────────────────────────────────────────────
"""
    optional_fields = (
        ("Name", "name"),
        ("Parent URL", "parent_url"),
        ("Real URL", "real_url"),
        ("Check time", "check_time"),
        ("Result", "result"),
    )

    chunks = [header]
    for item in errors:
        chunks.append(f"URL {item.url}\n")
        for label, attribute in optional_fields:
            value = getattr(item, attribute)
            if value:
                chunks.append(f"{label} {value}\n")
        chunks.append("\n")
    return "".join(chunks)
726
+
727
+
728
+ # =============================================================================
729
+ # Main entry points
730
+ # =============================================================================
731
+
732
def run_single_mode(url: str, timeout: int, rewrite_rules: list[RewriteRule] = None,
                    ignore_patterns: list[str] = None, reports_dir: Path = None):
    """Run link checker for a single URL.

    Prints progress and a summary to the terminal and writes a timestamped
    report file into ``reports_dir``.

    Args:
        url: Page to check.
        timeout: Per-link timeout in seconds.
        rewrite_rules: Optional rules for verifying misresolved paths.
        ignore_patterns: Optional ignore regexes passed through to the check.
        reports_dir: Directory for the report; falls back to REPORTS_DIR.

    Returns:
        0 when no real errors remain, 1 otherwise.
    """
    rewrite_rules = rewrite_rules or []
    reports_dir = reports_dir or REPORTS_DIR

    # Setup
    # parents=True so nested locations such as ./build/link-reports/ work
    # even when the parent directory does not exist yet.
    reports_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    report_file = reports_dir / f"link-check-report_{timestamp}.txt"
    guide_name = extract_guide_name(url)

    # Print header
    print()
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print(f"{Colors.CYAN} Documentation Link Checker{Colors.NC}")
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print()
    info(f"URL: {url}")
    info(f"Guide: {guide_name}")
    info(f"Timeout: {timeout} seconds")
    if rewrite_rules:
        info(f"Rewrite rules: {len(rewrite_rules)}")
    if ignore_patterns and ignore_patterns != DEFAULT_IGNORE_PATTERNS:
        info(f"Custom ignore patterns: {len(ignore_patterns) - len(DEFAULT_IGNORE_PATTERNS)} added")
    info(f"Report: {report_file}")
    print()

    # Run linkchecker
    result = check_single_url(url, timeout, rewrite_rules, ignore_patterns)

    if result.passed and result.total_errors == 0:
        # Complete success
        print(f"{Colors.GREEN}✓ PASS{Colors.NC} - {result.total_links} links checked")
        print()

        # Generate simple success report
        # (aware local time so %Z prints a zone name instead of nothing)
        report = f"""================================================================================
Documentation Link Check Report
================================================================================
Date: {datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %Z")}
URL: {url}
Guide: {guide_name}
Timeout: {timeout} seconds

================================================================================
SUMMARY
================================================================================

✓ {guide_name} ({result.total_links} links)

================================================================================
STATISTICS
================================================================================
Total Links Checked: {result.total_links}
Status: PASSED
Success Rate: 100.0%

All documentation links are valid!
"""
        # The report contains non-ASCII glyphs; write UTF-8 explicitly so the
        # platform's default encoding cannot make the write fail.
        report_file.write_text(report, encoding="utf-8")
        info(f"Report saved to: {report_file}")
        print()
        success("All documentation links are valid!")
        return 0

    # Calculate real errors
    adjusted_errors = result.total_errors - result.rewritten_valid
    real_errors = adjusted_errors - result.known_issues

    # Print summary
    if real_errors <= 0:
        print(f"{Colors.GREEN}✓ PASS{Colors.NC} - All errors were false positives")
        if result.rewritten_valid > 0:
            print(f" {Colors.BLUE}Note:{Colors.NC} {result.rewritten_valid} path(s) verified at corrected location")
        if result.known_issues > 0:
            print(f" {Colors.YELLOW}Note:{Colors.NC} {result.known_issues} known issues (host:port) - safe to ignore")
    else:
        print(f"{Colors.RED}✗ FAIL{Colors.NC} - {adjusted_errors} errors found")
        if result.rewritten_valid > 0:
            print(f" {Colors.BLUE}Note:{Colors.NC} {result.rewritten_valid} path(s) verified at corrected location (not counted)")
        if result.rewritten_not_found > 0:
            print(f" {Colors.RED}Error:{Colors.NC} {result.rewritten_not_found} path(s) NOT FOUND at corrected location")
        if result.known_issues > 0:
            print(f" {Colors.YELLOW}Note:{Colors.NC} {result.known_issues} known issues (host:port) - safe to ignore")
        if result.timeout_errors > 0:
            print(f" {Colors.RED}TIMEOUT:{Colors.NC} {result.timeout_errors} links exceeded timeout limit")
    print()

    # Generate and save report
    report = generate_single_report(url, guide_name, timeout, result, rewrite_rules)
    report_file.write_text(report, encoding="utf-8")

    info(f"Report saved to: {report_file}")

    if result.rewritten_valid > 0:
        info(f"{result.rewritten_valid} path(s) verified at corrected location")

    if result.known_issues > 0:
        warning(f"{result.known_issues} known issues found (host:port errors) - safe to ignore")

    if result.timeout_errors > 0:
        print()
        error("════════════════════════════════════════════════════════════════")
        error(" TIMEOUT LIMIT REACHED")
        error("════════════════════════════════════════════════════════════════")
        error(f"{result.timeout_errors} links exceeded the timeout limit of {timeout} seconds")
        error(f'Consider increasing timeout: --timeout {timeout + 30}')
        error("════════════════════════════════════════════════════════════════")

    print()
    if real_errors <= 0:
        success("All documentation links are valid!")
        return 0
    else:
        warning("Link check found issues. Review the report for details.")
        return 1
849
+
850
+
851
def run_bulk_mode(url_list_file: Path, timeout: int, rewrite_rules: list[RewriteRule] = None,
                  ignore_patterns: list[str] = None, reports_dir: Path = None):
    """Run link checker for multiple URLs from a file.

    Every URL returned by load_urls() is checked with check_single_url(). A
    PASS/FAIL line is printed per guide, then a combined report is written to
    a timestamped file inside the reports directory and a summary is printed.

    Args:
        url_list_file: File handed to load_urls() to obtain the URLs to check.
        timeout: Timeout in seconds, forwarded to check_single_url().
        rewrite_rules: Optional rewrite rules; None is treated as an empty list.
        ignore_patterns: Optional ignore patterns, forwarded unchanged to
            check_single_url().
        reports_dir: Directory for the report file; falls back to REPORTS_DIR.

    Returns:
        0 when every guide passed; 1 when no URLs were loaded or at least one
        guide failed.
    """
    rewrite_rules = rewrite_rules or []
    reports_dir = reports_dir or REPORTS_DIR

    # Load URLs
    urls = load_urls(url_list_file)
    total_urls = len(urls)

    if total_urls == 0:
        error(f"No URLs found in {url_list_file}")
        return 1

    # Setup
    # parents=True lets a nested reports directory (e.g. "out/reports") be
    # created even when its parent does not exist yet.
    reports_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    report_file = reports_dir / f"link-check-report_{timestamp}.txt"

    # Print header
    print()
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print(f"{Colors.CYAN} Documentation Link Checker - Bulk Mode{Colors.NC}")
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print()
    info(f"URL List: {url_list_file}")
    info(f"Total URLs: {total_urls}")
    info(f"Timeout: {timeout} seconds")
    if rewrite_rules:
        info(f"Rewrite rules: {len(rewrite_rules)}")
    if ignore_patterns and ignore_patterns != DEFAULT_IGNORE_PATTERNS:
        info(f"Custom ignore patterns: {len(ignore_patterns) - len(DEFAULT_IGNORE_PATTERNS)} added")
    info(f"Report: {report_file}")
    print()

    # Process each URL
    bulk_result = BulkResult(total_guides=total_urls)

    for i, url in enumerate(urls, 1):
        guide_name = extract_guide_name(url)
        print(f"{Colors.CYAN}[{i}/{total_urls}]{Colors.NC} Checking: {guide_name}...")

        result = check_single_url(url, timeout, rewrite_rules, ignore_patterns)
        bulk_result.guide_results.append(result)

        if result.passed:
            bulk_result.passed_count += 1
            links_info = f"{result.total_links} links" if result.total_links else "OK"
            rewritten_info = f" [{result.rewritten_valid} rewritten OK]" if result.rewritten_valid > 0 else ""
            print(f" {Colors.GREEN}✓ PASS{Colors.NC} - {links_info}{rewritten_info}")
        else:
            bulk_result.failed_count += 1
            # Paths that resolved after rewriting are not reported as errors.
            adjusted = result.total_errors - result.rewritten_valid
            print(f" {Colors.RED}✗ FAIL{Colors.NC} - {adjusted} errors found")
            if result.rewritten_valid > 0:
                print(f" {Colors.BLUE}Note:{Colors.NC} {result.rewritten_valid} path(s) verified OK (not counted)")
            if result.rewritten_not_found > 0:
                print(f" {Colors.RED}Error:{Colors.NC} {result.rewritten_not_found} path(s) NOT FOUND")
            if result.known_issues > 0:
                print(f" {Colors.YELLOW}Note:{Colors.NC} {result.known_issues} known issues (host:port) - safe to ignore")
            if result.timeout_errors > 0:
                print(f" {Colors.RED}TIMEOUT:{Colors.NC} {result.timeout_errors} links exceeded timeout limit")

        # Accumulate totals
        bulk_result.total_known_issues += result.known_issues
        bulk_result.total_timeout_errors += result.timeout_errors
        bulk_result.total_rewritten_valid += result.rewritten_valid
        bulk_result.total_rewritten_not_found += result.rewritten_not_found

    # Generate and save report
    report = generate_bulk_report(url_list_file, timeout, bulk_result, rewrite_rules)
    # Write as UTF-8 explicitly: the report may contain non-ASCII text, and the
    # locale default encoding (e.g. cp1252) would raise UnicodeEncodeError.
    report_file.write_text(report, encoding="utf-8")

    # Print final summary
    print()
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print(f"{Colors.CYAN} Results{Colors.NC}")
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print()
    print(f" Total: {total_urls} guides")
    print(f" {Colors.GREEN}Pass: {bulk_result.passed_count}{Colors.NC}")
    print(f" {Colors.RED}Fail: {bulk_result.failed_count}{Colors.NC}")
    if bulk_result.total_rewritten_valid > 0:
        print(f" {Colors.BLUE}Rewritten OK: {bulk_result.total_rewritten_valid} paths verified{Colors.NC}")
    if bulk_result.total_rewritten_not_found > 0:
        print(f" {Colors.RED}Rewritten Missing: {bulk_result.total_rewritten_not_found} paths{Colors.NC}")
    if bulk_result.total_known_issues > 0:
        print(f" {Colors.YELLOW}Known: {bulk_result.total_known_issues} issues (safe to ignore){Colors.NC}")
    if bulk_result.total_timeout_errors > 0:
        print(f" {Colors.RED}Timeout: {bulk_result.total_timeout_errors} links{Colors.NC}")

    success_rate = (bulk_result.passed_count / total_urls * 100) if total_urls > 0 else 0
    print(f" Rate: {success_rate:.1f}%")
    print()
    info(f"Report saved to: {report_file}")

    if bulk_result.total_known_issues > 0:
        warning(f"{bulk_result.total_known_issues} known issues found (host:port errors) - safe to ignore")

    if bulk_result.total_timeout_errors > 0:
        print()
        error("════════════════════════════════════════════════════════════════")
        error(" TIMEOUT LIMIT REACHED")
        error("════════════════════════════════════════════════════════════════")
        error(f"{bulk_result.total_timeout_errors} links exceeded the timeout limit of {timeout} seconds")
        error(f"Consider increasing timeout: --file {url_list_file} --timeout {timeout + 30}")
        error("════════════════════════════════════════════════════════════════")

    print()
    if bulk_result.failed_count == 0:
        success("All documentation links are valid!")
        return 0
    else:
        warning("Some documentation links have issues. Review the report for details.")
        return 1
966
+
967
+
968
def main():
    """Command-line entry point: parse arguments, merge config, run the checker.

    Settings are resolved with the precedence CLI > configuration file >
    built-in default. Ignore patterns and rewrite rules are cumulative:
    defaults, then configuration file entries, then CLI entries.

    The function never returns normally; it always terminates the process via
    sys.exit() with the status of the selected mode (or 1 on usage errors,
    a missing linkchecker install, or a missing URL list file).
    """
    parser = argparse.ArgumentParser(
        description='Published Documentation Link Checker',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Single URL
  %(prog)s https://docs.example.com/guide/index.html
  %(prog)s https://docs.example.com/guide/index.html --timeout 90

  # Bulk validation from file
  %(prog)s --file urls-to-check.txt
  %(prog)s --file urls-to-check.txt --timeout 90

  # With URL rewriting for misresolved paths
  %(prog)s https://docs.example.com/product/guide \\
    --rewrite-pattern "/docs/en/product/" \\
    --rewrite-replacement "/docs/en/PRODUCT_V1.0/"

  # With custom ignore patterns
  %(prog)s https://docs.example.com/guide/ \\
    --ignore-pattern "^https?://internal\\.example\\.com" \\
    --ignore-pattern "^https?://staging\\."

  # Using a configuration file
  %(prog)s https://docs.example.com/guide/ --config linkcheck.conf
"""
    )
    parser.add_argument('url', nargs='?',
                        help='Single URL to check')
    parser.add_argument('--file', '-f', type=Path, dest='url_list',
                        help='File containing URLs to check (one per line)')
    # default=None (not DEFAULT_TIMEOUT) so that an explicit
    # "--timeout <DEFAULT_TIMEOUT>" can be distinguished from an omitted flag
    # and still take precedence over the configuration file.
    parser.add_argument('--timeout', '-t', type=int, default=None,
                        help=f'Timeout for each link check in seconds (default: {DEFAULT_TIMEOUT})')
    parser.add_argument('--rewrite-pattern', action='append', dest='rewrite_patterns',
                        help='URL pattern to match for rewriting (can be used multiple times)')
    parser.add_argument('--rewrite-replacement', action='append', dest='rewrite_replacements',
                        help='Replacement for matched pattern (must match --rewrite-pattern count)')
    parser.add_argument('--ignore-pattern', action='append', dest='ignore_patterns',
                        help='Regex pattern for URLs to ignore (can be used multiple times)')
    parser.add_argument('--config', '-c', type=Path, dest='config_file',
                        help=f'Configuration file (default: {DEFAULT_CONFIG_FILE} if it exists)')
    parser.add_argument('--reports-dir', type=Path, dest='reports_dir',
                        help=f'Directory for reports (default: {REPORTS_DIR})')

    args = parser.parse_args()

    # Check linkchecker is installed
    if not check_linkchecker_installed():
        error("linkchecker is not installed")
        print()
        print("Install with: pipx install linkchecker")
        sys.exit(1)

    # Load configuration file
    config_path = args.config_file if args.config_file else DEFAULT_CONFIG_FILE
    config = load_config_file(config_path)

    # Only announce the config file when it exists and contributed a setting.
    if config_path.exists() and (config.ignore_patterns or config.rewrite_rules or
                                 config.timeout is not None or config.reports_dir is not None):
        info(f"Loaded configuration from {config_path}")

    # Determine timeout: CLI > config file > default
    if args.timeout is not None:
        timeout = args.timeout  # CLI explicitly set (even if equal to the default)
    elif config.timeout is not None:
        timeout = config.timeout  # From config file
    else:
        timeout = DEFAULT_TIMEOUT

    # Determine reports directory: CLI > config file > default
    if args.reports_dir is not None:
        reports_dir = args.reports_dir
    elif config.reports_dir is not None:
        reports_dir = config.reports_dir
    else:
        reports_dir = REPORTS_DIR

    # Build ignore patterns: defaults + config file + CLI
    ignore_patterns = DEFAULT_IGNORE_PATTERNS.copy()
    if config.ignore_patterns:
        ignore_patterns.extend(config.ignore_patterns)
    if args.ignore_patterns:
        ignore_patterns.extend(args.ignore_patterns)

    # Parse rewrite rules: config file + CLI
    rewrite_rules = [
        RewriteRule(pattern=pattern, replacement=replacement)
        for pattern, replacement in config.rewrite_rules
    ]
    # action='append' leaves the attribute as None when the flag is absent.
    cli_patterns = args.rewrite_patterns or []
    cli_replacements = args.rewrite_replacements or []
    # Compare counts unconditionally so a lone --rewrite-replacement is
    # rejected too, instead of being silently dropped.
    if len(cli_patterns) != len(cli_replacements):
        error("--rewrite-pattern and --rewrite-replacement must be used in pairs")
        sys.exit(1)
    for pattern, replacement in zip(cli_patterns, cli_replacements):
        rewrite_rules.append(RewriteRule(pattern=pattern, replacement=replacement))

    # Determine mode
    if args.url_list:
        # Bulk mode from file
        if not args.url_list.exists():
            error(f"URL list not found: {args.url_list}")
            sys.exit(1)
        sys.exit(run_bulk_mode(args.url_list, timeout, rewrite_rules, ignore_patterns, reports_dir))

    elif args.url:
        # Single URL mode
        sys.exit(run_single_mode(args.url, timeout, rewrite_rules, ignore_patterns, reports_dir))

    else:
        # No arguments - show help
        parser.print_help()
        sys.exit(1)
1080
+
1081
+
1082
# Entry-point guard: run the CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()