rolfedh-doc-utils 0.1.35__py3-none-any.whl → 0.1.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- check_published_links.py +1083 -0
- check_source_directives.py +101 -0
- convert_tables_to_deflists.py +479 -0
- doc_utils/missing_source_directive.py +211 -0
- doc_utils/version.py +1 -1
- {rolfedh_doc_utils-0.1.35.dist-info → rolfedh_doc_utils-0.1.38.dist-info}/METADATA +3 -2
- {rolfedh_doc_utils-0.1.35.dist-info → rolfedh_doc_utils-0.1.38.dist-info}/RECORD +11 -7
- {rolfedh_doc_utils-0.1.35.dist-info → rolfedh_doc_utils-0.1.38.dist-info}/entry_points.txt +3 -0
- {rolfedh_doc_utils-0.1.35.dist-info → rolfedh_doc_utils-0.1.38.dist-info}/top_level.txt +3 -0
- {rolfedh_doc_utils-0.1.35.dist-info → rolfedh_doc_utils-0.1.38.dist-info}/WHEEL +0 -0
- {rolfedh_doc_utils-0.1.35.dist-info → rolfedh_doc_utils-0.1.38.dist-info}/licenses/LICENSE +0 -0
check_published_links.py
ADDED
|
@@ -0,0 +1,1083 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Published Documentation Link Checker
|
|
4
|
+
|
|
5
|
+
Uses linkchecker to validate links on published HTML documentation pages with special handling for:
|
|
6
|
+
- Misresolved image paths (automatically corrected and verified via configurable URL rewriting)
|
|
7
|
+
- Known false positives (host:port placeholders, Maven Central 403)
|
|
8
|
+
- Timeout detection and reporting
|
|
9
|
+
- Custom ignore patterns via CLI or configuration file
|
|
10
|
+
|
|
11
|
+
Supports both single URL and bulk validation modes.
|
|
12
|
+
|
|
13
|
+
Usage:
|
|
14
|
+
# Single URL
|
|
15
|
+
./check_published_links.py <URL> [--timeout SECONDS]
|
|
16
|
+
|
|
17
|
+
# Bulk validation from file
|
|
18
|
+
./check_published_links.py --file <URL-LIST-FILE> [--timeout SECONDS]
|
|
19
|
+
|
|
20
|
+
# With URL rewriting for misresolved paths
|
|
21
|
+
./check_published_links.py <URL> --rewrite-pattern "/docs/en/product/" --rewrite-replacement "/docs/en/PRODUCT_CODE_1.0/"
|
|
22
|
+
|
|
23
|
+
# With custom ignore patterns
|
|
24
|
+
./check_published_links.py <URL> --ignore-pattern "^https?://internal\\.example\\.com"
|
|
25
|
+
|
|
26
|
+
# Using a configuration file
|
|
27
|
+
./check_published_links.py <URL> --config linkcheck.conf
|
|
28
|
+
|
|
29
|
+
Examples:
|
|
30
|
+
# Single URL
|
|
31
|
+
./check_published_links.py https://docs.example.com/guide/index.html
|
|
32
|
+
./check_published_links.py https://docs.example.com/guide/index.html --timeout 90
|
|
33
|
+
|
|
34
|
+
# Bulk validation
|
|
35
|
+
./check_published_links.py --file urls-to-check.txt
|
|
36
|
+
./check_published_links.py --file urls-to-check.txt --timeout 90
|
|
37
|
+
|
|
38
|
+
# With URL rewriting for documentation platforms that misresolve relative paths
|
|
39
|
+
./check_published_links.py https://docs.example.com/product/guide \\
|
|
40
|
+
--rewrite-pattern "/docs/en/product/" \\
|
|
41
|
+
--rewrite-replacement "/docs/en/PRODUCT_V1.0/"
|
|
42
|
+
|
|
43
|
+
# With custom ignore patterns
|
|
44
|
+
./check_published_links.py https://docs.example.com/guide/ \\
|
|
45
|
+
--ignore-pattern "^https?://internal\\.example\\.com" \\
|
|
46
|
+
--ignore-pattern "^https?://staging\\."
|
|
47
|
+
|
|
48
|
+
Configuration File:
|
|
49
|
+
Create a file (default: .check-published-links.conf) with options:
|
|
50
|
+
|
|
51
|
+
# General settings
|
|
52
|
+
[settings]
|
|
53
|
+
timeout = 30
|
|
54
|
+
reports-dir = ./build/link-reports/
|
|
55
|
+
|
|
56
|
+
# Ignore patterns (one regex per line)
|
|
57
|
+
[ignore-patterns]
|
|
58
|
+
^https?://internal\\.example\\.com
|
|
59
|
+
^https?://staging\\.
|
|
60
|
+
^https?://private-api\\.
|
|
61
|
+
|
|
62
|
+
# Rewrite rules (pattern = replacement)
|
|
63
|
+
[rewrite-rules]
|
|
64
|
+
/docs/en/product/ = /docs/en/PRODUCT_V1.0/
|
|
65
|
+
/docs/en/product/images/ = /docs/en/PRODUCT_V1.0/images/
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
import subprocess
|
|
69
|
+
import sys
|
|
70
|
+
import re
|
|
71
|
+
import argparse
|
|
72
|
+
from datetime import datetime
|
|
73
|
+
from pathlib import Path
|
|
74
|
+
from dataclasses import dataclass, field
|
|
75
|
+
import urllib.request
|
|
76
|
+
import urllib.error
|
|
77
|
+
|
|
78
|
+
# Configuration
DEFAULT_TIMEOUT = 15  # seconds (see --timeout in the module docstring)
REPORTS_DIR = Path("reports")  # fallback report directory when none is supplied
DEFAULT_CONFIG_FILE = Path(".check-published-links.conf")  # default configuration file name
|
|
82
|
+
|
|
83
|
+
# ANSI colors
|
|
84
|
+
class Colors:
    """ANSI escape sequences used to colorize console messages."""

    RED = '\033[0;31m'
    GREEN = '\033[0;32m'
    YELLOW = '\033[1;33m'
    BLUE = '\033[0;34m'
    CYAN = '\033[0;36m'
    NC = '\033[0m'  # No Color
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Default ignore URL patterns for linkchecker.
# Each entry is a regular expression handed to linkchecker as --ignore-url.
DEFAULT_IGNORE_PATTERNS = [
    r"^https?://localhost(:[0-9]+)?(/.*)?$",                    # localhost, optional port
    r"^https?://127\.0\.0\.1(:[0-9]+)?(/.*)?$",                 # loopback address
    r"^https?://([a-zA-Z0-9-]+\.)?example\.(com|org)(/.*)?$",   # example.com / example.org
    r"^https?://([a-zA-Z0-9-]+\.)?application\.com(/.*)?$",     # application.com
    r"^https?://host:port",                                     # literal host:port placeholder
    r".*,.*",                                                   # comma-separated URL lists
    r".*%2C.*",                                                 # same, with the comma URL-encoded
]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# =============================================================================
|
|
106
|
+
# Configuration file parsing
|
|
107
|
+
# =============================================================================
|
|
108
|
+
|
|
109
|
+
@dataclass
class Config:
    """Settings read from a configuration file.

    ``timeout`` and ``reports_dir`` stay ``None`` when the file does not set
    them, so a caller can tell "not configured" apart from a real value.
    """
    timeout: int | None = None
    reports_dir: Path | None = None
    # Regex strings, one per line of the [ignore-patterns] section.
    ignore_patterns: list[str] = field(default_factory=list)
    # (pattern, replacement) tuples from the [rewrite-rules] section.
    rewrite_rules: list[tuple[str, str]] = field(default_factory=list)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def load_config_file(config_path: Path) -> Config:
    """Load settings, ignore patterns and rewrite rules from a config file.

    The file is divided into ``[settings]``, ``[ignore-patterns]`` and
    ``[rewrite-rules]`` sections. Blank lines and lines starting with ``#``
    are skipped, and lines under any other section name are ignored.

    Args:
        config_path: Location of the configuration file.

    Returns:
        A Config holding whatever the file defined; a default Config when
        the file does not exist.
    """
    config = Config()

    if not config_path.exists():
        return config

    current_section = None

    # utf-8-sig decodes independently of the locale and drops a leading BOM,
    # which would otherwise keep the first section header from being recognized.
    with open(config_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()

            # Skip empty lines and comments
            if not line or line.startswith('#'):
                continue

            # Section header: remember it and move on
            if line.startswith('[') and line.endswith(']'):
                current_section = line[1:-1].lower()
                continue

            if current_section == 'settings':
                key, sep, value = line.partition('=')
                if not sep:
                    continue
                key = key.strip().lower()
                value = value.strip()
                if key == 'timeout':
                    try:
                        config.timeout = int(value)
                    except ValueError:
                        # Deliberate best effort: an unparsable timeout
                        # leaves the setting unset.
                        pass
                elif key == 'reports-dir':
                    config.reports_dir = Path(value)
            elif current_section == 'ignore-patterns':
                config.ignore_patterns.append(line)
            elif current_section == 'rewrite-rules':
                pattern, sep, replacement = line.partition('=')
                pattern = pattern.strip()
                # An empty pattern is a substring of every URL, so such a
                # rule would rewrite (and corrupt) every URL; drop it.
                if sep and pattern:
                    config.rewrite_rules.append((pattern, replacement.strip()))

    return config
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@dataclass
class RewriteRule:
    """Substring-based URL rewrite rule for correcting misresolved paths."""
    pattern: str
    replacement: str

    def matches(self, url: str) -> bool:
        """Return True when ``pattern`` occurs in ``url``.

        An empty pattern never matches: ``'' in url`` is always true, and
        applying such a rule would splice ``replacement`` between every
        character of the URL.
        """
        return bool(self.pattern) and self.pattern in url

    def apply(self, url: str) -> str:
        """Return ``url`` with every occurrence of ``pattern`` replaced."""
        if not self.pattern:
            return url
        return url.replace(self.pattern, self.replacement)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@dataclass
class LinkError:
    """One link entry parsed from linkchecker's text output.

    Each field holds the text that follows the matching label in the
    output; a field stays empty when its label does not appear.
    """
    url: str = ""         # "URL" line, surrounding quote characters removed
    name: str = ""        # "Name" line, surrounding quote characters removed
    parent_url: str = ""  # "Parent URL" line
    real_url: str = ""    # "Real URL" line
    check_time: str = ""  # "Check time" line
    result: str = ""      # "Result" line
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@dataclass
class CheckResult:
    """Results from link checking a single URL."""
    url: str = ""
    guide_name: str = ""
    total_errors: int = 0          # error count taken from linkchecker's output
    total_links: int = 0           # link count taken from linkchecker's output
    known_issues: int = 0          # "host:port has invalid port" placeholder errors
    timeout_errors: int = 0        # timeout mentions found in the output
    maven_403: int = 0             # Maven Central 403 Forbidden responses
    rewritten_valid: int = 0       # errors whose rewritten URL was found
    rewritten_not_found: int = 0   # errors whose rewritten URL was not found
    errors: list[LinkError] = field(default_factory=list)
    passed: bool = True
    raw_output: str = ""           # unmodified linkchecker output
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@dataclass
class BulkResult:
    """Aggregated results from bulk link checking."""
    total_guides: int = 0
    passed_count: int = 0
    failed_count: int = 0
    total_known_issues: int = 0
    total_timeout_errors: int = 0
    total_rewritten_valid: int = 0
    total_rewritten_not_found: int = 0
    # Per-guide results; the bulk report reads these in list order.
    guide_results: list[CheckResult] = field(default_factory=list)
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# =============================================================================
|
|
225
|
+
# Output helpers
|
|
226
|
+
# =============================================================================
|
|
227
|
+
|
|
228
|
+
def info(msg: str) -> None:
    """Print an informational message to stdout, prefixed with a blue "ℹ"."""
    print(f"{Colors.BLUE}ℹ{Colors.NC} {msg}")
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def success(msg: str) -> None:
    """Print a success message to stdout, prefixed with a green "✓"."""
    print(f"{Colors.GREEN}✓{Colors.NC} {msg}")
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def error(msg: str) -> None:
    """Print an error message to stderr, prefixed with a red "✗"."""
    print(f"{Colors.RED}✗{Colors.NC} {msg}", file=sys.stderr)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def warning(msg: str) -> None:
    """Print a warning message to stdout, prefixed with a yellow "⚠"."""
    print(f"{Colors.YELLOW}⚠{Colors.NC} {msg}")
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
# =============================================================================
|
|
245
|
+
# Core link checking functions
|
|
246
|
+
# =============================================================================
|
|
247
|
+
|
|
248
|
+
def check_linkchecker_installed() -> bool:
    """Return True when ``linkchecker --version`` runs and exits with status 0."""
    try:
        subprocess.run(["linkchecker", "--version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing binary (FileNotFoundError) as well as one
        # that exists but cannot be executed (e.g. PermissionError).
        return False
    return True
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def is_rewritable_path(url: str, rewrite_rules: list[RewriteRule]) -> bool:
    """Return True when at least one rewrite rule matches ``url``.

    An empty rule list never matches.
    """
    return any(rule.matches(url) for rule in rewrite_rules)
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def get_rewritten_url(url: str, rewrite_rules: list[RewriteRule]) -> str:
    """Apply the first matching rewrite rule to ``url``.

    Rules are tried in list order and only the first match is applied.
    The URL is returned unchanged when no rule matches.
    """
    first_match = next((rule for rule in rewrite_rules if rule.matches(url)), None)
    if first_match is None:
        return url
    return first_match.apply(url)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def check_rewritten_url(wrong_url: str, rewrite_rules: list[RewriteRule]) -> tuple[bool, str]:
    """Check whether the resource exists at its rewritten location.

    The URL is rewritten with the first matching rule and requested with a
    HEAD request (10 second timeout).

    Args:
        wrong_url: URL as reported by linkchecker.
        rewrite_rules: Rules used to compute the corrected URL.

    Returns:
        ``(exists, corrected_url)``. ``exists`` is True for a 200, 301 or
        302 response, and False for any other status or any failure to
        complete the request.
    """
    import http.client  # local import: only needed for the exception type below

    correct_url = get_rewritten_url(wrong_url, rewrite_rules)
    try:
        req = urllib.request.Request(correct_url, method='HEAD')
        req.add_header('User-Agent', 'Mozilla/5.0 (compatible; LinkChecker)')
        with urllib.request.urlopen(req, timeout=10) as response:
            return response.status in (200, 302, 301), correct_url
    except urllib.error.HTTPError as e:
        return e.code in (302, 301), correct_url
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError, timeouts and connection resets; ValueError
        # is raised for a URL without a usable scheme; HTTPException covers
        # malformed URLs and broken responses. None of these may abort the
        # run: the resource simply counts as not found.
        return False, correct_url
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def run_linkchecker(url: str, timeout: int, ignore_patterns: list[str] | None = None) -> tuple[int, str]:
    """Run linkchecker against ``url`` and return ``(exit_code, output)``.

    Args:
        url: Page to check.
        timeout: Value passed to linkchecker's ``--timeout`` option.
        ignore_patterns: Regexes passed as ``--ignore-url`` options;
            DEFAULT_IGNORE_PATTERNS is used when None.

    Returns:
        The process exit code and its stdout followed by its stderr.
    """
    if ignore_patterns is None:
        ignore_patterns = DEFAULT_IGNORE_PATTERNS

    cmd = [
        "linkchecker",
        "--check-extern",
        "--no-follow-url=.*",
        "--no-warnings",
        f"--timeout={timeout}",
        *(f"--ignore-url={pattern}" for pattern in ignore_patterns),
        url,
    ]

    # errors="replace": checked pages can put bytes into the output that the
    # locale encoding cannot decode; a replacement character is better than
    # an aborted run.
    result = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    return result.returncode, result.stdout + result.stderr
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def extract_guide_name(url: str) -> str:
    """Derive a readable guide name from a documentation URL.

    Tries, in order: ``/html-single/<name>/``, ``?topic=<name>``, and
    ``/<name>/index.html`` or ``/<name>.html``. If none applies, the last
    non-empty path segment is used, so a URL ending in ``/`` yields its
    final segment rather than the whole URL.
    """
    # Pattern: /html-single/guide-name/
    match = re.search(r'/html-single/([^/]+)/', url)
    if match:
        return match.group(1).replace('_', ' ')

    # Pattern: ?topic=guide-name
    match = re.search(r'\?topic=(.+)$', url)
    if match:
        return match.group(1)

    # Pattern: /guide-name/index.html or /guide-name.html
    match = re.search(r'/([^/]+?)(?:/index)?\.html?$', url)
    if match:
        return match.group(1).replace('-', ' ').replace('_', ' ')

    # Fallback: strip trailing slashes first, otherwise the last segment of
    # "…/guide/" is empty and the whole URL would become the name.
    return url.rstrip('/').split('/')[-1] or url
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def parse_linkchecker_output(output: str) -> CheckResult:
    """Parse linkchecker's text output into a CheckResult.

    Totals come from the first "<n> error" and "<n> link" phrases in the
    output. Individual entries are collected from the labelled lines
    (URL, Name, Parent URL, Real URL, Check time, Result); each "URL" line
    starts a new entry.

    Args:
        output: Combined linkchecker output.

    Returns:
        A CheckResult with totals, counters, parsed entries and the raw output.
    """
    result = CheckResult(raw_output=output)

    # Extract total error count
    error_match = re.search(r'(\d+)\s+error', output)
    if error_match:
        result.total_errors = int(error_match.group(1))

    # Extract link count
    link_match = re.search(r'(\d+)\s+link', output)
    if link_match:
        result.total_links = int(link_match.group(1))

    # Count known issues
    result.known_issues = output.count("URL host 'host:port' has invalid port")

    # Count timeout errors ("ReadTimeout" contains "Timeout", so a single
    # substring count covers both spellings without double counting)
    result.timeout_errors = output.count('Timeout')

    # Parse individual errors
    current_error = LinkError()
    for line in output.split('\n'):
        if line.startswith('URL ') and not line.startswith('URL lengths'):
            if current_error.url:
                result.errors.append(current_error)
            current_error = LinkError(url=line[4:].strip().strip('`\''))
        elif line.startswith('Name '):
            current_error.name = line[5:].strip().strip('`\'')
        elif line.startswith('Parent URL'):
            current_error.parent_url = line[10:].strip()
        elif line.startswith('Real URL'):
            current_error.real_url = line[8:].strip()
        elif line.startswith('Check time'):
            current_error.check_time = line[10:].strip()
        elif line.startswith('Result'):
            current_error.result = line[6:].strip()

    if current_error.url:
        result.errors.append(current_error)

    # Detect Maven Central 403: count failing entries, not every mention of
    # the host (which appears on several lines of one entry).
    result.maven_403 = sum(
        1
        for err in result.errors
        if 'search.maven.org' in (err.real_url or err.url) and '403 Forbidden' in err.result
    )

    return result
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def verify_rewritten_paths(result: CheckResult, rewrite_rules: list[RewriteRule]) -> CheckResult:
    """Re-check errors whose real URL matches a rewrite rule.

    For each such error the rewritten URL is probed. Errors confirmed at the
    rewritten location increment ``rewritten_valid`` and are removed from
    ``result.errors``; the others increment ``rewritten_not_found`` and stay.

    Args:
        result: Check result to update in place.
        rewrite_rules: Rules to apply; when empty, ``result`` is returned untouched.

    Returns:
        The same ``result`` object.
    """
    if not rewrite_rules:
        return result

    # Outcome per real URL, so a URL reported by several errors is requested
    # only once and all of its errors are treated consistently.
    outcome_by_url: dict[str, bool] = {}

    for err in result.errors:
        real_url = err.real_url
        if not is_rewritable_path(real_url, rewrite_rules):
            continue
        if real_url not in outcome_by_url:
            exists, _ = check_rewritten_url(real_url, rewrite_rules)
            outcome_by_url[real_url] = exists
        if outcome_by_url[real_url]:
            result.rewritten_valid += 1
        else:
            result.rewritten_not_found += 1

    # Filter out verified paths from errors list
    result.errors = [e for e in result.errors if not outcome_by_url.get(e.real_url, False)]

    return result
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def check_single_url(url: str, timeout: int, rewrite_rules: list[RewriteRule] | None = None,
                     ignore_patterns: list[str] | None = None) -> CheckResult:
    """Run linkchecker on one URL and classify the outcome.

    Args:
        url: Page to check.
        timeout: Timeout passed on to linkchecker.
        rewrite_rules: Optional rules for re-checking misresolved paths.
        ignore_patterns: Optional ignore regexes passed on to linkchecker.

    Returns:
        A CheckResult. ``passed`` is True when linkchecker exits with 0, or
        when every reported error is a verified rewritten path or a known
        issue.
    """
    rewrite_rules = rewrite_rules or []
    result = CheckResult(url=url, guide_name=extract_guide_name(url))

    exit_code, output = run_linkchecker(url, timeout, ignore_patterns)
    result.raw_output = output

    if exit_code == 0:
        # Success
        link_match = re.search(r'(\d+)\s+link', output)
        result.total_links = int(link_match.group(1)) if link_match else 0
        result.passed = True
        return result

    # Parse errors
    parsed = parse_linkchecker_output(output)
    result.total_errors = parsed.total_errors
    result.total_links = parsed.total_links
    result.known_issues = parsed.known_issues
    result.timeout_errors = parsed.timeout_errors
    result.maven_403 = parsed.maven_403
    result.errors = parsed.errors

    # Verify rewritten paths
    result = verify_rewritten_paths(result, rewrite_rules)

    # Determine if passed (all errors were false positives)
    adjusted_errors = result.total_errors - result.rewritten_valid
    real_errors = adjusted_errors - result.known_issues
    # A non-zero exit with no reported errors means linkchecker itself failed
    # (program error, unparsable output); that must not count as a pass.
    result.passed = result.total_errors > 0 and real_errors <= 0

    return result
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# =============================================================================
|
|
439
|
+
# Helper functions for bulk mode
|
|
440
|
+
# =============================================================================
|
|
441
|
+
|
|
442
|
+
def load_urls(url_list_file: Path) -> list[str]:
    """Load URLs from a file, one per line.

    Surrounding whitespace is stripped; empty lines and lines starting with
    ``#`` are skipped.

    Args:
        url_list_file: Text file containing the URLs.

    Returns:
        The URLs in file order.
    """
    # utf-8-sig decodes independently of the locale and drops a leading BOM,
    # which would otherwise become part of the first URL.
    with open(url_list_file, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (raw_line.strip() for raw_line in f)
        return [line for line in stripped_lines if line and not line.startswith('#')]
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
# =============================================================================
|
|
454
|
+
# Report generation
|
|
455
|
+
# =============================================================================
|
|
456
|
+
|
|
457
|
+
def generate_single_report(url: str, guide_name: str, timeout: int, result: CheckResult,
                           rewrite_rules: list[RewriteRule] | None = None) -> str:
    """Build the plain-text report for a single URL check.

    Args:
        url: The checked URL.
        guide_name: Display name of the guide.
        timeout: Timeout (seconds) used for the check; shown in the report.
        result: Outcome of the check.
        rewrite_rules: Rewrite rules in effect; when non-empty, the known
            issues section also explains misresolved paths.

    Returns:
        The complete report text.
    """
    # astimezone() attaches the local zone; on a naive datetime %Z renders as
    # an empty string and the Date line would carry no zone name.
    timestamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %Z")

    # Calculate adjusted error count
    adjusted_errors = result.total_errors - result.rewritten_valid
    real_errors = adjusted_errors - result.known_issues

    if real_errors <= 0:
        status = "PASSED (all errors were false positives)"
        status_icon = "✓"
    else:
        status = "FAILED"
        status_icon = "✗"

    report = f"""================================================================================
Documentation Link Check Report
================================================================================
Date: {timestamp}
URL: {url}
Guide: {guide_name}
Timeout: {timeout} seconds

================================================================================
SUMMARY
================================================================================

{status_icon} {guide_name} ({adjusted_errors} errors"""

    # Optional qualifiers inside the summary line's parentheses
    if result.rewritten_valid > 0:
        report += f", {result.rewritten_valid} rewritten paths OK"
    if result.known_issues > 0:
        report += f", {result.known_issues} known"
    if result.timeout_errors > 0:
        report += f", {result.timeout_errors} TIMEOUT"
    report += ")\n"

    report += f"""
================================================================================
STATISTICS
================================================================================
Total Errors Reported: {result.total_errors}
Rewritten Paths Verified OK: {result.rewritten_valid}
Rewritten Paths NOT Found: {result.rewritten_not_found}
Adjusted Error Count: {adjusted_errors}
Known Issues Found: {result.known_issues} (safe to ignore)
Timeout Errors: {result.timeout_errors}
Status: {status}
"""

    report += _generate_known_issues_section(rewrite_rules)

    if result.timeout_errors > 0:
        report += f"""
================================================================================
*** TIMEOUT LIMIT REACHED ***
================================================================================

{result.timeout_errors} links exceeded the timeout limit of {timeout} seconds.

This indicates slow server responses or network issues, not broken links.

RECOMMENDED ACTION:
Re-run with increased timeout: --timeout {timeout + 30}

Timeout errors should be investigated separately from broken links.
"""

    # Add detailed error information if there are real errors
    if result.errors:
        report += f"""
================================================================================
DETAILED ERROR INFORMATION
================================================================================

The following section provides detailed error information.
This allows you to trace specific failures to exact URLs and error messages.

════════════════════════════════════════════════════════════════
FAILED GUIDE: {guide_name}
════════════════════════════════════════════════════════════════
URL: {url}
Total Errors: {len(result.errors)}
"""
        if result.rewritten_valid > 0:
            report += f"Rewritten Paths Verified OK: {result.rewritten_valid}\n"
        if result.rewritten_not_found > 0:
            report += f"Rewritten Paths NOT Found: {result.rewritten_not_found}\n"

        report += _generate_error_details(result.errors)

    return report
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def generate_bulk_report(url_list_file: Path, timeout: int, bulk_result: BulkResult,
                         rewrite_rules: list[RewriteRule] | None = None) -> str:
    """Build the plain-text report for a bulk URL check.

    Args:
        url_list_file: File the URLs were read from; shown in the report.
        timeout: Timeout (seconds) used for the checks; shown in the report.
        bulk_result: Aggregated totals and per-guide results.
        rewrite_rules: Rewrite rules in effect; when non-empty, the known
            issues section also explains misresolved paths.

    Returns:
        The complete report text.
    """
    # astimezone() attaches the local zone; on a naive datetime %Z renders as
    # an empty string and the Date line would carry no zone name.
    timestamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %Z")

    # Report fragments, joined once at the end.
    parts = [f"""================================================================================
Documentation Link Check Report
================================================================================
Date: {timestamp}
Total Guides Checked: {bulk_result.total_guides}
URL List: {url_list_file}
Timeout: {timeout} seconds

================================================================================
SUMMARY
================================================================================

"""]

    # Add summary line for each guide
    for guide in bulk_result.guide_results:
        if guide.passed:
            line = f"✓ {guide.guide_name}"
            if guide.total_links:
                line += f" ({guide.total_links} links)"
            if guide.rewritten_valid > 0:
                line += f" [{guide.rewritten_valid} rewritten OK]"
        else:
            adjusted = guide.total_errors - guide.rewritten_valid
            line = f"✗ {guide.guide_name} ({adjusted} errors"
            if guide.rewritten_valid > 0:
                line += f", {guide.rewritten_valid} rewritten OK"
            if guide.known_issues > 0:
                line += f", {guide.known_issues} known"
            if guide.timeout_errors > 0:
                line += f", {guide.timeout_errors} TIMEOUT"
            line += ")"
        parts.append(line + "\n")

    # Statistics (guard against division by zero when nothing was checked)
    success_rate = (bulk_result.passed_count / bulk_result.total_guides * 100) if bulk_result.total_guides > 0 else 0

    parts.append(f"""
================================================================================
STATISTICS
================================================================================
Total Guides Checked: {bulk_result.total_guides}
Passed: {bulk_result.passed_count}
Failed: {bulk_result.failed_count}
Rewritten Paths Verified OK: {bulk_result.total_rewritten_valid}
Rewritten Paths NOT Found: {bulk_result.total_rewritten_not_found}
Known Issues Found: {bulk_result.total_known_issues} (safe to ignore)
Timeout Errors: {bulk_result.total_timeout_errors}
Success Rate: {success_rate:.1f}%
""")

    parts.append(_generate_known_issues_section(rewrite_rules))

    # Timeout warning if any
    if bulk_result.total_timeout_errors > 0:
        parts.append(f"""
================================================================================
*** TIMEOUT LIMIT REACHED ***
================================================================================

{bulk_result.total_timeout_errors} links exceeded the timeout limit of {timeout} seconds.

This indicates slow server responses or network issues, not broken links.

RECOMMENDED ACTION:
Re-run with increased timeout: --file {url_list_file} --timeout {timeout + 30}

Timeout errors should be investigated separately from broken links.
""")

    # Detailed errors for failed guides
    failed_guides = [g for g in bulk_result.guide_results if not g.passed]
    if failed_guides:
        parts.append("""
================================================================================
DETAILED ERROR INFORMATION BY GUIDE
================================================================================

The following section provides detailed error information for each failed guide.
This allows you to trace specific failures to exact URLs and error messages.
""")

        for guide in failed_guides:
            parts.append(f"""
════════════════════════════════════════════════════════════════
FAILED GUIDE: {guide.guide_name}
════════════════════════════════════════════════════════════════
URL: {guide.url}
Total Errors: {len(guide.errors)}
""")
            if guide.rewritten_valid > 0:
                parts.append(f"Rewritten Paths Verified OK: {guide.rewritten_valid}\n")
            if guide.rewritten_not_found > 0:
                parts.append(f"Rewritten Paths NOT Found: {guide.rewritten_not_found}\n")
            if guide.known_issues > 0:
                parts.append(f"Known Issues: {guide.known_issues} (host:port errors - safe to ignore)\n")
            if guide.timeout_errors > 0:
                parts.append(f"\n*** TIMEOUT LIMIT REACHED: {guide.timeout_errors} links ***\n")
                parts.append(f"Consider increasing timeout: --file {url_list_file} --timeout {timeout + 30}\n")

            parts.append(_generate_error_details(guide.errors))

    return "".join(parts)
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def _generate_known_issues_section(rewrite_rules: list[RewriteRule] = None) -> str:
|
|
663
|
+
"""Generate the known issues section for reports."""
|
|
664
|
+
section = """
|
|
665
|
+
================================================================================
|
|
666
|
+
KNOWN ISSUES (Safe to Ignore)
|
|
667
|
+
================================================================================
|
|
668
|
+
|
|
669
|
+
The following errors are expected due to LinkChecker limitations:
|
|
670
|
+
|
|
671
|
+
1. "URL host 'host:port' has invalid port"
|
|
672
|
+
- URLs like https://host:port/auth or https://host:port/realms/{realm}
|
|
673
|
+
- These are documentation placeholders using literal "port" text
|
|
674
|
+
- LinkChecker cannot skip syntax-invalid URLs
|
|
675
|
+
- Safe to ignore - not real broken links
|
|
676
|
+
|
|
677
|
+
2. Comma-separated URL lists
|
|
678
|
+
- URLs like http://www.example.com,http://localhost:3000
|
|
679
|
+
- These are examples showing configuration format
|
|
680
|
+
- Already filtered by ignore patterns
|
|
681
|
+
- Should not appear in error logs
|
|
682
|
+
|
|
683
|
+
3. Maven Central 403 Forbidden errors
|
|
684
|
+
- URLs like https://search.maven.org/artifact/...
|
|
685
|
+
- Maven Central blocks automated bots/scrapers with 403 Forbidden
|
|
686
|
+
- These links work fine for humans in a web browser
|
|
687
|
+
- Verify manually if needed - not broken documentation
|
|
688
|
+
"""
|
|
689
|
+
|
|
690
|
+
if rewrite_rules:
|
|
691
|
+
section += """
|
|
692
|
+
4. Misresolved image/resource paths (AUTOMATICALLY VERIFIED)
|
|
693
|
+
- Some documentation platforms use URL routing that causes LinkChecker
|
|
694
|
+
to resolve relative paths against an incorrect base URL
|
|
695
|
+
- This script automatically verifies resources at the corrected path
|
|
696
|
+
- If verified OK, the error is NOT counted; if not found, it IS a real error
|
|
697
|
+
"""
|
|
698
|
+
|
|
699
|
+
section += """
|
|
700
|
+
If you see these errors, they do NOT indicate broken documentation.
|
|
701
|
+
All other errors should be investigated and fixed.
|
|
702
|
+
"""
|
|
703
|
+
return section
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
def _generate_error_details(errors: list) -> str:
|
|
707
|
+
"""Generate error details section."""
|
|
708
|
+
report = """
|
|
709
|
+
Error Details:
|
|
710
|
+
────────────────────────────────────────────────────────────────
|
|
711
|
+
"""
|
|
712
|
+
for err in errors:
|
|
713
|
+
report += f"URL {err.url}\n"
|
|
714
|
+
if err.name:
|
|
715
|
+
report += f"Name {err.name}\n"
|
|
716
|
+
if err.parent_url:
|
|
717
|
+
report += f"Parent URL {err.parent_url}\n"
|
|
718
|
+
if err.real_url:
|
|
719
|
+
report += f"Real URL {err.real_url}\n"
|
|
720
|
+
if err.check_time:
|
|
721
|
+
report += f"Check time {err.check_time}\n"
|
|
722
|
+
if err.result:
|
|
723
|
+
report += f"Result {err.result}\n"
|
|
724
|
+
report += "\n"
|
|
725
|
+
return report
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
# =============================================================================
|
|
729
|
+
# Main entry points
|
|
730
|
+
# =============================================================================
|
|
731
|
+
|
|
732
|
+
def run_single_mode(url: str, timeout: int, rewrite_rules: list[RewriteRule] = None,
                    ignore_patterns: list[str] = None, reports_dir: Path = None):
    """Run the link checker for a single URL, print a summary and save a report.

    Args:
        url: URL of the page to check.
        timeout: Per-link timeout in seconds, forwarded to ``check_single_url``.
        rewrite_rules: Optional URL rewrite rules; ``None`` means no rules.
        ignore_patterns: Optional ignore patterns forwarded to ``check_single_url``.
        reports_dir: Directory for the report file; ``None`` selects ``REPORTS_DIR``.

    Returns:
        0 when no real errors remain after discounting paths verified at their
        rewritten location and known issues, otherwise 1.
    """
    rewrite_rules = rewrite_rules or []
    reports_dir = reports_dir or REPORTS_DIR

    # Setup. parents=True so a nested reports directory (e.g. "out/reports")
    # is created instead of raising FileNotFoundError.
    reports_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    report_file = reports_dir / f"link-check-report_{timestamp}.txt"
    guide_name = extract_guide_name(url)

    # Print header
    print()
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print(f"{Colors.CYAN} Documentation Link Checker{Colors.NC}")
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print()
    info(f"URL: {url}")
    info(f"Guide: {guide_name}")
    info(f"Timeout: {timeout} seconds")
    if rewrite_rules:
        info(f"Rewrite rules: {len(rewrite_rules)}")
    if ignore_patterns and ignore_patterns != DEFAULT_IGNORE_PATTERNS:
        info(f"Custom ignore patterns: {len(ignore_patterns) - len(DEFAULT_IGNORE_PATTERNS)} added")
    info(f"Report: {report_file}")
    print()

    # Run linkchecker
    result = check_single_url(url, timeout, rewrite_rules, ignore_patterns)

    if result.passed and result.total_errors == 0:
        # Complete success
        print(f"{Colors.GREEN}✓ PASS{Colors.NC} - {result.total_links} links checked")
        print()

        # A naive datetime renders %Z as an empty string; astimezone() attaches
        # the local zone so the report's Date line carries a zone name.
        generated_at = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %Z")

        # Generate simple success report
        report = f"""================================================================================
Documentation Link Check Report
================================================================================
Date: {generated_at}
URL: {url}
Guide: {guide_name}
Timeout: {timeout} seconds

================================================================================
SUMMARY
================================================================================

✓ {guide_name} ({result.total_links} links)

================================================================================
STATISTICS
================================================================================
Total Links Checked: {result.total_links}
Status: PASSED
Success Rate: 100.0%

All documentation links are valid!
"""
        # The report contains non-ASCII characters (e.g. "✓"); write it as UTF-8
        # explicitly so the locale's default encoding cannot make this fail.
        report_file.write_text(report, encoding="utf-8")
        info(f"Report saved to: {report_file}")
        print()
        success("All documentation links are valid!")
        return 0

    # Calculate real errors: drop paths that verified OK after rewriting,
    # then drop known issues.
    adjusted_errors = result.total_errors - result.rewritten_valid
    real_errors = adjusted_errors - result.known_issues
    # NOTE(review): the exit status below is derived from real_errors, whereas
    # run_bulk_mode decides pass/fail from result.passed — confirm the two agree.

    # Print summary
    if real_errors <= 0:
        print(f"{Colors.GREEN}✓ PASS{Colors.NC} - All errors were false positives")
        if result.rewritten_valid > 0:
            print(f" {Colors.BLUE}Note:{Colors.NC} {result.rewritten_valid} path(s) verified at corrected location")
        if result.known_issues > 0:
            print(f" {Colors.YELLOW}Note:{Colors.NC} {result.known_issues} known issues (host:port) - safe to ignore")
    else:
        print(f"{Colors.RED}✗ FAIL{Colors.NC} - {adjusted_errors} errors found")
        if result.rewritten_valid > 0:
            print(f" {Colors.BLUE}Note:{Colors.NC} {result.rewritten_valid} path(s) verified at corrected location (not counted)")
        if result.rewritten_not_found > 0:
            print(f" {Colors.RED}Error:{Colors.NC} {result.rewritten_not_found} path(s) NOT FOUND at corrected location")
        if result.known_issues > 0:
            print(f" {Colors.YELLOW}Note:{Colors.NC} {result.known_issues} known issues (host:port) - safe to ignore")
        if result.timeout_errors > 0:
            print(f" {Colors.RED}TIMEOUT:{Colors.NC} {result.timeout_errors} links exceeded timeout limit")
    print()

    # Generate and save report (UTF-8 for the same reason as above).
    report = generate_single_report(url, guide_name, timeout, result, rewrite_rules)
    report_file.write_text(report, encoding="utf-8")

    info(f"Report saved to: {report_file}")

    if result.rewritten_valid > 0:
        info(f"{result.rewritten_valid} path(s) verified at corrected location")

    if result.known_issues > 0:
        warning(f"{result.known_issues} known issues found (host:port errors) - safe to ignore")

    if result.timeout_errors > 0:
        print()
        error("════════════════════════════════════════════════════════════════")
        error(" TIMEOUT LIMIT REACHED")
        error("════════════════════════════════════════════════════════════════")
        error(f"{result.timeout_errors} links exceeded the timeout limit of {timeout} seconds")
        error(f'Consider increasing timeout: --timeout {timeout + 30}')
        error("════════════════════════════════════════════════════════════════")

    print()
    if real_errors <= 0:
        success("All documentation links are valid!")
        return 0
    else:
        warning("Link check found issues. Review the report for details.")
        return 1
|
|
849
|
+
|
|
850
|
+
|
|
851
|
+
def run_bulk_mode(url_list_file: Path, timeout: int, rewrite_rules: list[RewriteRule] = None,
                  ignore_patterns: list[str] = None, reports_dir: Path = None):
    """Run the link checker for every URL listed in a file and save one report.

    Args:
        url_list_file: File whose URLs are loaded with ``load_urls``.
        timeout: Per-link timeout in seconds, forwarded to ``check_single_url``.
        rewrite_rules: Optional URL rewrite rules; ``None`` means no rules.
        ignore_patterns: Optional ignore patterns forwarded to ``check_single_url``.
        reports_dir: Directory for the report file; ``None`` selects ``REPORTS_DIR``.

    Returns:
        0 when every checked URL passed; 1 when at least one failed or when
        the file yielded no URLs.
    """
    rewrite_rules = rewrite_rules or []
    reports_dir = reports_dir or REPORTS_DIR

    # Load URLs
    urls = load_urls(url_list_file)
    total_urls = len(urls)

    if total_urls == 0:
        error(f"No URLs found in {url_list_file}")
        return 1

    # Setup. parents=True so a nested reports directory (e.g. "out/reports")
    # is created instead of raising FileNotFoundError.
    reports_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    report_file = reports_dir / f"link-check-report_{timestamp}.txt"

    # Print header
    print()
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print(f"{Colors.CYAN} Documentation Link Checker - Bulk Mode{Colors.NC}")
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print()
    info(f"URL List: {url_list_file}")
    info(f"Total URLs: {total_urls}")
    info(f"Timeout: {timeout} seconds")
    if rewrite_rules:
        info(f"Rewrite rules: {len(rewrite_rules)}")
    if ignore_patterns and ignore_patterns != DEFAULT_IGNORE_PATTERNS:
        info(f"Custom ignore patterns: {len(ignore_patterns) - len(DEFAULT_IGNORE_PATTERNS)} added")
    info(f"Report: {report_file}")
    print()

    # Process each URL
    bulk_result = BulkResult(total_guides=total_urls)

    for i, url in enumerate(urls, 1):
        guide_name = extract_guide_name(url)
        print(f"{Colors.CYAN}[{i}/{total_urls}]{Colors.NC} Checking: {guide_name}...")

        result = check_single_url(url, timeout, rewrite_rules, ignore_patterns)
        bulk_result.guide_results.append(result)

        if result.passed:
            bulk_result.passed_count += 1
            links_info = f"{result.total_links} links" if result.total_links else "OK"
            rewritten_info = f" [{result.rewritten_valid} rewritten OK]" if result.rewritten_valid > 0 else ""
            print(f" {Colors.GREEN}✓ PASS{Colors.NC} - {links_info}{rewritten_info}")
        else:
            bulk_result.failed_count += 1
            # Paths verified OK at their rewritten location are not counted.
            adjusted = result.total_errors - result.rewritten_valid
            print(f" {Colors.RED}✗ FAIL{Colors.NC} - {adjusted} errors found")
            if result.rewritten_valid > 0:
                print(f" {Colors.BLUE}Note:{Colors.NC} {result.rewritten_valid} path(s) verified OK (not counted)")
            if result.rewritten_not_found > 0:
                print(f" {Colors.RED}Error:{Colors.NC} {result.rewritten_not_found} path(s) NOT FOUND")
            if result.known_issues > 0:
                print(f" {Colors.YELLOW}Note:{Colors.NC} {result.known_issues} known issues (host:port) - safe to ignore")
            if result.timeout_errors > 0:
                print(f" {Colors.RED}TIMEOUT:{Colors.NC} {result.timeout_errors} links exceeded timeout limit")

        # Accumulate totals
        bulk_result.total_known_issues += result.known_issues
        bulk_result.total_timeout_errors += result.timeout_errors
        bulk_result.total_rewritten_valid += result.rewritten_valid
        bulk_result.total_rewritten_not_found += result.rewritten_not_found

    # Generate and save report. Written as UTF-8 explicitly so the result does
    # not depend on the locale's default encoding.
    report = generate_bulk_report(url_list_file, timeout, bulk_result, rewrite_rules)
    report_file.write_text(report, encoding="utf-8")

    # Print final summary
    print()
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print(f"{Colors.CYAN} Results{Colors.NC}")
    print(f"{Colors.CYAN}═══════════════════════════════════════════════════════════════{Colors.NC}")
    print()
    print(f" Total: {total_urls} guides")
    print(f" {Colors.GREEN}Pass: {bulk_result.passed_count}{Colors.NC}")
    print(f" {Colors.RED}Fail: {bulk_result.failed_count}{Colors.NC}")
    if bulk_result.total_rewritten_valid > 0:
        print(f" {Colors.BLUE}Rewritten OK: {bulk_result.total_rewritten_valid} paths verified{Colors.NC}")
    if bulk_result.total_rewritten_not_found > 0:
        print(f" {Colors.RED}Rewritten Missing: {bulk_result.total_rewritten_not_found} paths{Colors.NC}")
    if bulk_result.total_known_issues > 0:
        print(f" {Colors.YELLOW}Known: {bulk_result.total_known_issues} issues (safe to ignore){Colors.NC}")
    if bulk_result.total_timeout_errors > 0:
        print(f" {Colors.RED}Timeout: {bulk_result.total_timeout_errors} links{Colors.NC}")

    success_rate = (bulk_result.passed_count / total_urls * 100) if total_urls > 0 else 0
    print(f" Rate: {success_rate:.1f}%")
    print()
    info(f"Report saved to: {report_file}")

    if bulk_result.total_known_issues > 0:
        warning(f"{bulk_result.total_known_issues} known issues found (host:port errors) - safe to ignore")

    if bulk_result.total_timeout_errors > 0:
        print()
        error("════════════════════════════════════════════════════════════════")
        error(" TIMEOUT LIMIT REACHED")
        error("════════════════════════════════════════════════════════════════")
        error(f"{bulk_result.total_timeout_errors} links exceeded the timeout limit of {timeout} seconds")
        error(f"Consider increasing timeout: --file {url_list_file} --timeout {timeout + 30}")
        error("════════════════════════════════════════════════════════════════")

    print()
    if bulk_result.failed_count == 0:
        success("All documentation links are valid!")
        return 0
    else:
        warning("Some documentation links have issues. Review the report for details.")
        return 1
|
|
966
|
+
|
|
967
|
+
|
|
968
|
+
def main():
    """Command-line entry point: parse arguments, merge configuration, run a mode.

    Settings are resolved as CLI > configuration file > built-in default for
    the timeout and reports directory. Ignore patterns and rewrite rules are
    accumulated from defaults, the configuration file and the CLI. The process
    always terminates through ``sys.exit`` with the selected mode's return
    code, or 1 for usage errors or a missing ``linkchecker``.
    """
    parser = argparse.ArgumentParser(
        description='Published Documentation Link Checker',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Single URL
%(prog)s https://docs.example.com/guide/index.html
%(prog)s https://docs.example.com/guide/index.html --timeout 90

# Bulk validation from file
%(prog)s --file urls-to-check.txt
%(prog)s --file urls-to-check.txt --timeout 90

# With URL rewriting for misresolved paths
%(prog)s https://docs.example.com/product/guide \\
--rewrite-pattern "/docs/en/product/" \\
--rewrite-replacement "/docs/en/PRODUCT_V1.0/"

# With custom ignore patterns
%(prog)s https://docs.example.com/guide/ \\
--ignore-pattern "^https?://internal\\.example\\.com" \\
--ignore-pattern "^https?://staging\\."

# Using a configuration file
%(prog)s https://docs.example.com/guide/ --config linkcheck.conf
"""
    )
    parser.add_argument('url', nargs='?',
                        help='Single URL to check')
    parser.add_argument('--file', '-f', type=Path, dest='url_list',
                        help='File containing URLs to check (one per line)')
    # default=None (not DEFAULT_TIMEOUT) so an explicitly passed value can be
    # told apart from "option not given", even when it equals the default.
    parser.add_argument('--timeout', '-t', type=int, default=None,
                        help=f'Timeout for each link check in seconds (default: {DEFAULT_TIMEOUT})')
    parser.add_argument('--rewrite-pattern', action='append', dest='rewrite_patterns',
                        help='URL pattern to match for rewriting (can be used multiple times)')
    parser.add_argument('--rewrite-replacement', action='append', dest='rewrite_replacements',
                        help='Replacement for matched pattern (must match --rewrite-pattern count)')
    parser.add_argument('--ignore-pattern', action='append', dest='ignore_patterns',
                        help='Regex pattern for URLs to ignore (can be used multiple times)')
    parser.add_argument('--config', '-c', type=Path, dest='config_file',
                        help=f'Configuration file (default: {DEFAULT_CONFIG_FILE} if it exists)')
    parser.add_argument('--reports-dir', type=Path, dest='reports_dir',
                        help=f'Directory for reports (default: {REPORTS_DIR})')

    args = parser.parse_args()

    # Check linkchecker is installed
    if not check_linkchecker_installed():
        error("linkchecker is not installed")
        print()
        print("Install with: pipx install linkchecker")
        sys.exit(1)

    # Load configuration file
    config_path = args.config_file if args.config_file else DEFAULT_CONFIG_FILE
    config = load_config_file(config_path)

    if config_path.exists() and (config.ignore_patterns or config.rewrite_rules or
                                 config.timeout is not None or config.reports_dir is not None):
        info(f"Loaded configuration from {config_path}")

    # Determine timeout: CLI > config file > default
    if args.timeout is not None:
        timeout = args.timeout  # CLI explicitly set
    elif config.timeout is not None:
        timeout = config.timeout  # From config file
    else:
        timeout = DEFAULT_TIMEOUT

    # Determine reports directory: CLI > config file > default
    if args.reports_dir is not None:
        reports_dir = args.reports_dir
    elif config.reports_dir is not None:
        reports_dir = config.reports_dir
    else:
        reports_dir = REPORTS_DIR

    # Build ignore patterns: defaults + config file + CLI
    ignore_patterns = DEFAULT_IGNORE_PATTERNS.copy()
    if config.ignore_patterns:
        ignore_patterns.extend(config.ignore_patterns)
    if args.ignore_patterns:
        ignore_patterns.extend(args.ignore_patterns)

    # Parse rewrite rules: config file + CLI
    rewrite_rules = []
    for pattern, replacement in config.rewrite_rules:
        rewrite_rules.append(RewriteRule(pattern=pattern, replacement=replacement))
    if args.rewrite_patterns:
        # Patterns and replacements are paired positionally, so counts must match.
        if not args.rewrite_replacements or len(args.rewrite_patterns) != len(args.rewrite_replacements):
            error("--rewrite-pattern and --rewrite-replacement must be used in pairs")
            sys.exit(1)
        for pattern, replacement in zip(args.rewrite_patterns, args.rewrite_replacements):
            rewrite_rules.append(RewriteRule(pattern=pattern, replacement=replacement))

    # Determine mode (--file takes precedence over a positional URL)
    if args.url_list:
        # Bulk mode from file
        if not args.url_list.exists():
            error(f"URL list not found: {args.url_list}")
            sys.exit(1)
        sys.exit(run_bulk_mode(args.url_list, timeout, rewrite_rules, ignore_patterns, reports_dir))

    elif args.url:
        # Single URL mode
        sys.exit(run_single_mode(args.url, timeout, rewrite_rules, ignore_patterns, reports_dir))

    else:
        # No arguments - show help
        parser.print_help()
        sys.exit(1)
|
|
1080
|
+
|
|
1081
|
+
|
|
1082
|
+
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|