rolfedh-doc-utils 0.1.4__py3-none-any.whl → 0.1.41__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- archive_unused_files.py +18 -5
- archive_unused_images.py +9 -2
- callout_lib/__init__.py +22 -0
- callout_lib/converter_bullets.py +103 -0
- callout_lib/converter_comments.py +295 -0
- callout_lib/converter_deflist.py +134 -0
- callout_lib/detector.py +364 -0
- callout_lib/table_parser.py +804 -0
- check_published_links.py +1083 -0
- check_scannability.py +6 -0
- check_source_directives.py +101 -0
- convert_callouts_interactive.py +567 -0
- convert_callouts_to_deflist.py +628 -0
- convert_freemarker_to_asciidoc.py +288 -0
- convert_tables_to_deflists.py +479 -0
- doc_utils/convert_freemarker_to_asciidoc.py +708 -0
- doc_utils/duplicate_content.py +409 -0
- doc_utils/duplicate_includes.py +347 -0
- doc_utils/extract_link_attributes.py +618 -0
- doc_utils/format_asciidoc_spacing.py +285 -0
- doc_utils/insert_abstract_role.py +220 -0
- doc_utils/inventory_conditionals.py +164 -0
- doc_utils/missing_source_directive.py +211 -0
- doc_utils/replace_link_attributes.py +187 -0
- doc_utils/spinner.py +119 -0
- doc_utils/unused_adoc.py +150 -22
- doc_utils/unused_attributes.py +218 -6
- doc_utils/unused_images.py +81 -9
- doc_utils/validate_links.py +576 -0
- doc_utils/version.py +8 -0
- doc_utils/version_check.py +243 -0
- doc_utils/warnings_report.py +237 -0
- doc_utils_cli.py +158 -0
- extract_link_attributes.py +120 -0
- find_duplicate_content.py +209 -0
- find_duplicate_includes.py +198 -0
- find_unused_attributes.py +84 -6
- format_asciidoc_spacing.py +134 -0
- insert_abstract_role.py +163 -0
- inventory_conditionals.py +53 -0
- replace_link_attributes.py +214 -0
- rolfedh_doc_utils-0.1.41.dist-info/METADATA +246 -0
- rolfedh_doc_utils-0.1.41.dist-info/RECORD +52 -0
- {rolfedh_doc_utils-0.1.4.dist-info → rolfedh_doc_utils-0.1.41.dist-info}/WHEEL +1 -1
- rolfedh_doc_utils-0.1.41.dist-info/entry_points.txt +20 -0
- rolfedh_doc_utils-0.1.41.dist-info/top_level.txt +21 -0
- validate_links.py +213 -0
- rolfedh_doc_utils-0.1.4.dist-info/METADATA +0 -285
- rolfedh_doc_utils-0.1.4.dist-info/RECORD +0 -17
- rolfedh_doc_utils-0.1.4.dist-info/entry_points.txt +0 -5
- rolfedh_doc_utils-0.1.4.dist-info/top_level.txt +0 -5
- {rolfedh_doc_utils-0.1.4.dist-info → rolfedh_doc_utils-0.1.41.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,576 @@
#!/usr/bin/env python3
"""
Validate links in AsciiDoc documentation, checking for broken URLs and missing references.
"""

import os
import re
import time
import json
import hashlib
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Set
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, urljoin
import urllib.request
import urllib.error
import socket
from datetime import datetime, timedelta


class LinkValidator:
    """Validates links in AsciiDoc documentation."""

    def __init__(self,
                 timeout: int = 10,
                 retry: int = 3,
                 parallel: int = 10,
                 cache_duration: int = 3600,
                 transpositions: List[Tuple[str, str]] = None):
        """
        Initialize the link validator.

        Args:
            timeout: Timeout in seconds for each URL check
            retry: Number of retries for failed URLs
            parallel: Number of parallel URL checks
            cache_duration: Cache duration in seconds
            transpositions: List of (from_url, to_url) tuples for URL replacement
        """
        self.timeout = timeout
        self.retry = retry
        self.parallel = parallel
        self.cache_duration = cache_duration
        self.transpositions = transpositions or []
        self.cache = {}
        self.cache_file = Path.home() / '.cache' / 'doc-utils' / 'link-validation.json'
        self._load_cache()

    def _load_cache(self):
        """Load cached validation results."""
        if self.cache_file.exists():
            try:
                with open(self.cache_file, 'r') as f:
                    cached_data = json.load(f)
                # Check cache expiry
                now = datetime.now().timestamp()
                self.cache = {
                    url: result for url, result in cached_data.items()
                    if now - result.get('timestamp', 0) < self.cache_duration
                }
            except (json.JSONDecodeError, IOError):
                self.cache = {}

    def _save_cache(self):
        """Save validation results to cache."""
        self.cache_file.parent.mkdir(parents=True, exist_ok=True)
        with open(self.cache_file, 'w') as f:
            json.dump(self.cache, f, indent=2)

    def transpose_url(self, url: str) -> str:
        """
        Apply transposition rules to URL.

        Args:
            url: Original URL

        Returns:
            Transposed URL if rules match, otherwise original URL
        """
        for from_pattern, to_pattern in self.transpositions:
            if url.startswith(from_pattern):
                return url.replace(from_pattern, to_pattern, 1)
        return url

    def extract_links(self, file_path: str, attributes: Dict[str, str] = None) -> List[Dict]:
        """
        Extract all links from an AsciiDoc file.

        Args:
            file_path: Path to the AsciiDoc file
            attributes: Dictionary of attribute definitions

        Returns:
            List of link dictionaries with url, text, type, line_number
        """
        links = []
        attributes = attributes or {}

        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                # Find link: macros
                link_matches = re.finditer(r'link:([^[\]]+)\[([^\]]*)\]', line)
                for match in link_matches:
                    url = match.group(1)
                    text = match.group(2)
                    # Resolve attributes in URL
                    resolved_url = self._resolve_attributes(url, attributes)
                    links.append({
                        'url': resolved_url,
                        'original_url': url,
                        'text': text,
                        'type': 'external',
                        'file': file_path,
                        'line': line_num
                    })

                # Find xref: macros
                xref_matches = re.finditer(r'xref:([^[\]]+)\[([^\]]*)\]', line)
                for match in xref_matches:
                    target = match.group(1)
                    text = match.group(2)
                    # Resolve attributes in target
                    resolved_target = self._resolve_attributes(target, attributes)
                    links.append({
                        'url': resolved_target,
                        'original_url': target,
                        'text': text,
                        'type': 'internal',
                        'file': file_path,
                        'line': line_num
                    })

                # Find image:: directives
                image_matches = re.finditer(r'image::([^[\]]+)\[', line)
                for match in image_matches:
                    path = match.group(1)
                    resolved_path = self._resolve_attributes(path, attributes)
                    links.append({
                        'url': resolved_path,
                        'original_url': path,
                        'text': 'image',
                        'type': 'image',
                        'file': file_path,
                        'line': line_num
                    })

        return links

    def _resolve_attributes(self, text: str, attributes: Dict[str, str]) -> str:
        """Resolve attributes in text."""
        resolved = text
        max_iterations = 10

        for _ in range(max_iterations):
            # Find all attribute references
            refs = re.findall(r'\{([^}]+)\}', resolved)
            if not refs:
                break

            changes_made = False
            for ref in refs:
                if ref in attributes:
                    resolved = resolved.replace(f'{{{ref}}}', attributes[ref])
                    changes_made = True

            if not changes_made:
                break

        return resolved

    def validate_url(self, url: str, original_url: str = None, use_cache: bool = True) -> Dict:
        """
        Validate a single URL.

        Args:
            url: URL to validate
            original_url: Original URL before transposition
            use_cache: Whether to use cached results

        Returns:
            Dictionary with validation results
        """
        # Check cache first
        cache_key = f"{url}:{original_url}" if original_url else url
        if use_cache and cache_key in self.cache:
            cached = self.cache[cache_key]
            if datetime.now().timestamp() - cached['timestamp'] < self.cache_duration:
                return cached

        result = {
            'url': url,
            'original_url': original_url or url,
            'status': None,
            'error': None,
            'redirect': None,
            'timestamp': datetime.now().timestamp()
        }

        # Apply transposition if needed
        check_url = self.transpose_url(url)
        if check_url != url:
            result['transposed_url'] = check_url

        # Validate the URL
        for attempt in range(self.retry):
            try:
                req = urllib.request.Request(
                    check_url,
                    headers={
                        'User-Agent': 'Mozilla/5.0 (doc-utils link validator)',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
                    }
                )

                with urllib.request.urlopen(req, timeout=self.timeout) as response:
                    result['status'] = response.status
                    # Check for redirect
                    if response.url != check_url:
                        result['redirect'] = response.url
                    break

            except urllib.error.HTTPError as e:
                result['status'] = e.code
                result['error'] = str(e)
                if e.code not in [500, 502, 503, 504]:  # Don't retry client errors
                    break

            except urllib.error.URLError as e:
                result['error'] = str(e.reason)

            except socket.timeout:
                result['error'] = 'Timeout'

            except Exception as e:
                result['error'] = str(e)

            # Wait before retry
            if attempt < self.retry - 1:
                time.sleep(2 ** attempt)  # Exponential backoff

        # Cache the result
        self.cache[cache_key] = result

        return result

    def validate_internal_reference(self, ref: str, base_dir: str) -> Dict:
        """
        Validate an internal reference (xref).

        Args:
            ref: Reference path
            base_dir: Base directory for relative paths

        Returns:
            Dictionary with validation results
        """
        result = {
            'url': ref,
            'type': 'internal',
            'status': None,
            'error': None
        }

        # Handle anchor references
        if ref.startswith('#'):
            # TODO: Check if anchor exists in current file
            result['status'] = 'anchor'
            return result

        # Parse file and anchor
        parts = ref.split('#', 1)
        file_ref = parts[0]
        anchor = parts[1] if len(parts) > 1 else None

        # Resolve file path
        if os.path.isabs(file_ref):
            file_path = file_ref
        else:
            file_path = os.path.normpath(os.path.join(base_dir, file_ref))

        # Check if file exists
        if os.path.exists(file_path):
            result['status'] = 'ok'
            # TODO: If anchor provided, check if it exists in the file
        else:
            result['status'] = 'missing'
            result['error'] = f"File not found: {file_path}"

        return result

    def validate_image(self, path: str, base_dir: str) -> Dict:
        """
        Validate an image path.

        Args:
            path: Image path
            base_dir: Base directory for relative paths

        Returns:
            Dictionary with validation results
        """
        result = {
            'url': path,
            'type': 'image',
            'status': None,
            'error': None
        }

        # Check if it's a URL
        if path.startswith(('http://', 'https://')):
            return self.validate_url(path)

        # Resolve file path
        if os.path.isabs(path):
            file_path = path
        else:
            file_path = os.path.normpath(os.path.join(base_dir, path))

        # Check if file exists
        if os.path.exists(file_path):
            result['status'] = 'ok'
        else:
            result['status'] = 'missing'
            result['error'] = f"Image not found: {file_path}"

        return result

    def validate_links_in_file(self, file_path: str, attributes: Dict[str, str] = None) -> List[Dict]:
        """
        Validate all links in a single file.

        Args:
            file_path: Path to the AsciiDoc file
            attributes: Dictionary of attribute definitions

        Returns:
            List of validation results
        """
        links = self.extract_links(file_path, attributes)
        results = []
        base_dir = os.path.dirname(file_path)

        # Group links by type for efficient processing
        external_links = [l for l in links if l['type'] == 'external']
        internal_links = [l for l in links if l['type'] == 'internal']
        image_links = [l for l in links if l['type'] == 'image']

        # Validate external links in parallel
        if external_links:
            with ThreadPoolExecutor(max_workers=self.parallel) as executor:
                futures = {
                    executor.submit(self.validate_url, link['url'], link['original_url']): link
                    for link in external_links
                }

                for future in as_completed(futures):
                    link = futures[future]
                    try:
                        result = future.result()
                        result.update(link)
                        results.append(result)
                    except Exception as e:
                        result = link.copy()
                        result['error'] = str(e)
                        results.append(result)

        # Validate internal references
        for link in internal_links:
            result = self.validate_internal_reference(link['url'], base_dir)
            result.update(link)
            results.append(result)

        # Validate image paths
        for link in image_links:
            result = self.validate_image(link['url'], base_dir)
            result.update(link)
            results.append(result)

        return results

    def validate_all(self, scan_dirs: List[str] = None,
                     attributes_file: str = None,
                     exclude_domains: List[str] = None) -> Dict:
        """
        Validate all links in documentation.

        Args:
            scan_dirs: Directories to scan
            attributes_file: Path to attributes file
            exclude_domains: Domains to skip

        Returns:
            Dictionary with all validation results
        """
        if scan_dirs is None:
            scan_dirs = ['.']

        exclude_domains = exclude_domains or []

        # Load attributes
        attributes = {}
        if attributes_file and os.path.exists(attributes_file):
            attributes = self._load_attributes(attributes_file)

        # Collect all .adoc files
        adoc_files = []
        for scan_dir in scan_dirs:
            for root, _, files in os.walk(scan_dir):
                # Skip hidden directories
                if '/.' in root:
                    continue
                for file in files:
                    if file.endswith('.adoc'):
                        adoc_files.append(os.path.join(root, file))

        # Validate links in all files
        all_results = {
            'files': {},
            'summary': {
                'total': 0,
                'valid': 0,
                'broken': 0,
                'warnings': 0,
                'skipped': 0
            },
            'broken_links': [],
            'warnings': [],
            'transpositions': [
                {'from': t[0], 'to': t[1]} for t in self.transpositions
            ]
        }

        for file_path in adoc_files:
            results = self.validate_links_in_file(file_path, attributes)

            # Filter out excluded domains
            filtered_results = []
            for result in results:
                url = result.get('url', '')
                parsed = urlparse(url)
                if parsed.netloc in exclude_domains:
                    result['status'] = 'skipped'
                    result['reason'] = 'Domain excluded'
                filtered_results.append(result)

            all_results['files'][file_path] = filtered_results

            # Update summary
            for result in filtered_results:
                all_results['summary']['total'] += 1

                if result.get('status') == 'skipped':
                    all_results['summary']['skipped'] += 1
                elif result.get('status') in ['ok', 200, 'anchor']:
                    all_results['summary']['valid'] += 1
                elif result.get('status') in [301, 302, 303, 307, 308]:
                    all_results['summary']['warnings'] += 1
                    all_results['warnings'].append(result)
                elif result.get('error') or result.get('status') in ['missing', 404]:
                    all_results['summary']['broken'] += 1
                    all_results['broken_links'].append(result)
                else:
                    # Treat other status codes as broken
                    all_results['summary']['broken'] += 1
                    all_results['broken_links'].append(result)

        # Save cache
        self._save_cache()

        return all_results

    def _load_attributes(self, attributes_file: str) -> Dict[str, str]:
        """Load attributes from file."""
        attributes = {}

        with open(attributes_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Match attribute definitions
                match = re.match(r'^:([^:]+):\s*(.*)$', line)
                if match:
                    attr_name = match.group(1).strip()
                    attr_value = match.group(2).strip()
                    attributes[attr_name] = attr_value

        return attributes


def parse_transpositions(transpose_args: List[str]) -> List[Tuple[str, str]]:
    """
    Parse transposition arguments.

    Args:
        transpose_args: List of transposition strings in format "from--to"

    Returns:
        List of (from_url, to_url) tuples
    """
    transpositions = []

    for arg in transpose_args or []:
        parts = arg.split('--')
        if len(parts) == 2:
            from_url = parts[0].strip()
            to_url = parts[1].strip()
            transpositions.append((from_url, to_url))
        else:
            print(f"Warning: Invalid transposition format: {arg}")
            print("Expected format: from_url--to_url")

    return transpositions


def format_results(results: Dict, verbose: bool = False) -> str:
    """
    Format validation results for display.

    Args:
        results: Validation results dictionary
        verbose: Whether to show verbose output

    Returns:
        Formatted string for display
    """
    output = []

    # Show transpositions if any
    if results.get('transpositions'):
        output.append("URL Transposition Rules:")
        for trans in results['transpositions']:
            output.append(f" {trans['from']} → {trans['to']}")
        output.append("")

    # Summary
    summary = results['summary']
    output.append("SUMMARY:")
    output.append(f"✓ Valid: {summary['valid']} links")
    if summary['broken'] > 0:
        output.append(f"✗ Broken: {summary['broken']} links")
    if summary['warnings'] > 0:
        output.append(f"⚠ Warnings: {summary['warnings']} redirects")
    if summary['skipped'] > 0:
        output.append(f"⊘ Skipped: {summary['skipped']} links (excluded domains)")
    output.append("")

    # Broken links
    if results['broken_links']:
        output.append("BROKEN LINKS:")
        for i, link in enumerate(results['broken_links'], 1):
            output.append(f"\n{i}. {link['file']}:{link['line']}")
            if link.get('original_url') and link.get('original_url') != link.get('url'):
                output.append(f" Original: {link['original_url']}")
                output.append(f" Resolved: {link['url']}")
            else:
                output.append(f" URL: {link['url']}")

            if link.get('transposed_url'):
                output.append(f" Checked: {link['transposed_url']}")

            if link.get('status'):
                output.append(f" Status: {link['status']}")
            if link.get('error'):
                output.append(f" Error: {link['error']}")
            output.append("")

    # Warnings (redirects)
    if results['warnings'] and verbose:
        output.append("WARNINGS (Redirects):")
        for i, link in enumerate(results['warnings'], 1):
            output.append(f"\n{i}. {link['file']}:{link['line']}")
            output.append(f" URL: {link['url']}")
            if link.get('redirect'):
                output.append(f" Redirects to: {link['redirect']}")
            output.append("")

    return '\n'.join(output)
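For orientation only, here is a minimal sketch of how the pieces added in this hunk might be wired together. It assumes the hunk above is doc_utils/validate_links.py (the only entry in the listing with exactly 576 added lines); the directory name, attributes file, transposition rule, and excluded domain below are hypothetical example values, not taken from the package, and the package's own CLI entry point may drive the class differently.

# Minimal usage sketch; all literal values are hypothetical examples.
from doc_utils.validate_links import LinkValidator, parse_transpositions, format_results

# Each "from--to" rule rewrites matching URLs before they are checked.
transpositions = parse_transpositions([
    "https://docs.example.com/staging--https://docs.example.com/latest",
])

validator = LinkValidator(timeout=10, retry=3, parallel=10,
                          transpositions=transpositions)
report = validator.validate_all(
    scan_dirs=["modules"],              # hypothetical content directory
    attributes_file="attributes.adoc",  # hypothetical attributes file
    exclude_domains=["localhost"],      # skip links to this host
)
print(format_results(report, verbose=True))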