rolfedh-doc-utils 0.1.4__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. archive_unused_files.py +18 -5
  2. archive_unused_images.py +9 -2
  3. callout_lib/__init__.py +22 -0
  4. callout_lib/converter_bullets.py +103 -0
  5. callout_lib/converter_comments.py +295 -0
  6. callout_lib/converter_deflist.py +134 -0
  7. callout_lib/detector.py +364 -0
  8. callout_lib/table_parser.py +804 -0
  9. check_published_links.py +1083 -0
  10. check_scannability.py +6 -0
  11. check_source_directives.py +101 -0
  12. convert_callouts_interactive.py +567 -0
  13. convert_callouts_to_deflist.py +628 -0
  14. convert_freemarker_to_asciidoc.py +288 -0
  15. convert_tables_to_deflists.py +479 -0
  16. doc_utils/convert_freemarker_to_asciidoc.py +708 -0
  17. doc_utils/duplicate_content.py +409 -0
  18. doc_utils/duplicate_includes.py +347 -0
  19. doc_utils/extract_link_attributes.py +618 -0
  20. doc_utils/format_asciidoc_spacing.py +285 -0
  21. doc_utils/insert_abstract_role.py +220 -0
  22. doc_utils/inventory_conditionals.py +164 -0
  23. doc_utils/missing_source_directive.py +211 -0
  24. doc_utils/replace_link_attributes.py +187 -0
  25. doc_utils/spinner.py +119 -0
  26. doc_utils/unused_adoc.py +150 -22
  27. doc_utils/unused_attributes.py +218 -6
  28. doc_utils/unused_images.py +81 -9
  29. doc_utils/validate_links.py +576 -0
  30. doc_utils/version.py +8 -0
  31. doc_utils/version_check.py +243 -0
  32. doc_utils/warnings_report.py +237 -0
  33. doc_utils_cli.py +158 -0
  34. extract_link_attributes.py +120 -0
  35. find_duplicate_content.py +209 -0
  36. find_duplicate_includes.py +198 -0
  37. find_unused_attributes.py +84 -6
  38. format_asciidoc_spacing.py +134 -0
  39. insert_abstract_role.py +163 -0
  40. inventory_conditionals.py +53 -0
  41. replace_link_attributes.py +214 -0
  42. rolfedh_doc_utils-0.1.41.dist-info/METADATA +246 -0
  43. rolfedh_doc_utils-0.1.41.dist-info/RECORD +52 -0
  44. {rolfedh_doc_utils-0.1.4.dist-info → rolfedh_doc_utils-0.1.41.dist-info}/WHEEL +1 -1
  45. rolfedh_doc_utils-0.1.41.dist-info/entry_points.txt +20 -0
  46. rolfedh_doc_utils-0.1.41.dist-info/top_level.txt +21 -0
  47. validate_links.py +213 -0
  48. rolfedh_doc_utils-0.1.4.dist-info/METADATA +0 -285
  49. rolfedh_doc_utils-0.1.4.dist-info/RECORD +0 -17
  50. rolfedh_doc_utils-0.1.4.dist-info/entry_points.txt +0 -5
  51. rolfedh_doc_utils-0.1.4.dist-info/top_level.txt +0 -5
  52. {rolfedh_doc_utils-0.1.4.dist-info → rolfedh_doc_utils-0.1.41.dist-info}/licenses/LICENSE +0 -0
doc_utils/validate_links.py ADDED
@@ -0,0 +1,576 @@
+#!/usr/bin/env python3
+"""
+Validate links in AsciiDoc documentation, checking for broken URLs and missing references.
+"""
+
+import os
+import re
+import time
+import json
+import hashlib
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional, Set
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from urllib.parse import urlparse, urljoin
+import urllib.request
+import urllib.error
+import socket
+from datetime import datetime, timedelta
+
+
+class LinkValidator:
+    """Validates links in AsciiDoc documentation."""
+
+    def __init__(self,
+                 timeout: int = 10,
+                 retry: int = 3,
+                 parallel: int = 10,
+                 cache_duration: int = 3600,
+                 transpositions: List[Tuple[str, str]] = None):
+        """
+        Initialize the link validator.
+
+        Args:
+            timeout: Timeout in seconds for each URL check
+            retry: Number of retries for failed URLs
+            parallel: Number of parallel URL checks
+            cache_duration: Cache duration in seconds
+            transpositions: List of (from_url, to_url) tuples for URL replacement
+        """
+        self.timeout = timeout
+        self.retry = retry
+        self.parallel = parallel
+        self.cache_duration = cache_duration
+        self.transpositions = transpositions or []
+        self.cache = {}
+        self.cache_file = Path.home() / '.cache' / 'doc-utils' / 'link-validation.json'
+        self._load_cache()
+
+    def _load_cache(self):
+        """Load cached validation results."""
+        if self.cache_file.exists():
+            try:
+                with open(self.cache_file, 'r') as f:
+                    cached_data = json.load(f)
+                # Check cache expiry
+                now = datetime.now().timestamp()
+                self.cache = {
+                    url: result for url, result in cached_data.items()
+                    if now - result.get('timestamp', 0) < self.cache_duration
+                }
+            except (json.JSONDecodeError, IOError):
+                self.cache = {}
+
+    def _save_cache(self):
+        """Save validation results to cache."""
+        self.cache_file.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.cache_file, 'w') as f:
+            json.dump(self.cache, f, indent=2)
+
+    def transpose_url(self, url: str) -> str:
+        """
+        Apply transposition rules to URL.
+
+        Args:
+            url: Original URL
+
+        Returns:
+            Transposed URL if rules match, otherwise original URL
+        """
+        for from_pattern, to_pattern in self.transpositions:
+            if url.startswith(from_pattern):
+                return url.replace(from_pattern, to_pattern, 1)
+        return url
+
+    def extract_links(self, file_path: str, attributes: Dict[str, str] = None) -> List[Dict]:
+        """
+        Extract all links from an AsciiDoc file.
+
+        Args:
+            file_path: Path to the AsciiDoc file
+            attributes: Dictionary of attribute definitions
+
+        Returns:
+            List of link dictionaries with url, text, type, line_number
+        """
+        links = []
+        attributes = attributes or {}
+
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                # Find link: macros
+                link_matches = re.finditer(r'link:([^[\]]+)\[([^\]]*)\]', line)
+                for match in link_matches:
+                    url = match.group(1)
+                    text = match.group(2)
+                    # Resolve attributes in URL
+                    resolved_url = self._resolve_attributes(url, attributes)
+                    links.append({
+                        'url': resolved_url,
+                        'original_url': url,
+                        'text': text,
+                        'type': 'external',
+                        'file': file_path,
+                        'line': line_num
+                    })
+
+                # Find xref: macros
+                xref_matches = re.finditer(r'xref:([^[\]]+)\[([^\]]*)\]', line)
+                for match in xref_matches:
+                    target = match.group(1)
+                    text = match.group(2)
+                    # Resolve attributes in target
+                    resolved_target = self._resolve_attributes(target, attributes)
+                    links.append({
+                        'url': resolved_target,
+                        'original_url': target,
+                        'text': text,
+                        'type': 'internal',
+                        'file': file_path,
+                        'line': line_num
+                    })
+
+                # Find image:: directives
+                image_matches = re.finditer(r'image::([^[\]]+)\[', line)
+                for match in image_matches:
+                    path = match.group(1)
+                    resolved_path = self._resolve_attributes(path, attributes)
+                    links.append({
+                        'url': resolved_path,
+                        'original_url': path,
+                        'text': 'image',
+                        'type': 'image',
+                        'file': file_path,
+                        'line': line_num
+                    })
+
+        return links
+
+    def _resolve_attributes(self, text: str, attributes: Dict[str, str]) -> str:
+        """Resolve attributes in text."""
+        resolved = text
+        max_iterations = 10
+
+        for _ in range(max_iterations):
+            # Find all attribute references
+            refs = re.findall(r'\{([^}]+)\}', resolved)
+            if not refs:
+                break
+
+            changes_made = False
+            for ref in refs:
+                if ref in attributes:
+                    resolved = resolved.replace(f'{{{ref}}}', attributes[ref])
+                    changes_made = True
+
+            if not changes_made:
+                break
+
+        return resolved
+
+    def validate_url(self, url: str, original_url: str = None, use_cache: bool = True) -> Dict:
+        """
+        Validate a single URL.
+
+        Args:
+            url: URL to validate
+            original_url: Original URL before transposition
+            use_cache: Whether to use cached results
+
+        Returns:
+            Dictionary with validation results
+        """
+        # Check cache first
+        cache_key = f"{url}:{original_url}" if original_url else url
+        if use_cache and cache_key in self.cache:
+            cached = self.cache[cache_key]
+            if datetime.now().timestamp() - cached['timestamp'] < self.cache_duration:
+                return cached
+
+        result = {
+            'url': url,
+            'original_url': original_url or url,
+            'status': None,
+            'error': None,
+            'redirect': None,
+            'timestamp': datetime.now().timestamp()
+        }
+
+        # Apply transposition if needed
+        check_url = self.transpose_url(url)
+        if check_url != url:
+            result['transposed_url'] = check_url
+
+        # Validate the URL
+        for attempt in range(self.retry):
+            try:
+                req = urllib.request.Request(
+                    check_url,
+                    headers={
+                        'User-Agent': 'Mozilla/5.0 (doc-utils link validator)',
+                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+                    }
+                )
+
+                with urllib.request.urlopen(req, timeout=self.timeout) as response:
+                    result['status'] = response.status
+                    # Check for redirect
+                    if response.url != check_url:
+                        result['redirect'] = response.url
+                break
+
+            except urllib.error.HTTPError as e:
+                result['status'] = e.code
+                result['error'] = str(e)
+                if e.code not in [500, 502, 503, 504]:  # Don't retry client errors
+                    break
+
+            except urllib.error.URLError as e:
+                result['error'] = str(e.reason)
+
+            except socket.timeout:
+                result['error'] = 'Timeout'
+
+            except Exception as e:
+                result['error'] = str(e)
+
+            # Wait before retry
+            if attempt < self.retry - 1:
+                time.sleep(2 ** attempt)  # Exponential backoff
+
+        # Cache the result
+        self.cache[cache_key] = result
+
+        return result
+
+    def validate_internal_reference(self, ref: str, base_dir: str) -> Dict:
+        """
+        Validate an internal reference (xref).
+
+        Args:
+            ref: Reference path
+            base_dir: Base directory for relative paths
+
+        Returns:
+            Dictionary with validation results
+        """
+        result = {
+            'url': ref,
+            'type': 'internal',
+            'status': None,
+            'error': None
+        }
+
+        # Handle anchor references
+        if ref.startswith('#'):
+            # TODO: Check if anchor exists in current file
+            result['status'] = 'anchor'
+            return result
+
+        # Parse file and anchor
+        parts = ref.split('#', 1)
+        file_ref = parts[0]
+        anchor = parts[1] if len(parts) > 1 else None
+
+        # Resolve file path
+        if os.path.isabs(file_ref):
+            file_path = file_ref
+        else:
+            file_path = os.path.normpath(os.path.join(base_dir, file_ref))
+
+        # Check if file exists
+        if os.path.exists(file_path):
+            result['status'] = 'ok'
+            # TODO: If anchor provided, check if it exists in the file
+        else:
+            result['status'] = 'missing'
+            result['error'] = f"File not found: {file_path}"
+
+        return result
+
+    def validate_image(self, path: str, base_dir: str) -> Dict:
+        """
+        Validate an image path.
+
+        Args:
+            path: Image path
+            base_dir: Base directory for relative paths
+
+        Returns:
+            Dictionary with validation results
+        """
+        result = {
+            'url': path,
+            'type': 'image',
+            'status': None,
+            'error': None
+        }
+
+        # Check if it's a URL
+        if path.startswith(('http://', 'https://')):
+            return self.validate_url(path)
+
+        # Resolve file path
+        if os.path.isabs(path):
+            file_path = path
+        else:
+            file_path = os.path.normpath(os.path.join(base_dir, path))
+
+        # Check if file exists
+        if os.path.exists(file_path):
+            result['status'] = 'ok'
+        else:
+            result['status'] = 'missing'
+            result['error'] = f"Image not found: {file_path}"
+
+        return result
+
+    def validate_links_in_file(self, file_path: str, attributes: Dict[str, str] = None) -> List[Dict]:
+        """
+        Validate all links in a single file.
+
+        Args:
+            file_path: Path to the AsciiDoc file
+            attributes: Dictionary of attribute definitions
+
+        Returns:
+            List of validation results
+        """
+        links = self.extract_links(file_path, attributes)
+        results = []
+        base_dir = os.path.dirname(file_path)
+
+        # Group links by type for efficient processing
+        external_links = [l for l in links if l['type'] == 'external']
+        internal_links = [l for l in links if l['type'] == 'internal']
+        image_links = [l for l in links if l['type'] == 'image']
+
+        # Validate external links in parallel
+        if external_links:
+            with ThreadPoolExecutor(max_workers=self.parallel) as executor:
+                futures = {
+                    executor.submit(self.validate_url, link['url'], link['original_url']): link
+                    for link in external_links
+                }
+
+                for future in as_completed(futures):
+                    link = futures[future]
+                    try:
+                        result = future.result()
+                        result.update(link)
+                        results.append(result)
+                    except Exception as e:
+                        result = link.copy()
+                        result['error'] = str(e)
+                        results.append(result)
+
+        # Validate internal references
+        for link in internal_links:
+            result = self.validate_internal_reference(link['url'], base_dir)
+            result.update(link)
+            results.append(result)
+
+        # Validate image paths
+        for link in image_links:
+            result = self.validate_image(link['url'], base_dir)
+            result.update(link)
+            results.append(result)
+
+        return results
+
+    def validate_all(self, scan_dirs: List[str] = None,
+                     attributes_file: str = None,
+                     exclude_domains: List[str] = None) -> Dict:
+        """
+        Validate all links in documentation.
+
+        Args:
+            scan_dirs: Directories to scan
+            attributes_file: Path to attributes file
+            exclude_domains: Domains to skip
+
+        Returns:
+            Dictionary with all validation results
+        """
+        if scan_dirs is None:
+            scan_dirs = ['.']
+
+        exclude_domains = exclude_domains or []
+
+        # Load attributes
+        attributes = {}
+        if attributes_file and os.path.exists(attributes_file):
+            attributes = self._load_attributes(attributes_file)
+
+        # Collect all .adoc files
+        adoc_files = []
+        for scan_dir in scan_dirs:
+            for root, _, files in os.walk(scan_dir):
+                # Skip hidden directories
+                if '/.' in root:
+                    continue
+                for file in files:
+                    if file.endswith('.adoc'):
+                        adoc_files.append(os.path.join(root, file))
+
+        # Validate links in all files
+        all_results = {
+            'files': {},
+            'summary': {
+                'total': 0,
+                'valid': 0,
+                'broken': 0,
+                'warnings': 0,
+                'skipped': 0
+            },
+            'broken_links': [],
+            'warnings': [],
+            'transpositions': [
+                {'from': t[0], 'to': t[1]} for t in self.transpositions
+            ]
+        }
+
+        for file_path in adoc_files:
+            results = self.validate_links_in_file(file_path, attributes)
+
+            # Filter out excluded domains
+            filtered_results = []
+            for result in results:
+                url = result.get('url', '')
+                parsed = urlparse(url)
+                if parsed.netloc in exclude_domains:
+                    result['status'] = 'skipped'
+                    result['reason'] = 'Domain excluded'
+                filtered_results.append(result)
+
+            all_results['files'][file_path] = filtered_results
+
+            # Update summary
+            for result in filtered_results:
+                all_results['summary']['total'] += 1
+
+                if result.get('status') == 'skipped':
+                    all_results['summary']['skipped'] += 1
+                elif result.get('status') in ['ok', 200, 'anchor']:
+                    all_results['summary']['valid'] += 1
+                elif result.get('status') in [301, 302, 303, 307, 308]:
+                    all_results['summary']['warnings'] += 1
+                    all_results['warnings'].append(result)
+                elif result.get('error') or result.get('status') in ['missing', 404]:
+                    all_results['summary']['broken'] += 1
+                    all_results['broken_links'].append(result)
+                else:
+                    # Treat other status codes as broken
+                    all_results['summary']['broken'] += 1
+                    all_results['broken_links'].append(result)
+
+        # Save cache
+        self._save_cache()
+
+        return all_results
+
+    def _load_attributes(self, attributes_file: str) -> Dict[str, str]:
+        """Load attributes from file."""
+        attributes = {}
+
+        with open(attributes_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                # Match attribute definitions
+                match = re.match(r'^:([^:]+):\s*(.*)$', line)
+                if match:
+                    attr_name = match.group(1).strip()
+                    attr_value = match.group(2).strip()
+                    attributes[attr_name] = attr_value
+
+        return attributes
+
+
+def parse_transpositions(transpose_args: List[str]) -> List[Tuple[str, str]]:
+    """
+    Parse transposition arguments.
+
+    Args:
+        transpose_args: List of transposition strings in format "from--to"
+
+    Returns:
+        List of (from_url, to_url) tuples
+    """
+    transpositions = []
+
+    for arg in transpose_args or []:
+        parts = arg.split('--')
+        if len(parts) == 2:
+            from_url = parts[0].strip()
+            to_url = parts[1].strip()
+            transpositions.append((from_url, to_url))
+        else:
+            print(f"Warning: Invalid transposition format: {arg}")
+            print("Expected format: from_url--to_url")
+
+    return transpositions


+def format_results(results: Dict, verbose: bool = False) -> str:
+    """
+    Format validation results for display.
+
+    Args:
+        results: Validation results dictionary
+        verbose: Whether to show verbose output
+
+    Returns:
+        Formatted string for display
+    """
+    output = []
+
+    # Show transpositions if any
+    if results.get('transpositions'):
+        output.append("URL Transposition Rules:")
+        for trans in results['transpositions']:
+            output.append(f" {trans['from']} → {trans['to']}")
+        output.append("")
+
+    # Summary
+    summary = results['summary']
+    output.append("SUMMARY:")
+    output.append(f"✓ Valid: {summary['valid']} links")
+    if summary['broken'] > 0:
+        output.append(f"✗ Broken: {summary['broken']} links")
+    if summary['warnings'] > 0:
+        output.append(f"⚠ Warnings: {summary['warnings']} redirects")
+    if summary['skipped'] > 0:
+        output.append(f"⊘ Skipped: {summary['skipped']} links (excluded domains)")
+    output.append("")
+
+    # Broken links
+    if results['broken_links']:
+        output.append("BROKEN LINKS:")
+        for i, link in enumerate(results['broken_links'], 1):
+            output.append(f"\n{i}. {link['file']}:{link['line']}")
+            if link.get('original_url') and link.get('original_url') != link.get('url'):
+                output.append(f" Original: {link['original_url']}")
+                output.append(f" Resolved: {link['url']}")
+            else:
+                output.append(f" URL: {link['url']}")
+
+            if link.get('transposed_url'):
+                output.append(f" Checked: {link['transposed_url']}")
+
+            if link.get('status'):
+                output.append(f" Status: {link['status']}")
+            if link.get('error'):
+                output.append(f" Error: {link['error']}")
+        output.append("")
+
+    # Warnings (redirects)
+    if results['warnings'] and verbose:
+        output.append("WARNINGS (Redirects):")
+        for i, link in enumerate(results['warnings'], 1):
+            output.append(f"\n{i}. {link['file']}:{link['line']}")
+            output.append(f" URL: {link['url']}")
+            if link.get('redirect'):
+                output.append(f" Redirects to: {link['redirect']}")
+        output.append("")
+
+    return '\n'.join(output)
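The module above is self-contained, so a minimal sketch of driving it directly from Python might look like the following. This is illustrative only: the packaged CLI wiring (doc_utils_cli.py and the entry_points.txt additions) is not shown in this diff, and the directory, attribute-file, and domain names below are placeholders, not part of the package.

    # Illustrative sketch: exercises the LinkValidator API added above.
    # "modules", "attributes.adoc", and "localhost" are placeholder values.
    from doc_utils.validate_links import LinkValidator, parse_transpositions, format_results

    transpositions = parse_transpositions(["http://example.com--https://example.com"])
    validator = LinkValidator(timeout=10, retry=3, parallel=10, transpositions=transpositions)

    results = validator.validate_all(
        scan_dirs=["modules"],              # directories scanned recursively for .adoc files
        attributes_file="attributes.adoc",  # optional AsciiDoc attribute definitions
        exclude_domains=["localhost"],      # links on these domains are reported as skipped
    )

    print(format_results(results, verbose=True))

Broken links collect in results['broken_links'] and the per-status counts in results['summary'], which is what format_results() renders; cached URL checks persist under ~/.cache/doc-utils/link-validation.json between runs.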
doc_utils/version.py ADDED
@@ -0,0 +1,8 @@
+"""Version information for doc-utils."""
+
+# This should match the version in pyproject.toml
+__version__ = "0.1.41"
+
+def get_version():
+    """Return the current version string."""
+    return __version__
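Assuming the doc_utils package imports normally once the 0.1.41 wheel is installed, the new helper can be consumed with a sketch like this:

    # Minimal sketch: report the installed doc-utils version.
    from doc_utils.version import __version__, get_version

    print(get_version())  # "0.1.41"
    print(__version__)    # same string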