rolfedh-doc-utils 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff shows the content of publicly released versions of this package, as they appear in their public registry, and is provided for informational purposes only.
- convert_freemarker_to_asciidoc.py +288 -0
- doc_utils/convert_freemarker_to_asciidoc.py +708 -0
- doc_utils/duplicate_content.py +409 -0
- doc_utils/duplicate_includes.py +347 -0
- doc_utils/inventory_conditionals.py +164 -0
- doc_utils/unused_attributes.py +48 -0
- doc_utils/version.py +1 -1
- find_duplicate_content.py +209 -0
- find_duplicate_includes.py +198 -0
- find_unused_attributes.py +16 -1
- inventory_conditionals.py +53 -0
- {rolfedh_doc_utils-0.1.38.dist-info → rolfedh_doc_utils-0.1.40.dist-info}/METADATA +2 -1
- {rolfedh_doc_utils-0.1.38.dist-info → rolfedh_doc_utils-0.1.40.dist-info}/RECORD +17 -9
- {rolfedh_doc_utils-0.1.38.dist-info → rolfedh_doc_utils-0.1.40.dist-info}/WHEEL +1 -1
- {rolfedh_doc_utils-0.1.38.dist-info → rolfedh_doc_utils-0.1.40.dist-info}/entry_points.txt +4 -0
- {rolfedh_doc_utils-0.1.38.dist-info → rolfedh_doc_utils-0.1.40.dist-info}/top_level.txt +4 -0
- {rolfedh_doc_utils-0.1.38.dist-info → rolfedh_doc_utils-0.1.40.dist-info}/licenses/LICENSE +0 -0
doc_utils/duplicate_content.py (new file)
@@ -0,0 +1,409 @@
"""
Module for finding duplicate and similar content in AsciiDoc files.

This module detects:
- Recurring notes (NOTE, TIP, WARNING, IMPORTANT, CAUTION)
- Tables
- Step sequences (ordered lists)
- Code blocks
- Any other repeated content elements

Functions:
- extract_content_blocks: Extract content blocks from an AsciiDoc file
- find_duplicates: Find duplicate and similar content across files
- calculate_similarity: Calculate text similarity between two strings
"""

import os
import re
import hashlib
from pathlib import Path
from typing import List, Dict, Set, Tuple, Optional
from dataclasses import dataclass, field
from collections import defaultdict


@dataclass
class ContentBlock:
    """Represents a content block extracted from an AsciiDoc file."""
    block_type: str  # 'note', 'table', 'steps', 'code', 'paragraph'
    content: str
    file_path: str
    line_number: int
    content_hash: str = field(default="", init=False)

    def __post_init__(self):
        # Normalize content for comparison (strip whitespace, lowercase)
        normalized = ' '.join(self.content.split()).lower()
        self.content_hash = hashlib.md5(normalized.encode()).hexdigest()
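For orientation, a minimal sketch (illustration only, not part of the packaged file) of how the normalization in __post_init__ above makes whitespace- and case-variants hash identically; it assumes the module imports as doc_utils.duplicate_content:

    from doc_utils.duplicate_content import ContentBlock

    # Whitespace and case are collapsed before hashing, so these two NOTE texts
    # end up with the same content_hash even though the raw strings differ.
    a = ContentBlock("note", "Restart the  server\nafter the change.", "a.adoc", 3)
    b = ContentBlock("note", "restart the server after the change.", "b.adoc", 9)
    assert a.content_hash == b.content_hash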
@dataclass
class DuplicateGroup:
    """Represents a group of duplicate or similar content blocks."""
    block_type: str
    blocks: List[ContentBlock]
    similarity: float  # 1.0 for exact, < 1.0 for similar
    canonical_content: str  # Representative content for this group


def find_adoc_files(root_dir: str, exclude_dirs: List[str] = None) -> List[str]:
    """Recursively find all .adoc files in a directory (ignoring symlinks)."""
    if exclude_dirs is None:
        exclude_dirs = ['.git', '.archive', 'target', 'build', 'node_modules']

    adoc_files = []
    for dirpath, dirnames, filenames in os.walk(root_dir, followlinks=False):
        # Remove excluded directories from dirnames to prevent descending into them
        dirnames[:] = [d for d in dirnames if d not in exclude_dirs and not d.startswith('.')]

        for fname in filenames:
            if fname.endswith('.adoc'):
                full_path = os.path.join(dirpath, fname)
                if not os.path.islink(full_path):
                    adoc_files.append(full_path)
    return adoc_files


def extract_content_blocks(file_path: str) -> List[ContentBlock]:
    """
    Extract content blocks from an AsciiDoc file.

    Identifies:
    - Admonition blocks (NOTE, TIP, WARNING, IMPORTANT, CAUTION)
    - Tables (|=== blocks)
    - Ordered lists (step sequences)
    - Code blocks (---- or .... blocks)
    - Delimited blocks ([NOTE], [TIP], etc.)
    """
    blocks = []

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except (UnicodeDecodeError, PermissionError):
        return blocks

    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Check for inline admonitions (NOTE:, TIP:, etc.)
        admonition_match = re.match(r'^(NOTE|TIP|WARNING|IMPORTANT|CAUTION):\s*(.+)$', stripped)
        if admonition_match:
            block_type = admonition_match.group(1).lower()
            content = admonition_match.group(2)
            blocks.append(ContentBlock(
                block_type=block_type,
                content=content,
                file_path=file_path,
                line_number=i + 1
            ))
            i += 1
            continue

        # Check for delimited admonition blocks [NOTE], [TIP], etc.
        delimited_match = re.match(r'^\[(NOTE|TIP|WARNING|IMPORTANT|CAUTION)\]$', stripped)
        if delimited_match:
            block_type = delimited_match.group(1).lower()
            # Read until we hit a delimiter or empty line pattern
            content_lines = []
            i += 1
            # Check for delimiter on next line
            if i < len(lines) and lines[i].strip().startswith('===='):
                delimiter = lines[i].strip()
                i += 1
                while i < len(lines) and lines[i].strip() != delimiter:
                    content_lines.append(lines[i])
                    i += 1
                i += 1  # Skip closing delimiter
            else:
                # No delimiter, read paragraph
                while i < len(lines) and lines[i].strip():
                    content_lines.append(lines[i])
                    i += 1

            if content_lines:
                blocks.append(ContentBlock(
                    block_type=block_type,
                    content=''.join(content_lines).strip(),
                    file_path=file_path,
                    line_number=i - len(content_lines)
                ))
            continue

        # Check for tables (|===)
        if stripped.startswith('|==='):
            content_lines = [line]
            start_line = i + 1
            i += 1
            while i < len(lines) and not lines[i].strip().startswith('|==='):
                content_lines.append(lines[i])
                i += 1
            if i < len(lines):
                content_lines.append(lines[i])
                i += 1

            blocks.append(ContentBlock(
                block_type='table',
                content=''.join(content_lines).strip(),
                file_path=file_path,
                line_number=start_line
            ))
            continue

        # Check for code blocks (---- or ....)
        if stripped.startswith('----') or stripped.startswith('....'):
            delimiter = stripped[:4]
            content_lines = [line]
            start_line = i + 1
            i += 1
            while i < len(lines) and not lines[i].strip().startswith(delimiter):
                content_lines.append(lines[i])
                i += 1
            if i < len(lines):
                content_lines.append(lines[i])
                i += 1

            blocks.append(ContentBlock(
                block_type='code',
                content=''.join(content_lines).strip(),
                file_path=file_path,
                line_number=start_line
            ))
            continue

        # Check for ordered lists (step sequences) - consecutive numbered items
        if re.match(r'^\d+\.\s+', stripped) or stripped.startswith('. '):
            content_lines = [line]
            start_line = i + 1
            i += 1
            while i < len(lines):
                next_line = lines[i].strip()
                # Continue if numbered item, continuation (+), or indented content
                if (re.match(r'^\d+\.\s+', next_line) or
                        next_line.startswith('. ') or
                        next_line == '+' or
                        (next_line and lines[i].startswith(' '))):
                    content_lines.append(lines[i])
                    i += 1
                elif not next_line:
                    # Empty line might be part of list if followed by more items
                    if i + 1 < len(lines) and (re.match(r'^\d+\.\s+', lines[i+1].strip()) or
                                               lines[i+1].strip().startswith('. ')):
                        content_lines.append(lines[i])
                        i += 1
                    else:
                        break
                else:
                    break

            # Only record if we have multiple steps
            if len([l for l in content_lines if re.match(r'^\d+\.\s+', l.strip()) or l.strip().startswith('. ')]) >= 2:
                blocks.append(ContentBlock(
                    block_type='steps',
                    content=''.join(content_lines).strip(),
                    file_path=file_path,
                    line_number=start_line
                ))
            continue

        i += 1

    return blocks
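A usage sketch for extract_content_blocks above (again illustration only, not part of the diff); the sample file name and contents are invented:

    import os
    import tempfile

    from doc_utils.duplicate_content import extract_content_blocks

    sample = "NOTE: Back up your data first.\n\n|===\n|Key |Value\n|timeout |30\n|===\n"
    with tempfile.NamedTemporaryFile("w", suffix=".adoc", delete=False) as f:
        f.write(sample)

    for block in extract_content_blocks(f.name):
        print(block.block_type, block.line_number, repr(block.content))
    # Expected: a 'note' block from line 1 and a 'table' block from line 3.

    os.unlink(f.name)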
def calculate_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two text strings.

    Uses a simple word-based Jaccard similarity for efficiency.
    Returns a value between 0.0 (completely different) and 1.0 (identical).
    """
    # Normalize texts
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())

    if not words1 and not words2:
        return 1.0
    if not words1 or not words2:
        return 0.0

    intersection = len(words1 & words2)
    union = len(words1 | words2)

    return intersection / union if union > 0 else 0.0
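A quick worked example of the word-level Jaccard similarity implemented above (not part of the packaged file):

    from doc_utils.duplicate_content import calculate_similarity

    # 3 shared words ({"restart", "the", "server"}) over a 4-word union -> 0.75
    print(calculate_similarity("restart the server", "restart the web server"))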
def find_duplicates(
    root_dir: str,
    min_similarity: float = 0.8,
    min_content_length: int = 50,
    exclude_dirs: List[str] = None,
    block_types: List[str] = None
) -> List[DuplicateGroup]:
    """
    Find duplicate and similar content blocks across AsciiDoc files.

    Args:
        root_dir: Directory to scan
        min_similarity: Minimum similarity threshold (0.0-1.0)
        min_content_length: Minimum content length to consider
        exclude_dirs: Directories to exclude from scanning
        block_types: Types of blocks to search for (None = all types)

    Returns:
        List of DuplicateGroup objects containing groups of similar content
    """
    adoc_files = find_adoc_files(root_dir, exclude_dirs)

    # Collect all content blocks
    all_blocks: List[ContentBlock] = []
    for file_path in adoc_files:
        blocks = extract_content_blocks(file_path)
        for block in blocks:
            if len(block.content) >= min_content_length:
                if block_types is None or block.block_type in block_types:
                    all_blocks.append(block)

    # Group by exact hash first (exact duplicates)
    hash_groups: Dict[str, List[ContentBlock]] = defaultdict(list)
    for block in all_blocks:
        hash_groups[block.content_hash].append(block)

    # Find exact duplicates
    duplicate_groups: List[DuplicateGroup] = []
    processed_hashes: Set[str] = set()

    for content_hash, blocks in hash_groups.items():
        if len(blocks) > 1:
            processed_hashes.add(content_hash)
            duplicate_groups.append(DuplicateGroup(
                block_type=blocks[0].block_type,
                blocks=blocks,
                similarity=1.0,
                canonical_content=blocks[0].content
            ))

    # Find similar (but not exact) duplicates
    if min_similarity < 1.0:
        # Get blocks that weren't exact duplicates
        remaining_blocks = [b for b in all_blocks if b.content_hash not in processed_hashes]

        # Compare remaining blocks for similarity
        used_indices: Set[int] = set()

        for i, block1 in enumerate(remaining_blocks):
            if i in used_indices:
                continue

            similar_blocks = [block1]
            used_indices.add(i)

            for j, block2 in enumerate(remaining_blocks[i+1:], start=i+1):
                if j in used_indices:
                    continue

                # Only compare blocks of the same type
                if block1.block_type != block2.block_type:
                    continue

                similarity = calculate_similarity(block1.content, block2.content)
                if similarity >= min_similarity:
                    similar_blocks.append(block2)
                    used_indices.add(j)

            if len(similar_blocks) > 1:
                # Calculate average similarity within the group
                total_sim = 0
                count = 0
                for k, b1 in enumerate(similar_blocks):
                    for b2 in similar_blocks[k+1:]:
                        total_sim += calculate_similarity(b1.content, b2.content)
                        count += 1
                avg_similarity = total_sim / count if count > 0 else 1.0

                duplicate_groups.append(DuplicateGroup(
                    block_type=similar_blocks[0].block_type,
                    blocks=similar_blocks,
                    similarity=avg_similarity,
                    canonical_content=similar_blocks[0].content
                ))

    # Sort by number of duplicates (most duplicates first)
    duplicate_groups.sort(key=lambda g: len(g.blocks), reverse=True)

    return duplicate_groups
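A minimal sketch of calling find_duplicates directly (not part of the packaged file); the docs path and thresholds are placeholders:

    from doc_utils.duplicate_content import find_duplicates

    # Scan the ./docs tree for notes and tables that are at least 80% similar
    # and at least 50 characters long (these match the function defaults).
    groups = find_duplicates("docs", min_similarity=0.8, min_content_length=50,
                             block_types=["note", "table"])
    for group in groups:
        print(f"{group.block_type}: {len(group.blocks)} occurrences, "
              f"similarity {group.similarity:.2f}")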
def format_report(
    duplicate_groups: List[DuplicateGroup],
    show_content: bool = True,
    max_content_preview: int = 200
) -> str:
    """
    Format a report of duplicate content groups.

    Args:
        duplicate_groups: List of DuplicateGroup objects
        show_content: Whether to show content preview
        max_content_preview: Maximum characters for content preview

    Returns:
        Formatted report string
    """
    if not duplicate_groups:
        return "No duplicate content found."

    lines = []
    lines.append(f"Found {len(duplicate_groups)} groups of duplicate/similar content\n")
    lines.append("=" * 70)

    for i, group in enumerate(duplicate_groups, 1):
        similarity_label = "EXACT" if group.similarity == 1.0 else f"{group.similarity:.0%} similar"
        lines.append(f"\n[{i}] {group.block_type.upper()} ({similarity_label}) - {len(group.blocks)} occurrences")
        lines.append("-" * 50)

        if show_content:
            preview = group.canonical_content
            if len(preview) > max_content_preview:
                preview = preview[:max_content_preview] + "..."
            # Indent the preview
            preview_lines = preview.split('\n')
            lines.append("  Content preview:")
            for pl in preview_lines[:5]:  # Limit to 5 lines
                lines.append(f"    {pl}")
            if len(preview_lines) > 5:
                lines.append(f"    ... ({len(preview_lines) - 5} more lines)")
            lines.append("")

        lines.append("  Locations:")
        for block in group.blocks:
            rel_path = os.path.relpath(block.file_path)
            lines.append(f"    - {rel_path}:{block.line_number}")

        lines.append("")

    return '\n'.join(lines)
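The groups returned by find_duplicates can then be rendered with format_report, for example (illustrative sketch, not part of the diff):

    from doc_utils.duplicate_content import find_duplicates, format_report

    groups = find_duplicates("docs")  # placeholder path
    print(format_report(groups, show_content=True, max_content_preview=200))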
def generate_csv_report(duplicate_groups: List[DuplicateGroup]) -> str:
    """
    Generate a CSV report of duplicate content.

    Returns:
        CSV formatted string
    """
    lines = ["Block Type,Similarity,Occurrences,File Path,Line Number,Content Preview"]

    for group in duplicate_groups:
        similarity_str = "exact" if group.similarity == 1.0 else f"{group.similarity:.2f}"
        # Escape content for CSV
        preview = group.canonical_content[:100].replace('"', '""').replace('\n', ' ')

        for block in group.blocks:
            rel_path = os.path.relpath(block.file_path)
            lines.append(f'"{group.block_type}","{similarity_str}",{len(group.blocks)},"{rel_path}",{block.line_number},"{preview}"')

    return '\n'.join(lines)
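And a sketch of writing the CSV variant to disk, e.g. for review in a spreadsheet (output file name is a placeholder, not part of the package):

    from doc_utils.duplicate_content import find_duplicates, generate_csv_report

    groups = find_duplicates("docs")  # placeholder path
    with open("duplicate-content.csv", "w", encoding="utf-8") as out:
        out.write(generate_csv_report(groups))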