rolfedh-doc-utils 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,409 @@
"""
Module for finding duplicate and similar content in AsciiDoc files.

This module detects:
- Recurring admonitions (NOTE, TIP, WARNING, IMPORTANT, CAUTION)
- Tables
- Step sequences (ordered lists)
- Code blocks
- Any other repeated content elements

Functions:
- extract_content_blocks: Extract content blocks from an AsciiDoc file
- find_duplicates: Find duplicate and similar content across files
- calculate_similarity: Calculate text similarity between two strings
"""

import os
import re
import hashlib
from typing import List, Dict, Set, Optional
from dataclasses import dataclass, field
from collections import defaultdict


@dataclass
class ContentBlock:
    """Represents a content block extracted from an AsciiDoc file."""
    block_type: str  # 'note', 'table', 'steps', 'code', 'paragraph'
    content: str
    file_path: str
    line_number: int
    content_hash: str = field(default="", init=False)

    def __post_init__(self):
        # Normalize content for comparison (collapse whitespace, lowercase)
        # so trivially reformatted copies hash identically.
        normalized = ' '.join(self.content.split()).lower()
        self.content_hash = hashlib.md5(normalized.encode()).hexdigest()
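

# Illustrative sketch (not part of the released module): two blocks whose
# text differs only in whitespace and case normalize to the same string,
# so they share a content_hash and can be grouped as exact duplicates.
def _demo_content_hash() -> None:
    a = ContentBlock('note', 'Restart the  server.', 'a.adoc', 3)
    b = ContentBlock('note', 'restart the server.', 'b.adoc', 9)
    assert a.content_hash == b.content_hash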


@dataclass
class DuplicateGroup:
    """Represents a group of duplicate or similar content blocks."""
    block_type: str
    blocks: List[ContentBlock]
    similarity: float  # 1.0 for exact matches, < 1.0 for similar ones
    canonical_content: str  # Representative content for this group


def find_adoc_files(root_dir: str, exclude_dirs: Optional[List[str]] = None) -> List[str]:
    """Recursively find all .adoc files in a directory (ignoring symlinks)."""
    if exclude_dirs is None:
        exclude_dirs = ['.git', '.archive', 'target', 'build', 'node_modules']

    adoc_files = []
    for dirpath, dirnames, filenames in os.walk(root_dir, followlinks=False):
        # Prune excluded and hidden directories in place so os.walk
        # does not descend into them
        dirnames[:] = [d for d in dirnames if d not in exclude_dirs and not d.startswith('.')]

        for fname in filenames:
            if fname.endswith('.adoc'):
                full_path = os.path.join(dirpath, fname)
                if not os.path.islink(full_path):
                    adoc_files.append(full_path)
    return adoc_files
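

# Minimal usage sketch (hypothetical directory names): list every .adoc
# file under a docs tree, skipping hidden and excluded directories.
def _demo_find_adoc_files() -> None:
    for path in find_adoc_files('docs', exclude_dirs=['.git', 'build']):
        print(path)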


def extract_content_blocks(file_path: str) -> List[ContentBlock]:
    """
    Extract content blocks from an AsciiDoc file.

    Identifies:
    - Inline admonitions (NOTE:, TIP:, WARNING:, IMPORTANT:, CAUTION:)
    - Delimited admonition blocks ([NOTE], [TIP], etc.)
    - Tables (|=== blocks)
    - Code blocks (---- or .... blocks)
    - Ordered lists (step sequences)
    """
    blocks = []

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except (UnicodeDecodeError, PermissionError):
        return blocks

    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Inline admonitions (NOTE:, TIP:, etc.)
        admonition_match = re.match(r'^(NOTE|TIP|WARNING|IMPORTANT|CAUTION):\s*(.+)$', stripped)
        if admonition_match:
            block_type = admonition_match.group(1).lower()
            content = admonition_match.group(2)
            blocks.append(ContentBlock(
                block_type=block_type,
                content=content,
                file_path=file_path,
                line_number=i + 1
            ))
            i += 1
            continue

        # Delimited admonition blocks: [NOTE], [TIP], etc.
        delimited_match = re.match(r'^\[(NOTE|TIP|WARNING|IMPORTANT|CAUTION)\]$', stripped)
        if delimited_match:
            block_type = delimited_match.group(1).lower()
            start_line = i + 1  # 1-based line of the [NOTE]-style marker
            content_lines = []
            i += 1
            # If the next line opens a ==== example block, read until the
            # matching closing delimiter; otherwise read a plain paragraph.
            if i < len(lines) and lines[i].strip().startswith('===='):
                delimiter = lines[i].strip()
                i += 1
                while i < len(lines) and lines[i].strip() != delimiter:
                    content_lines.append(lines[i])
                    i += 1
                i += 1  # Skip closing delimiter
            else:
                # No delimiter: read until the first blank line
                while i < len(lines) and lines[i].strip():
                    content_lines.append(lines[i])
                    i += 1

            if content_lines:
                blocks.append(ContentBlock(
                    block_type=block_type,
                    content=''.join(content_lines).strip(),
                    file_path=file_path,
                    line_number=start_line
                ))
            continue

        # Tables (|=== ... |===)
        if stripped.startswith('|==='):
            content_lines = [line]
            start_line = i + 1
            i += 1
            while i < len(lines) and not lines[i].strip().startswith('|==='):
                content_lines.append(lines[i])
                i += 1
            if i < len(lines):
                content_lines.append(lines[i])  # Closing delimiter
                i += 1

            blocks.append(ContentBlock(
                block_type='table',
                content=''.join(content_lines).strip(),
                file_path=file_path,
                line_number=start_line
            ))
            continue

        # Code blocks (---- listing or .... literal)
        if stripped.startswith('----') or stripped.startswith('....'):
            delimiter = stripped[:4]
            content_lines = [line]
            start_line = i + 1
            i += 1
            while i < len(lines) and not lines[i].strip().startswith(delimiter):
                content_lines.append(lines[i])
                i += 1
            if i < len(lines):
                content_lines.append(lines[i])  # Closing delimiter
                i += 1

            blocks.append(ContentBlock(
                block_type='code',
                content=''.join(content_lines).strip(),
                file_path=file_path,
                line_number=start_line
            ))
            continue

        # Ordered lists (step sequences): "1. foo" or ". foo" items
        if re.match(r'^\d+\.\s+', stripped) or stripped.startswith('. '):
            content_lines = [line]
            start_line = i + 1
            i += 1
            while i < len(lines):
                next_line = lines[i].strip()
                # Continue on a numbered item, a continuation marker (+),
                # or indented content belonging to the previous item
                if (re.match(r'^\d+\.\s+', next_line) or
                        next_line.startswith('. ') or
                        next_line == '+' or
                        (next_line and lines[i].startswith(' '))):
                    content_lines.append(lines[i])
                    i += 1
                elif not next_line:
                    # A blank line is part of the list only if more items follow
                    if i + 1 < len(lines) and (re.match(r'^\d+\.\s+', lines[i + 1].strip()) or
                                               lines[i + 1].strip().startswith('. ')):
                        content_lines.append(lines[i])
                        i += 1
                    else:
                        break
                else:
                    break

            # Only record sequences of at least two steps
            if len([l for l in content_lines
                    if re.match(r'^\d+\.\s+', l.strip()) or l.strip().startswith('. ')]) >= 2:
                blocks.append(ContentBlock(
                    block_type='steps',
                    content=''.join(content_lines).strip(),
                    file_path=file_path,
                    line_number=start_line
                ))
            continue

        i += 1

    return blocks
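

# Minimal sketch (not part of the released module): write a small AsciiDoc
# sample to a temporary file and show which blocks the extractor reports.
def _demo_extract_content_blocks() -> None:
    import tempfile
    sample = (
        "NOTE: Back up your data first.\n"
        "\n"
        ". Download the installer.\n"
        ". Run the installer.\n"
    )
    with tempfile.NamedTemporaryFile('w', suffix='.adoc', delete=False) as f:
        f.write(sample)
    found = extract_content_blocks(f.name)
    os.remove(f.name)
    for block in found:
        # Expected: a 'note' block at line 1 and a 'steps' block at line 3
        print(block.block_type, block.line_number)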


def calculate_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two text strings.

    Uses a simple word-based Jaccard similarity for efficiency.
    Returns a value between 0.0 (completely different) and 1.0 (identical).
    """
    # Normalize texts into word sets
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())

    if not words1 and not words2:
        return 1.0
    if not words1 or not words2:
        return 0.0

    intersection = len(words1 & words2)
    union = len(words1 | words2)

    return intersection / union if union > 0 else 0.0
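

# Worked example: "Restart the web server" vs. "Restart the proxy server"
# yields 3 shared words (restart, the, server) out of 5 in the union
# (restart, the, web, proxy, server), so the Jaccard similarity is 3/5 = 0.6.
def _demo_calculate_similarity() -> None:
    score = calculate_similarity("Restart the web server",
                                 "Restart the proxy server")
    assert abs(score - 0.6) < 1e-9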


def find_duplicates(
    root_dir: str,
    min_similarity: float = 0.8,
    min_content_length: int = 50,
    exclude_dirs: Optional[List[str]] = None,
    block_types: Optional[List[str]] = None
) -> List[DuplicateGroup]:
    """
    Find duplicate and similar content blocks across AsciiDoc files.

    Args:
        root_dir: Directory to scan
        min_similarity: Minimum similarity threshold (0.0-1.0)
        min_content_length: Minimum content length to consider
        exclude_dirs: Directories to exclude from scanning
        block_types: Types of blocks to search for (None = all types)

    Returns:
        List of DuplicateGroup objects containing groups of similar content
    """
    adoc_files = find_adoc_files(root_dir, exclude_dirs)

    # Collect all content blocks that pass the length and type filters
    all_blocks: List[ContentBlock] = []
    for file_path in adoc_files:
        blocks = extract_content_blocks(file_path)
        for block in blocks:
            if len(block.content) >= min_content_length:
                if block_types is None or block.block_type in block_types:
                    all_blocks.append(block)

    # Group by normalized hash first (exact duplicates)
    hash_groups: Dict[str, List[ContentBlock]] = defaultdict(list)
    for block in all_blocks:
        hash_groups[block.content_hash].append(block)

    duplicate_groups: List[DuplicateGroup] = []
    processed_hashes: Set[str] = set()

    for content_hash, blocks in hash_groups.items():
        if len(blocks) > 1:
            processed_hashes.add(content_hash)
            duplicate_groups.append(DuplicateGroup(
                block_type=blocks[0].block_type,
                blocks=blocks,
                similarity=1.0,
                canonical_content=blocks[0].content
            ))

    # Find similar (but not exact) duplicates with a greedy pairwise pass
    if min_similarity < 1.0:
        # Blocks that weren't exact duplicates
        remaining_blocks = [b for b in all_blocks if b.content_hash not in processed_hashes]

        used_indices: Set[int] = set()

        for i, block1 in enumerate(remaining_blocks):
            if i in used_indices:
                continue

            similar_blocks = [block1]
            used_indices.add(i)

            for j, block2 in enumerate(remaining_blocks[i + 1:], start=i + 1):
                if j in used_indices:
                    continue

                # Only compare blocks of the same type
                if block1.block_type != block2.block_type:
                    continue

                similarity = calculate_similarity(block1.content, block2.content)
                if similarity >= min_similarity:
                    similar_blocks.append(block2)
                    used_indices.add(j)

            if len(similar_blocks) > 1:
                # Average pairwise similarity within the group
                total_sim = 0.0
                count = 0
                for k, b1 in enumerate(similar_blocks):
                    for b2 in similar_blocks[k + 1:]:
                        total_sim += calculate_similarity(b1.content, b2.content)
                        count += 1
                avg_similarity = total_sim / count if count > 0 else 1.0

                duplicate_groups.append(DuplicateGroup(
                    block_type=similar_blocks[0].block_type,
                    blocks=similar_blocks,
                    similarity=avg_similarity,
                    canonical_content=similar_blocks[0].content
                ))

    # Sort by number of occurrences (most duplicated first)
    duplicate_groups.sort(key=lambda g: len(g.blocks), reverse=True)

    return duplicate_groups
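

# End-to-end usage sketch (hypothetical directory name): scan a docs tree
# for notes and steps that are at least 80% similar, then print the
# human-readable report produced by format_report, defined below.
def _demo_find_duplicates() -> None:
    groups = find_duplicates(
        'docs',
        min_similarity=0.8,
        min_content_length=50,
        block_types=['note', 'steps'],
    )
    print(format_report(groups, show_content=True))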


def format_report(
    duplicate_groups: List[DuplicateGroup],
    show_content: bool = True,
    max_content_preview: int = 200
) -> str:
    """
    Format a report of duplicate content groups.

    Args:
        duplicate_groups: List of DuplicateGroup objects
        show_content: Whether to show a content preview
        max_content_preview: Maximum characters for the content preview

    Returns:
        Formatted report string
    """
    if not duplicate_groups:
        return "No duplicate content found."

    lines = []
    lines.append(f"Found {len(duplicate_groups)} groups of duplicate/similar content\n")
    lines.append("=" * 70)

    for i, group in enumerate(duplicate_groups, 1):
        similarity_label = "EXACT" if group.similarity == 1.0 else f"{group.similarity:.0%} similar"
        lines.append(f"\n[{i}] {group.block_type.upper()} ({similarity_label}) - {len(group.blocks)} occurrences")
        lines.append("-" * 50)

        if show_content:
            preview = group.canonical_content
            if len(preview) > max_content_preview:
                preview = preview[:max_content_preview] + "..."
            # Indent the preview and cap it at five lines
            preview_lines = preview.split('\n')
            lines.append("  Content preview:")
            for pl in preview_lines[:5]:
                lines.append(f"    {pl}")
            if len(preview_lines) > 5:
                lines.append(f"    ... ({len(preview_lines) - 5} more lines)")
            lines.append("")

        lines.append("  Locations:")
        for block in group.blocks:
            rel_path = os.path.relpath(block.file_path)
            lines.append(f"    - {rel_path}:{block.line_number}")

        lines.append("")

    return '\n'.join(lines)


def generate_csv_report(duplicate_groups: List[DuplicateGroup]) -> str:
    """
    Generate a CSV report of duplicate content.

    Returns:
        CSV formatted string
    """
    lines = ["Block Type,Similarity,Occurrences,File Path,Line Number,Content Preview"]

    for group in duplicate_groups:
        similarity_str = "exact" if group.similarity == 1.0 else f"{group.similarity:.2f}"
        # Escape embedded quotes and newlines so the preview stays one CSV field
        preview = group.canonical_content[:100].replace('"', '""').replace('\n', ' ')

        for block in group.blocks:
            rel_path = os.path.relpath(block.file_path)
            lines.append(f'"{group.block_type}","{similarity_str}",{len(group.blocks)},"{rel_path}",{block.line_number},"{preview}"')

    return '\n'.join(lines)
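

# Usage sketch (hypothetical paths, not part of the released module): write
# the CSV report to disk so it can be opened in a spreadsheet alongside the
# plain-text report.
if __name__ == '__main__':
    demo_groups = find_duplicates('docs')
    with open('duplicates.csv', 'w', encoding='utf-8') as csv_file:
        csv_file.write(generate_csv_report(demo_groups))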