diffinite 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffinite-0.1.0.dist-info/METADATA +143 -0
- diffinite-0.1.0.dist-info/RECORD +8 -0
- diffinite-0.1.0.dist-info/WHEEL +5 -0
- diffinite-0.1.0.dist-info/entry_points.txt +2 -0
- diffinite-0.1.0.dist-info/licenses/LICENSE +201 -0
- diffinite-0.1.0.dist-info/licenses/NOTICE +6 -0
- diffinite-0.1.0.dist-info/top_level.txt +1 -0
- diffinite.py +1162 -0
diffinite.py
ADDED
|
@@ -0,0 +1,1162 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Diffinite PoC — Source Code Directory Diff → PDF Report
|
|
4
|
+
|
|
5
|
+
Compares source code files across two directories (A, B) using fuzzy file-name
|
|
6
|
+
matching, produces quantitative analysis (match ratio, additions, deletions),
|
|
7
|
+
and generates a styled PDF report with side-by-side visual diffs.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python main.py dir_a dir_b --output-pdf report.pdf [--by-word] [--no-comments]
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import difflib
|
|
17
|
+
import html
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
import sys
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import List, Optional, Tuple
|
|
25
|
+
|
|
26
|
+
from charset_normalizer import from_bytes
|
|
27
|
+
from rapidfuzz import fuzz
|
|
28
|
+
from xhtml2pdf import pisa
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Root logging is configured once at import time; every pipeline step logs
# through the module-level `logger` below.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Minimum rapidfuzz file-name similarity for two files to be paired.
FUZZY_THRESHOLD = 60  # minimum similarity score for file matching (0-100)

# Comment regex patterns keyed by file extension.
# Left empty at import; populated lazily (and cached) by
# _build_comment_patterns() on first use.
_COMMENT_PATTERNS: dict[str, list[re.Pattern]] = {}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _build_comment_patterns() -> dict[str, list[re.Pattern]]:
    """Build and cache compiled regex patterns for comment removal.

    Patterns are compiled once into the module-level ``_COMMENT_PATTERNS``
    cache; every later call returns that same mapping unchanged.

    Returns:
        Mapping of file extension → list of compiled regex patterns.
    """
    # Fast path: mapping was already populated by an earlier call.
    if _COMMENT_PATTERNS:
        return _COMMENT_PATTERNS

    # Python: # to end-of-line (avoid shebang-like false positives minimally)
    py_line = re.compile(r"#[^\n]*", re.MULTILINE)
    # C-family: // to end-of-line
    c_line = re.compile(r"//[^\n]*", re.MULTILINE)
    # C-family: /* ... */ (non-greedy, DOTALL)
    c_block = re.compile(r"/\*.*?\*/", re.DOTALL)
    # HTML/XML: <!-- ... -->
    html_block = re.compile(r"<!--.*?-->", re.DOTALL)

    # Extension groups and the pattern sequence applied to each.
    groups: list[tuple[tuple[str, ...], list[re.Pattern]]] = [
        ((".py",), [py_line]),
        (
            (".js", ".ts", ".c", ".cpp", ".h", ".hpp",
             ".java", ".cs", ".go", ".rs"),
            [c_line, c_block],
        ),
        ((".html", ".xml", ".htm", ".svg"), [html_block]),
    ]
    for extensions, pattern_list in groups:
        for extension in extensions:
            # Fresh list per extension, matching the original per-key lists.
            _COMMENT_PATTERNS[extension] = list(pattern_list)

    return _COMMENT_PATTERNS
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
# Data classes
|
|
82
|
+
# ---------------------------------------------------------------------------
|
|
83
|
+
@dataclass
class FileMatch:
    """A matched pair of files from dir_a and dir_b."""

    # Path of the matched file, relative to directory A (POSIX style).
    rel_path_a: str
    # Path of the matched file, relative to directory B (POSIX style).
    rel_path_b: str
    similarity: float  # fuzzy file-name similarity score, 0-100
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class DiffResult:
    """Quantitative + visual diff result for one file pair."""

    # The matched file pair this result describes.
    match: FileMatch
    ratio: float  # content similarity, 0.0 – 1.0
    additions: int  # units (lines or words) present only in B
    deletions: int  # units (lines or words) present only in A
    html_diff: str  # side-by-side HTML table
    # Human-readable failure reason when the pair could not be compared.
    error: Optional[str] = None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ---------------------------------------------------------------------------
|
|
105
|
+
# Step 1: File collection & Fuzzy matching
|
|
106
|
+
# ---------------------------------------------------------------------------
|
|
107
|
+
def collect_files(directory: str) -> list[str]:
    """Recursively collect relative file paths under *directory*.

    Args:
        directory: Root directory to scan.

    Returns:
        Sorted list of relative POSIX-style paths.
    """
    base = Path(directory).resolve()
    # Walk everything under the root, keep only regular files, and
    # normalize each to a root-relative POSIX path.
    return sorted(
        entry.relative_to(base).as_posix()
        for entry in base.rglob("*")
        if entry.is_file()
    )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def match_files(
    files_a: list[str],
    files_b: list[str],
    threshold: float = FUZZY_THRESHOLD,
) -> Tuple[list[FileMatch], list[str], list[str]]:
    """Match files from two lists using fuzzy string similarity.

    Greedy best-match strategy: score every cross pair, then repeatedly
    accept the highest-scoring pair whose endpoints are both still free,
    until no pair at or above *threshold* remains.

    Args:
        files_a: Relative paths from directory A.
        files_b: Relative paths from directory B.
        threshold: Minimum similarity score (0–100) to accept a match.

    Returns:
        Tuple of (matched pairs, unmatched_a, unmatched_b).
    """
    # All qualifying (score, idx_a, idx_b) combinations.
    scored_pairs: list[Tuple[float, int, int]] = []
    for idx_a, name_a in enumerate(files_a):
        for idx_b, name_b in enumerate(files_b):
            similarity = fuzz.ratio(name_a, name_b)
            if similarity >= threshold:
                scored_pairs.append((similarity, idx_a, idx_b))

    # Highest score first; the stable sort keeps (idx_a, idx_b) order
    # among equal scores, so results are deterministic.
    scored_pairs.sort(key=lambda entry: entry[0], reverse=True)

    taken_a: set[int] = set()
    taken_b: set[int] = set()
    pairs: list[FileMatch] = []
    for similarity, idx_a, idx_b in scored_pairs:
        if idx_a in taken_a or idx_b in taken_b:
            continue
        pairs.append(FileMatch(files_a[idx_a], files_b[idx_b], similarity))
        taken_a.add(idx_a)
        taken_b.add(idx_b)

    leftovers_a = [name for idx, name in enumerate(files_a) if idx not in taken_a]
    leftovers_b = [name for idx, name in enumerate(files_b) if idx not in taken_b]

    return pairs, leftovers_a, leftovers_b
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# ---------------------------------------------------------------------------
|
|
173
|
+
# Step 2: File reading with encoding auto-detection
|
|
174
|
+
# ---------------------------------------------------------------------------
|
|
175
|
+
def read_file(path: str) -> Optional[str]:
    """Read a file and auto-detect its encoding via charset_normalizer.

    Args:
        path: Absolute or relative file path.

    Returns:
        Decoded text content, or None on failure.
    """
    try:
        payload = Path(path).read_bytes()
    except OSError as exc:
        logger.error("Cannot read %s: %s", path, exc)
        return None

    # Empty files decode trivially; skip detection entirely.
    if not payload:
        return ""

    best_guess = from_bytes(payload).best()
    if best_guess is None:
        logger.warning("Could not detect encoding for %s — skipping", path)
        return None

    try:
        # str() renders the CharsetMatch with its detected encoding.
        return str(best_guess)
    except Exception as exc:  # noqa: BLE001
        logger.error("Decoding failed for %s (%s): %s", path, best_guess.encoding, exc)
        return None
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# ---------------------------------------------------------------------------
|
|
206
|
+
# Step 3: Comment stripping
|
|
207
|
+
# ---------------------------------------------------------------------------
|
|
208
|
+
def strip_comments(text: str, extension: str) -> str:
    """Remove comments from *text* based on the file *extension*.

    Uses conservative regex patterns. Does NOT handle edge cases inside
    string literals (acceptable for PoC).

    Args:
        text: Source code text.
        extension: File extension including the dot, e.g. ".py".
            Matching is case-insensitive, so ".PY" / ".Html" also work.

    Returns:
        Text with comments removed; returned unchanged when the
        extension has no registered pattern.
    """
    # Normalize case so files from case-insensitive filesystems (".PY",
    # ".Html") hit the lowercase keys of the pattern table instead of
    # silently falling through to "no patterns".
    patterns = _build_comment_patterns().get(extension.lower(), [])
    for pattern in patterns:
        text = pattern.sub("", text)
    return text
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# ---------------------------------------------------------------------------
|
|
228
|
+
# Step 4: Diff analysis
|
|
229
|
+
# ---------------------------------------------------------------------------
|
|
230
|
+
def compute_diff(
    text_a: str,
    text_b: str,
    by_word: bool = False,
) -> Tuple[float, int, int]:
    """Compute similarity ratio, additions, and deletions between two texts.

    Args:
        text_a: Text from directory A.
        text_b: Text from directory B.
        by_word: If True, compare by whitespace-split tokens; else by lines.

    Returns:
        (ratio, additions, deletions) where ratio ∈ [0.0, 1.0].
    """
    # Tokenize identically on both sides: words, or lines with their
    # trailing newlines kept so EOL differences count.
    tokens_a = text_a.split() if by_word else text_a.splitlines(keepends=True)
    tokens_b = text_b.split() if by_word else text_b.splitlines(keepends=True)

    matcher = difflib.SequenceMatcher(None, tokens_a, tokens_b, autojunk=False)

    added = 0
    removed = 0
    for op, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        # "replace" contributes to both counters; "insert"/"delete" to one.
        if op in ("insert", "replace"):
            added += b_end - b_start
        if op in ("delete", "replace"):
            removed += a_end - a_start

    return matcher.ratio(), added, removed
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def generate_html_diff(
    text_a: str,
    text_b: str,
    label_a: str = "A",
    label_b: str = "B",
) -> str:
    """Generate a side-by-side HTML diff using table rows.

    Uses <table>/<tr>/<td> structure inspired by diff2html's
    line-by-line renderer for reliable xhtml2pdf rendering.

    Args:
        text_a: Text from file A.
        text_b: Text from file B.
        label_a: Column header for A side.
        label_b: Column header for B side.

    Returns:
        HTML string containing the diff table.
    """
    left = text_a.splitlines()
    right = text_b.splitlines()
    opcodes = difflib.SequenceMatcher(None, left, right, autojunk=False).get_opcodes()

    def _cells(num: str, code: str, css: str) -> str:
        """Render one side of a row: line-number cell + code cell."""
        return (
            f'<td class="ln {css}">{num}</td>'
            f'<td class="code {css}"><pre>{code}</pre></td>'
        )

    def _row(ln_a: str, code_a: str, cls_a: str,
             ln_b: str, code_b: str, cls_b: str) -> str:
        """Build a single <tr> with 4 <td> cells: lnA, codeA, lnB, codeB."""
        return f'<tr>{_cells(ln_a, code_a, cls_a)}{_cells(ln_b, code_b, cls_b)}</tr>'

    rows: list[str] = []
    for op, a1, a2, b1, b2 in opcodes:
        if op == "equal":
            for k in range(a2 - a1):
                escaped = html.escape(left[a1 + k])
                rows.append(_row(str(a1 + k + 1), escaped, "",
                                 str(b1 + k + 1), escaped, ""))
        elif op == "replace":
            # Pad the shorter side with empty placeholder cells.
            for k in range(max(a2 - a1, b2 - b1)):
                if a1 + k < a2:
                    side_a = (str(a1 + k + 1), html.escape(left[a1 + k]), "del")
                else:
                    side_a = ("", "", "empty")
                if b1 + k < b2:
                    side_b = (str(b1 + k + 1), html.escape(right[b1 + k]), "add")
                else:
                    side_b = ("", "", "empty")
                rows.append(_row(*side_a, *side_b))
        elif op == "delete":
            for k in range(a2 - a1):
                rows.append(_row(str(a1 + k + 1), html.escape(left[a1 + k]), "del",
                                 "", "", "empty"))
        elif op == "insert":
            for k in range(b2 - b1):
                rows.append(_row("", "", "empty",
                                 str(b1 + k + 1), html.escape(right[b1 + k]), "add"))

    table_body = "\n".join(rows)
    return (
        '<table class="difftbl">'
        '<thead><tr>'
        '<th class="ln">#</th>'
        f'<th class="code">{html.escape(label_a)}</th>'
        '<th class="ln">#</th>'
        f'<th class="code">{html.escape(label_b)}</th>'
        '</tr></thead>\n'
        f'<tbody>\n{table_body}\n</tbody>'
        '</table>'
    )
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
# ---------------------------------------------------------------------------
|
|
349
|
+
# Step 5: HTML report & PDF conversion
|
|
350
|
+
# ---------------------------------------------------------------------------
|
|
351
|
+
# Base CSS without @page (the @page rule is built dynamically per document)
# NOTE(review): this stylesheet is injected verbatim into every generated
# page by _html_wrap(); the string content is runtime data, so only these
# surrounding comments may change. In-string indentation was reconstructed —
# the original whitespace is not recoverable from the diff rendering.
_CSS_BODY = """\
body {
    font-family: "Segoe UI", "Noto Sans KR", "Malgun Gothic", Arial, sans-serif;
    font-size: 10px;
    color: #1e1e1e;
    background: #fff;
}
h1 {
    font-size: 22px;
    border-bottom: 3px solid #0078d4;
    padding-bottom: 6px;
    margin-bottom: 16px;
    color: #0078d4;
}
h2 {
    font-size: 16px;
    margin-top: 28px;
    color: #333;
}
h3 {
    font-size: 13px;
    margin-top: 20px;
    color: #555;
}
/* Summary table */
table.summary {
    border-collapse: collapse;
    width: 100%;
    margin: 12px 0 20px 0;
    font-size: 10px;
}
table.summary th, table.summary td {
    border: 1px solid #ccc;
    padding: 5px 8px;
    text-align: left;
}
table.summary th {
    background: #0078d4;
    color: #fff;
    font-weight: 600;
}
table.summary tr:nth-child(even) {
    background: #f4f8fb;
}
/* ---- Side-by-side diff table (diff2html-inspired) ---- */
.difftbl {
    border-collapse: collapse;
    table-layout: fixed;
    width: 100%;
    font-family: "Consolas", "Courier New", monospace;
    font-size: 7.5px;
    margin-bottom: 20px;
}
.difftbl th, .difftbl td {
    border: 1px solid #ddd;
    padding: 1px 3px;
    vertical-align: top;
    word-wrap: break-word;
    overflow: hidden;
}
.difftbl thead th {
    background: #444;
    color: #fff;
    font-weight: bold;
    font-size: 8px;
    padding: 3px 4px;
    text-align: left;
}
/* Line-number columns — narrow fixed width */
.ln {
    width: 28px;
    text-align: right;
    color: #999;
    background: #f5f5f5;
    font-size: 7px;
    padding-right: 4px;
}
.difftbl thead th.ln {
    background: #444;
    color: #fff;
    text-align: center;
}
/* Code columns — fill remaining width equally */
.code {
    white-space: pre-wrap;
    word-wrap: break-word;
}
.code pre {
    margin: 0;
    padding: 0;
    font-size: inherit;
    font-family: inherit;
    white-space: pre-wrap;
    word-wrap: break-word;
}
/* Diff row colours — from diff2html CSS variables */
.del { background: #fee8e9; } /* d2h-del-bg-color */
.add { background: #dfd; } /* d2h-ins-bg-color */
.empty { background: #f1f1f1; } /* d2h-empty-placeholder-bg-color */
/* Unmatched file list */
ul.unmatched {
    font-size: 11px;
    color: #a00;
}
.badge {
    display: inline-block;
    padding: 2px 8px;
    border-radius: 4px;
    font-size: 10px;
    font-weight: bold;
    color: #fff;
}
.badge-high { background: #28a745; }
.badge-mid { background: #ffc107; color: #333; }
.badge-low { background: #dc3545; }
.meta {
    font-size: 11px;
    color: #777;
    margin-bottom: 20px;
}
/* ---- Annotation frame styles ---- */
.footer-table {
    width: 100%;
    font-family: "Segoe UI", Arial, sans-serif;
    font-size: 8px;
    color: #888;
    border: none;
}
.footer-table td {
    border: none;
    padding: 0;
    vertical-align: bottom;
}
.header-filename {
    text-align: right;
    font-family: "Segoe UI", Arial, sans-serif;
    font-size: 7px;
    color: #aaa;
}
"""
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def _ratio_badge(ratio: float) -> str:
|
|
495
|
+
"""Return an HTML badge span for a similarity ratio."""
|
|
496
|
+
pct = ratio * 100
|
|
497
|
+
if pct >= 80:
|
|
498
|
+
cls = "badge-high"
|
|
499
|
+
elif pct >= 50:
|
|
500
|
+
cls = "badge-mid"
|
|
501
|
+
else:
|
|
502
|
+
cls = "badge-low"
|
|
503
|
+
return f'<span class="badge {cls}">{pct:.1f}%</span>'
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def _html_wrap(
    title: str,
    body: str,
    annotation_html: str = "",
    *,
    has_footer: bool = False,
    has_header: bool = False,
) -> str:
    """Wrap body content in a full HTML document with CSS.

    Dynamically builds the @page CSS rule with optional @frame blocks
    for footer and header annotations that repeat on every page.

    Args:
        title: Page title.
        body: Main HTML body content.
        annotation_html: Annotation divs with IDs matching frame names.
        has_footer: If True, add a footer @frame to @page.
        has_header: If True, add a header @frame to @page.

    Returns:
        Full HTML document string.
    """
    # Widen the corresponding page margin when an annotation frame exists,
    # so frame content never overlaps the body.
    margin_bottom = "2cm" if has_footer else "1.2cm"
    margin_top = "2cm" if has_header else "1.2cm"

    frame_parts: list[str] = []
    if has_footer:
        frame_parts.append("""
@frame footer_frame {
    -pdf-frame-content: pageFooter;
    left: 1.2cm;
    right: 1.2cm;
    bottom: 0.2cm;
    height: 1cm;
}""")
    if has_header:
        frame_parts.append("""
@frame header_frame {
    -pdf-frame-content: pageHeader;
    left: 1.2cm;
    right: 1.2cm;
    top: 0.2cm;
    height: 1cm;
}""")
    frames = "".join(frame_parts)

    page_css = f"""@page {{
    size: A4 landscape;
    margin: {margin_top} 1.2cm {margin_bottom} 1.2cm;{frames}
}}"""

    return f"""\
<!DOCTYPE html>
<html lang="ko">
<head>
<meta charset="utf-8">
<title>{html.escape(title)}</title>
<style>
{page_css}
{_CSS_BODY}
</style>
</head>
<body>
{annotation_html}
{body}
</body>
</html>
"""
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
def build_cover_html(
    results: list[DiffResult],
    unmatched_a: list[str],
    unmatched_b: list[str],
    dir_a: str,
    dir_b: str,
    by_word: bool,
    compare_comment: bool,
) -> str:
    """Build the cover-page HTML with summary table and matching map.

    Args:
        results: List of DiffResult objects.
        unmatched_a: Files only in dir_a.
        unmatched_b: Files only in dir_b.
        dir_a / dir_b: Directory paths.
        by_word: Whether word-level comparison was used.
        compare_comment: Whether comments were included.

    Returns:
        Full HTML string for the cover page.
    """
    unit = "word" if by_word else "line"
    comment_mode = "included" if compare_comment else "excluded"

    # Summary table rows — one <tr> per matched pair, 1-based numbering.
    summary_rows = ""
    for idx, r in enumerate(results, 1):
        badge = _ratio_badge(r.ratio)
        # Per-pair errors are surfaced inline next to the content badge.
        err = f' <em style="color:red">({html.escape(r.error)})</em>' if r.error else ""
        summary_rows += (
            f"<tr>"
            f"<td>{idx}</td>"
            f"<td>{html.escape(r.match.rel_path_a)}</td>"
            f"<td>{html.escape(r.match.rel_path_b)}</td>"
            f"<td>{r.match.similarity:.1f}</td>"
            f"<td>{badge}{err}</td>"
            f"<td style='color:green'>+{r.additions}</td>"
            f"<td style='color:red'>-{r.deletions}</td>"
            f"</tr>\n"
        )

    # Unmatched lists — rendered only when at least one side has leftovers.
    unmatched_html = ""
    if unmatched_a or unmatched_b:
        unmatched_html += "<h2>Unmatched Files</h2>\n"
        if unmatched_a:
            unmatched_html += f"<h3>Only in A ({html.escape(dir_a)})</h3>\n<ul class='unmatched'>\n"
            for f in unmatched_a:
                unmatched_html += f"  <li>{html.escape(f)}</li>\n"
            unmatched_html += "</ul>\n"
        if unmatched_b:
            unmatched_html += f"<h3>Only in B ({html.escape(dir_b)})</h3>\n<ul class='unmatched'>\n"
            for f in unmatched_b:
                unmatched_html += f"  <li>{html.escape(f)}</li>\n"
            unmatched_html += "</ul>\n"

    body = f"""\
<h1>Diffinite — Source Code Diff Report</h1>
<p class="meta">
  <strong>Dir A:</strong> {html.escape(dir_a)}<br>
  <strong>Dir B:</strong> {html.escape(dir_b)}<br>
  <strong>Comparison unit:</strong> {unit} |
  <strong>Comments:</strong> {comment_mode} |
  <strong>Matched pairs:</strong> {len(results)} |
  <strong>Unmatched:</strong> {len(unmatched_a)} (A) / {len(unmatched_b)} (B)
</p>

<h2>Summary</h2>
<table class="summary">
  <tr>
    <th>#</th><th>File A</th><th>File B</th><th>Name Sim.</th>
    <th>Content Match</th><th>Added</th><th>Deleted</th>
  </tr>
{summary_rows}
</table>

{unmatched_html}
"""
    # Cover page uses no footer/header frames.
    return _html_wrap("Diffinite — Cover", body)
|
|
656
|
+
|
|
657
|
+
|
|
658
|
+
def _build_annotation_html(
|
|
659
|
+
*,
|
|
660
|
+
show_page_number: bool = False,
|
|
661
|
+
show_file_number: bool = False,
|
|
662
|
+
file_index: int = 0,
|
|
663
|
+
total_files: int = 0,
|
|
664
|
+
show_filename: bool = False,
|
|
665
|
+
filename: str = "",
|
|
666
|
+
) -> tuple[str, bool, bool]:
|
|
667
|
+
"""Build annotation divs using xhtml2pdf @frame mechanism.
|
|
668
|
+
|
|
669
|
+
Returns div elements whose IDs match @frame `-pdf-frame-content`
|
|
670
|
+
names, so xhtml2pdf renders them on every page.
|
|
671
|
+
|
|
672
|
+
Args:
|
|
673
|
+
show_page_number: Render 'Page n / N' at footer right.
|
|
674
|
+
show_file_number: Render 'File n / N' at footer left.
|
|
675
|
+
file_index: 1-based index of the current file.
|
|
676
|
+
total_files: Total number of matched file pairs.
|
|
677
|
+
show_filename: Render filename at header right.
|
|
678
|
+
filename: Filename string to display.
|
|
679
|
+
|
|
680
|
+
Returns:
|
|
681
|
+
Tuple of (annotation_html, has_footer, has_header).
|
|
682
|
+
"""
|
|
683
|
+
parts: list[str] = []
|
|
684
|
+
has_footer = show_page_number or show_file_number
|
|
685
|
+
has_header = show_filename and bool(filename)
|
|
686
|
+
|
|
687
|
+
if has_footer:
|
|
688
|
+
left_cell = ""
|
|
689
|
+
right_cell = ""
|
|
690
|
+
if show_file_number and total_files > 0:
|
|
691
|
+
left_cell = f'File {file_index} / {total_files}'
|
|
692
|
+
if show_page_number:
|
|
693
|
+
right_cell = 'Page <pdf:pagenumber> / <pdf:pagecount>'
|
|
694
|
+
parts.append(
|
|
695
|
+
f'<div id="pageFooter">'
|
|
696
|
+
f'<table class="footer-table"><tr>'
|
|
697
|
+
f'<td style="text-align:left;">{left_cell}</td>'
|
|
698
|
+
f'<td style="text-align:center;"></td>' # Bates placeholder (added post-hoc)
|
|
699
|
+
f'<td style="text-align:right;">{right_cell}</td>'
|
|
700
|
+
f'</tr></table>'
|
|
701
|
+
f'</div>'
|
|
702
|
+
)
|
|
703
|
+
|
|
704
|
+
if has_header:
|
|
705
|
+
parts.append(
|
|
706
|
+
f'<div id="pageHeader">'
|
|
707
|
+
f'<p class="header-filename">{html.escape(filename)}</p>'
|
|
708
|
+
f'</div>'
|
|
709
|
+
)
|
|
710
|
+
|
|
711
|
+
return "\n".join(parts), has_footer, has_header
|
|
712
|
+
|
|
713
|
+
|
|
714
|
+
def build_diff_page_html(
    result: DiffResult,
    index: int,
    unit: str,
    *,
    show_page_number: bool = False,
    show_file_number: bool = False,
    total_files: int = 0,
    show_filename: bool = False,
) -> str:
    """Build a single-file diff page HTML.

    Args:
        result: DiffResult for this file pair.
        index: 1-based index of this pair.
        unit: "word" or "line".
        show_page_number: Add page number annotation.
        show_file_number: Add file sequence annotation.
        total_files: Total number of file pairs.
        show_filename: Add filename annotation.

    Returns:
        Full HTML string for one diff page.
    """
    # Shared heading for both the error and the normal page layout.
    heading = (
        f"<h2>{index}. {html.escape(result.match.rel_path_a)} ↔ "
        f"{html.escape(result.match.rel_path_b)}</h2>\n"
    )
    if result.error:
        body = heading + (
            f"<p style='color:red'>Error: {html.escape(result.error)}</p>\n"
        )
    else:
        stats = (
            f"<p>Match ratio: {_ratio_badge(result.ratio)} "
            f"<span style='color:green'>+{result.additions} {unit}(s)</span> "
            f"<span style='color:red'>-{result.deletions} {unit}(s)</span></p>\n"
        )
        body = heading + stats + f"{result.html_diff}\n"

    annotation_html, has_footer, has_header = _build_annotation_html(
        show_page_number=show_page_number,
        show_file_number=show_file_number,
        file_index=index,
        total_files=total_files,
        show_filename=show_filename,
        filename=result.match.rel_path_a,
    )
    return _html_wrap(
        f"Diff — {result.match.rel_path_a}",
        body,
        annotation_html=annotation_html,
        has_footer=has_footer,
        has_header=has_header,
    )
|
|
769
|
+
|
|
770
|
+
|
|
771
|
+
def html_to_pdf(html_content: str, output_path: str) -> bool:
    """Convert an HTML string to a PDF file via xhtml2pdf.

    Args:
        html_content: Full HTML document string.
        output_path: Destination PDF file path.

    Returns:
        True if the PDF was created successfully, False otherwise.
    """
    destination = Path(output_path)
    # Make sure the target directory chain exists before writing.
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(str(destination), "w+b") as stream:
        conversion = pisa.CreatePDF(html_content, dest=stream)
    if conversion.err:
        logger.error("PDF conversion error for %s", output_path)
        return False
    return True
|
|
789
|
+
|
|
790
|
+
|
|
791
|
+
def merge_pdfs(pdf_paths: list[str], output_path: str) -> None:
    """Merge multiple PDF files into a single output PDF.

    Args:
        pdf_paths: Ordered list of PDF file paths to merge.
        output_path: Destination merged PDF file path.
    """
    from pypdf import PdfWriter

    writer = PdfWriter()
    for candidate in pdf_paths:
        source = Path(candidate)
        # Zero-byte files would make pypdf choke, so skip them with a warning.
        if source.exists() and source.stat().st_size > 0:
            writer.append(candidate)
        else:
            logger.warning("Skipping empty or missing PDF: %s", candidate)

    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    with open(str(out), "wb") as fh:
        writer.write(fh)
    writer.close()
    logger.info("Merged PDF saved → %s (%d bytes)", out.resolve(), out.stat().st_size)
|
|
813
|
+
|
|
814
|
+
|
|
815
|
+
def add_bates_numbers(input_path: str, output_path: str) -> None:
    """Stamp Bates numbers on each page of a merged PDF.

    Uses reportlab to create an overlay with Bates numbers at the
    bottom-center of each page, then merges the overlay onto the
    original pages. Numbers are 1-based and zero-padded to at least
    four digits (more when the document has 10,000+ pages).

    Args:
        input_path: Path to the input merged PDF.
        output_path: Path to the stamped output PDF.
    """
    import io

    from pypdf import PdfReader, PdfWriter
    # Fix: the original also imported reportlab.lib.pagesizes.landscape/A4,
    # which were never used — the overlay is sized from each page's mediabox.
    from reportlab.pdfgen import canvas

    reader = PdfReader(input_path)
    writer = PdfWriter()
    total_pages = len(reader.pages)
    digits = max(4, len(str(total_pages)))

    for i, page in enumerate(reader.pages):
        # Size the overlay to this page exactly (pages may differ).
        box = page.mediabox
        pw = float(box.width)
        ph = float(box.height)

        # Draw the Bates label onto an in-memory single-page overlay PDF.
        buf = io.BytesIO()
        c = canvas.Canvas(buf, pagesize=(pw, ph))
        bates = str(i + 1).zfill(digits)
        c.setFont("Helvetica", 9)
        c.setFillColorRGB(0.5, 0.5, 0.5)
        # 18pt above the bottom edge, horizontally centred.
        c.drawCentredString(pw / 2, 18, bates)
        c.save()
        buf.seek(0)

        # Merge overlay onto the original page content.
        overlay_page = PdfReader(buf).pages[0]
        page.merge_page(overlay_page)
        writer.add_page(page)

    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    with open(str(out), "wb") as fh:
        writer.write(fh)
    writer.close()
    logger.info("Bates numbers added → %s", out.resolve())
|
|
864
|
+
|
|
865
|
+
|
|
866
|
+
def _stamp_bates_inplace(pdf_path: str, start_number: int, digits: int) -> None:
    """Stamp Bates numbers on a single PDF file in-place.

    Mirrors :func:`add_bates_numbers` but continues a global sequence
    across multiple files (used in no-merge mode).

    Args:
        pdf_path: Path to the PDF to stamp; the file is overwritten.
        start_number: 0-based starting page number for the Bates sequence;
            the first stamped value is ``start_number + 1``.
        digits: Number of zero-padded digits.
    """
    import io

    # Lazy imports: see add_bates_numbers.
    from pypdf import PdfReader, PdfWriter
    from reportlab.pdfgen import canvas

    reader = PdfReader(pdf_path)
    writer = PdfWriter()

    for i, page in enumerate(reader.pages):
        box = page.mediabox
        pw = float(box.width)
        ph = float(box.height)

        # Build a same-sized overlay carrying only the Bates number.
        buf = io.BytesIO()
        c = canvas.Canvas(buf, pagesize=(pw, ph))
        bates = str(start_number + i + 1).zfill(digits)
        c.setFont("Helvetica", 9)
        c.setFillColorRGB(0.5, 0.5, 0.5)
        c.drawCentredString(pw / 2, 18, bates)
        c.save()
        buf.seek(0)

        overlay_page = PdfReader(buf).pages[0]
        page.merge_page(overlay_page)
        writer.add_page(page)

    # Serialize fully into memory BEFORE truncating pdf_path: the reader
    # may still resolve objects lazily from the original file while the
    # writer serializes, and opening the same path "wb" would destroy it.
    out_buf = io.BytesIO()
    writer.write(out_buf)
    writer.close()
    with open(pdf_path, "wb") as fh:
        fh.write(out_buf.getvalue())
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
# ---------------------------------------------------------------------------
|
|
906
|
+
# Orchestrator
|
|
907
|
+
# ---------------------------------------------------------------------------
|
|
908
|
+
def run_pipeline(
    dir_a: str,
    dir_b: str,
    by_word: bool = False,
    compare_comment: bool = True,
    output_pdf: str = "report.pdf",
    threshold: float = FUZZY_THRESHOLD,
    *,
    no_merge: bool = False,
    show_page_number: bool = False,
    show_file_number: bool = False,
    show_bates_number: bool = False,
    show_filename: bool = False,
) -> None:
    """Execute the full diff-to-PDF pipeline.

    Uses divide-and-conquer: generates a cover PDF and individual per-file
    diff PDFs, then optionally merges them into the final output.

    Pipeline stages:
      1. Collect files from both directories and fuzzy-match them by name.
      2-4. For each matched pair: read, optionally strip comments, diff.
      5. Render a cover PDF plus one PDF per diff, then either merge them
         into ``output_pdf`` or leave them as individual files
         (``no_merge``), stamping Bates numbers if requested.

    Args:
        dir_a: Path to the original source directory.
        dir_b: Path to the comparison source directory.
        by_word: True for word-level comparison; False for line-level.
        compare_comment: True to include comments; False to strip before diff.
        output_pdf: Output PDF file path.
        threshold: Fuzzy matching threshold (0–100).
        no_merge: If True, output individual PDFs instead of merging.
        show_page_number: If True, stamp 'Page n / N' at bottom-right.
        show_file_number: If True, stamp 'File n / N' at bottom-left.
        show_bates_number: If True, stamp Bates numbers at bottom-center (merged only).
        show_filename: If True, stamp filename at top-right.
    """
    import tempfile

    # Step 1 — collect & match
    logger.info("Step 1: Collecting files …")
    files_a = collect_files(dir_a)
    files_b = collect_files(dir_b)
    logger.info(" Dir A: %d files | Dir B: %d files", len(files_a), len(files_b))

    matches, unmatched_a, unmatched_b = match_files(files_a, files_b, threshold=threshold)
    logger.info(" Matched pairs: %d | Unmatched A: %d | Unmatched B: %d",
                len(matches), len(unmatched_a), len(unmatched_b))

    root_a = Path(dir_a).resolve()
    root_b = Path(dir_b).resolve()
    # Label used in per-file report pages ("word" vs "line" granularity).
    unit = "word" if by_word else "line"

    # Steps 2-4 — read, preprocess, diff for each pair
    results: list[DiffResult] = []
    for m in matches:
        abs_a = str(root_a / m.rel_path_a)
        abs_b = str(root_b / m.rel_path_b)
        # Comment stripping is keyed on file A's extension for both sides.
        ext = Path(m.rel_path_a).suffix.lower()

        text_a = read_file(abs_a)
        text_b = read_file(abs_b)

        if text_a is None or text_b is None:
            # Undecodable files still appear in the report, flagged as errors,
            # rather than silently disappearing from the output.
            results.append(DiffResult(
                match=m, ratio=0.0, additions=0, deletions=0,
                html_diff="", error="Could not decode one or both files",
            ))
            continue

        if not compare_comment:
            text_a = strip_comments(text_a, ext)
            text_b = strip_comments(text_b, ext)

        ratio, additions, deletions = compute_diff(text_a, text_b, by_word)

        html_diff = generate_html_diff(
            text_a, text_b,
            label_a=m.rel_path_a,
            label_b=m.rel_path_b,
        )

        results.append(DiffResult(
            match=m,
            ratio=ratio,
            additions=additions,
            deletions=deletions,
            html_diff=html_diff,
        ))

    total_files = len(results)

    # Step 5 — divide-and-conquer PDF generation
    logger.info("Step 5: Generating PDFs (divide-and-conquer) …")

    # Determine output directory for no-merge mode
    if no_merge:
        # Individual PDFs go to "<output stem>_files/" next to output_pdf.
        out_stem = Path(output_pdf).stem
        out_dir = Path(output_pdf).parent / f"{out_stem}_files"
        out_dir.mkdir(parents=True, exist_ok=True)
        logger.info(" No-merge mode — individual PDFs → %s", out_dir.resolve())

    # Intermediate PDFs live in a temp dir that is removed on exit; in
    # no-merge mode parts are written directly to out_dir instead.
    with tempfile.TemporaryDirectory(prefix="diffinite_") as tmpdir:
        pdf_parts: list[str] = []

        # (1) Cover page
        cover_html = build_cover_html(
            results, unmatched_a, unmatched_b,
            dir_a, dir_b, by_word, compare_comment,
        )
        if no_merge:
            cover_dest = str(out_dir / "000_cover.pdf")
        else:
            cover_dest = os.path.join(tmpdir, "00_cover.pdf")
        if html_to_pdf(cover_html, cover_dest):
            pdf_parts.append(cover_dest)
            logger.info(" Cover page → OK")

        # (2) Per-file diff pages
        for idx, r in enumerate(results, 1):
            diff_html = build_diff_page_html(
                r, idx, unit,
                show_page_number=show_page_number,
                show_file_number=show_file_number,
                total_files=total_files,
                show_filename=show_filename,
            )
            # Determine destination path
            safe_name = Path(r.match.rel_path_a).name.replace(" ", "_")
            if no_merge:
                diff_dest = str(out_dir / f"{idx:03d}_{safe_name}.pdf")
            else:
                diff_dest = os.path.join(tmpdir, f"{idx:03d}_diff.pdf")
            if html_to_pdf(diff_html, diff_dest):
                pdf_parts.append(diff_dest)
                logger.info(" Diff page %d (%s) → OK", idx, r.match.rel_path_a)
            else:
                # Failed render: part is skipped but the pipeline continues.
                logger.warning(" Diff page %d FAILED", idx)

        # (3) Merge or skip
        if no_merge:
            # (4) Bates numbers for individual PDFs
            if show_bates_number and pdf_parts:
                logger.info(" Stamping Bates numbers on individual PDFs …")
                global_page = 0
                # Count total pages across all PDFs first
                from pypdf import PdfReader as _PR
                page_counts = []
                for p in pdf_parts:
                    try:
                        page_counts.append(len(_PR(p).pages))
                    except Exception:
                        # Unreadable part: count 0 so numbering stays continuous.
                        page_counts.append(0)
                total_global_pages = sum(page_counts)
                digits = max(4, len(str(total_global_pages)))
                for p, pc in zip(pdf_parts, page_counts):
                    if pc == 0:
                        continue
                    # Continue one global Bates sequence across all files.
                    _stamp_bates_inplace(p, global_page, digits)
                    global_page += pc
            logger.info(" No-merge mode — %d PDFs saved to %s", len(pdf_parts), out_dir.resolve())
        elif pdf_parts:
            merge_pdfs(pdf_parts, output_pdf)
            # (4) Bates numbers (only for merged PDFs)
            if show_bates_number:
                logger.info(" Stamping Bates numbers …")
                # Move the merged PDF aside, then stamp it back to output_pdf.
                bates_tmp = os.path.join(tmpdir, "bates_tmp.pdf")
                os.replace(output_pdf, bates_tmp)
                add_bates_numbers(bates_tmp, output_pdf)
        else:
            logger.error("No PDF parts were generated — cannot create report")

    logger.info("Done ✓")
|
|
1076
|
+
|
|
1077
|
+
|
|
1078
|
+
# ---------------------------------------------------------------------------
|
|
1079
|
+
# CLI entry point
|
|
1080
|
+
# ---------------------------------------------------------------------------
|
|
1081
|
+
def main() -> None:
    """Parse command-line arguments and launch the diff-to-PDF pipeline."""
    parser = argparse.ArgumentParser(
        description="Diffinite PoC — Compare two source directories and generate a PDF diff report.",
    )

    def _flag(name: str, help_text: str) -> None:
        # All boolean switches share the same store_true/default-False shape.
        parser.add_argument(name, action="store_true", default=False, help=help_text)

    parser.add_argument("dir_a", help="Path to the original source directory (A)")
    parser.add_argument("dir_b", help="Path to the comparison source directory (B)")
    parser.add_argument(
        "--output-pdf", "-o",
        default="report.pdf",
        help="Output PDF file path (default: report.pdf)",
    )
    _flag("--by-word", "Compare by word instead of by line")
    _flag("--no-comments", "Strip comments before comparison")
    parser.add_argument(
        "--threshold",
        type=float,
        default=FUZZY_THRESHOLD,
        help=f"Fuzzy matching threshold (0–100, default: {FUZZY_THRESHOLD})",
    )
    _flag("--no-merge", "Generate individual PDFs per file instead of one merged PDF")
    _flag("--page-number", "Show 'Page n / N' at the bottom-right of each page")
    _flag("--file-number", "Show 'File n / N' at the bottom-left of each page")
    _flag("--bates-number", "Stamp Bates numbers at the bottom-center of each page (merged mode only)")
    _flag("--show-filename", "Show the filename at the top-right of each page")

    opts = parser.parse_args()

    run_pipeline(
        dir_a=opts.dir_a,
        dir_b=opts.dir_b,
        by_word=opts.by_word,
        compare_comment=not opts.no_comments,
        output_pdf=opts.output_pdf,
        threshold=opts.threshold,
        no_merge=opts.no_merge,
        show_page_number=opts.page_number,
        show_file_number=opts.file_number,
        show_bates_number=opts.bates_number,
        show_filename=opts.show_filename,
    )
|
|
1158
|
+
|
|
1159
|
+
|
|
1160
|
+
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
|
1162
|
+
|