diffinite 0.9.5__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffinite-0.9.5/src/diffinite.egg-info → diffinite-0.10.0}/PKG-INFO +1 -1
- {diffinite-0.9.5 → diffinite-0.10.0}/pyproject.toml +1 -1
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/cli.py +51 -27
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/models.py +6 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/pdf_gen.py +66 -25
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/pipeline.py +127 -52
- {diffinite-0.9.5 → diffinite-0.10.0/src/diffinite.egg-info}/PKG-INFO +1 -1
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_cli.py +40 -2
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_pdf_gen.py +76 -11
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_pipeline.py +4 -4
- {diffinite-0.9.5 → diffinite-0.10.0}/LICENSE +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/NOTICE +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/README.md +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/setup.cfg +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/__init__.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/__main__.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/collector.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/deep_compare.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/differ.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/evidence.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/fingerprint.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/__init__.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/_registry.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/_spec.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/c_family.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/csharp.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/data.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/go_rust_swift.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/java.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/javascript.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/markup.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/python.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/languages/scripting.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite/parser.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite.egg-info/SOURCES.txt +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite.egg-info/dependency_links.txt +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite.egg-info/entry_points.txt +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite.egg-info/requires.txt +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/src/diffinite.egg-info/top_level.txt +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_collector.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_deep_compare.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_differ.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_differ_extended.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_evidence.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_evidence_hash.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_fingerprint.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_languages.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_normalize.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_parser.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_plagiarism_dataset.py +0 -0
- {diffinite-0.9.5 → diffinite-0.10.0}/tests/test_sqlite_integration.py +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "diffinite"
|
|
7
|
-
version = "0.9.5"
|
|
7
|
+
version = "0.10.0"
|
|
8
8
|
description = "Forensic source-code comparison tool — Winnowing fingerprints and professional PDF reports for IP litigation & code audit"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "Apache-2.0"}
|
|
@@ -58,14 +58,6 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
58
58
|
),
|
|
59
59
|
)
|
|
60
60
|
|
|
61
|
-
# ── Output ────────────────────────────────────────────────────────
|
|
62
|
-
parser.add_argument(
|
|
63
|
-
"--output-pdf", "-o",
|
|
64
|
-
default="report.pdf",
|
|
65
|
-
help="Output PDF file path (default: report.pdf). "
|
|
66
|
-
"Ignored when any --report-* option is specified.",
|
|
67
|
-
)
|
|
68
|
-
|
|
69
61
|
# ── Comparison options ────────────────────────────────────────────
|
|
70
62
|
parser.add_argument(
|
|
71
63
|
"--by-word",
|
|
@@ -74,7 +66,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
74
66
|
help="Compare by word instead of by line",
|
|
75
67
|
)
|
|
76
68
|
parser.add_argument(
|
|
77
|
-
"--
|
|
69
|
+
"--strip-comments",
|
|
78
70
|
action="store_true",
|
|
79
71
|
default=False,
|
|
80
72
|
help="Strip comments before comparison (uses 2-pass parser)",
|
|
@@ -85,7 +77,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
85
77
|
default=False,
|
|
86
78
|
help=(
|
|
87
79
|
"Collapse runs of 3+ blank lines after comment stripping. "
|
|
88
|
-
"Only effective with --
|
|
80
|
+
"Only effective with --strip-comments. WARNING: changes line "
|
|
89
81
|
"numbers — do not use for forensic line-tracing."
|
|
90
82
|
),
|
|
91
83
|
)
|
|
@@ -107,13 +99,14 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
107
99
|
)
|
|
108
100
|
parser.add_argument(
|
|
109
101
|
"--sort-by",
|
|
110
|
-
choices=["filename", "
|
|
102
|
+
choices=["filename", "path", "similarity", "ratio"],
|
|
111
103
|
default=None,
|
|
112
104
|
dest="sort_by",
|
|
113
105
|
help=(
|
|
114
106
|
"Sort matched file pairs in the report. "
|
|
115
|
-
"'filename' sorts by file
|
|
116
|
-
"'
|
|
107
|
+
"'filename' sorts by file basename, 'path' by full path, "
|
|
108
|
+
"'similarity' by name match score, 'ratio' by content "
|
|
109
|
+
"similarity. Default: insertion order (no sort)."
|
|
117
110
|
),
|
|
118
111
|
)
|
|
119
112
|
parser.add_argument(
|
|
@@ -179,7 +172,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
179
172
|
),
|
|
180
173
|
)
|
|
181
174
|
parser.add_argument(
|
|
182
|
-
"--
|
|
175
|
+
"--filename",
|
|
183
176
|
action="store_true",
|
|
184
177
|
default=False,
|
|
185
178
|
help="Show the filename at the top-right of each page",
|
|
@@ -204,15 +197,35 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
204
197
|
"plain delete/add. Works in both simple and deep modes."
|
|
205
198
|
),
|
|
206
199
|
)
|
|
200
|
+
parser.add_argument(
|
|
201
|
+
"--include-uncompared",
|
|
202
|
+
action=argparse.BooleanOptionalAction,
|
|
203
|
+
default=True,
|
|
204
|
+
help=(
|
|
205
|
+
"Include unmatched (uncompared) file lists in the report. "
|
|
206
|
+
"Use --no-include-uncompared to hide them (default: included)."
|
|
207
|
+
),
|
|
208
|
+
)
|
|
209
|
+
parser.add_argument(
|
|
210
|
+
"--binary-handling",
|
|
211
|
+
choices=["exclude", "hash", "error"],
|
|
212
|
+
default="hash",
|
|
213
|
+
dest="binary_handling",
|
|
214
|
+
help=(
|
|
215
|
+
"How to handle binary (non-decodable) files: "
|
|
216
|
+
"'exclude' skips them entirely, 'hash' shows SHA-256 match "
|
|
217
|
+
"status, 'error' shows decode error (default: hash)."
|
|
218
|
+
),
|
|
219
|
+
)
|
|
207
220
|
|
|
208
221
|
# ── Report format options ─────────────────────────────────────────
|
|
209
222
|
format_group = parser.add_argument_group(
|
|
210
223
|
"Report Format",
|
|
211
224
|
"Output format(s). Multiple can be combined. "
|
|
212
|
-
"If none specified, defaults to
|
|
225
|
+
"If none specified, defaults to PDF (report.pdf).",
|
|
213
226
|
)
|
|
214
227
|
format_group.add_argument(
|
|
215
|
-
"--report-pdf",
|
|
228
|
+
"--report-pdf", "-o",
|
|
216
229
|
metavar="PATH",
|
|
217
230
|
default=None,
|
|
218
231
|
help="Generate a merged PDF report at the given path",
|
|
@@ -243,7 +256,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
243
256
|
"'--mode deep').",
|
|
244
257
|
)
|
|
245
258
|
deep_group.add_argument(
|
|
246
|
-
"--k-gram",
|
|
259
|
+
"--k-gram",
|
|
247
260
|
type=int,
|
|
248
261
|
default=DEFAULT_K,
|
|
249
262
|
dest="k_gram",
|
|
@@ -253,7 +266,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
253
266
|
),
|
|
254
267
|
)
|
|
255
268
|
deep_group.add_argument(
|
|
256
|
-
"--window",
|
|
269
|
+
"--window",
|
|
257
270
|
type=int,
|
|
258
271
|
default=DEFAULT_W,
|
|
259
272
|
dest="window",
|
|
@@ -263,12 +276,12 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
263
276
|
),
|
|
264
277
|
)
|
|
265
278
|
deep_group.add_argument(
|
|
266
|
-
"--threshold-deep",
|
|
279
|
+
"--threshold-deep",
|
|
267
280
|
type=float,
|
|
268
|
-
default=
|
|
281
|
+
default=5,
|
|
269
282
|
dest="threshold_deep",
|
|
270
283
|
help=(
|
|
271
|
-
"Minimum Jaccard similarity to report (default:
|
|
284
|
+
"Minimum Jaccard similarity 0–100 to report (default: 5). "
|
|
272
285
|
"Below 5%% is considered noise."
|
|
273
286
|
),
|
|
274
287
|
)
|
|
@@ -336,38 +349,45 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
336
349
|
|
|
337
350
|
args = parser.parse_args(argv)
|
|
338
351
|
|
|
352
|
+
# Convert threshold-deep from 0-100 (user-facing) to 0-1 (internal)
|
|
353
|
+
min_jaccard_internal = args.threshold_deep / 100.0
|
|
354
|
+
|
|
339
355
|
# Build analysis metadata (embedded in every report for transparency)
|
|
340
356
|
metadata = AnalysisMetadata(
|
|
341
357
|
exec_mode=args.mode,
|
|
342
358
|
k=args.k_gram,
|
|
343
359
|
w=args.window,
|
|
344
|
-
threshold=args.threshold_deep,
|
|
360
|
+
threshold=args.threshold_deep, # 0-100 scale in metadata
|
|
345
361
|
autojunk=not args.no_autojunk,
|
|
346
362
|
)
|
|
347
363
|
|
|
348
364
|
# Resolve encoding
|
|
349
365
|
encoding = args.encoding if args.encoding.lower() != "auto" else None
|
|
350
366
|
|
|
367
|
+
# Resolve default PDF output if no --report-* specified
|
|
368
|
+
report_pdf = args.report_pdf
|
|
369
|
+
if report_pdf is None and args.report_html is None and args.report_md is None and args.report_json is None:
|
|
370
|
+
report_pdf = "report.pdf"
|
|
371
|
+
|
|
351
372
|
run_pipeline(
|
|
352
373
|
dir_a=args.dir_a,
|
|
353
374
|
dir_b=args.dir_b,
|
|
354
375
|
by_word=args.by_word,
|
|
355
|
-
|
|
376
|
+
strip_comments=args.strip_comments,
|
|
356
377
|
squash_blanks=args.squash_blanks,
|
|
357
|
-
output_pdf=args.output_pdf,
|
|
358
378
|
threshold=args.threshold,
|
|
359
379
|
no_merge=args.no_merge,
|
|
360
380
|
show_page_number=args.page_number,
|
|
361
381
|
show_file_number=args.file_number,
|
|
362
382
|
show_bates_number=args.bates_number,
|
|
363
|
-
show_filename=args.
|
|
383
|
+
show_filename=args.filename,
|
|
364
384
|
collapse_identical=args.collapse_identical,
|
|
365
385
|
# Execution mode & deep compare
|
|
366
386
|
exec_mode=args.mode,
|
|
367
387
|
workers=args.workers,
|
|
368
388
|
kgram_size=args.k_gram,
|
|
369
389
|
window_size=args.window,
|
|
370
|
-
min_jaccard=
|
|
390
|
+
min_jaccard=min_jaccard_internal,
|
|
371
391
|
normalize=args.normalize,
|
|
372
392
|
metadata=metadata,
|
|
373
393
|
# Forensic options
|
|
@@ -377,7 +397,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
377
397
|
embed_hash=args.embed_hash,
|
|
378
398
|
bundle_path=args.bundle_path,
|
|
379
399
|
# Multi-format output
|
|
380
|
-
report_pdf=
|
|
400
|
+
report_pdf=report_pdf,
|
|
381
401
|
report_html=args.report_html,
|
|
382
402
|
report_md=args.report_md,
|
|
383
403
|
report_json=args.report_json,
|
|
@@ -388,10 +408,14 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
388
408
|
sort_order=args.sort_order,
|
|
389
409
|
# Moved block detection
|
|
390
410
|
detect_moved=args.detect_moved,
|
|
411
|
+
# Uncompared files
|
|
412
|
+
include_uncompared=args.include_uncompared,
|
|
391
413
|
# Bates prefix/suffix
|
|
392
414
|
bates_prefix=args.bates_prefix,
|
|
393
415
|
bates_suffix=args.bates_suffix,
|
|
394
416
|
bates_start=args.bates_start,
|
|
417
|
+
# Binary handling
|
|
418
|
+
binary_handling=args.binary_handling,
|
|
395
419
|
)
|
|
396
420
|
|
|
397
421
|
|
|
@@ -98,6 +98,12 @@ class DiffResult:
|
|
|
98
98
|
error: Optional[str] = None
|
|
99
99
|
"""None이 아니면 디코딩/읽기 실패 등의 에러 메시지. 이 경우 위 필드는 0/빈값."""
|
|
100
100
|
|
|
101
|
+
binary: bool = False
|
|
102
|
+
"""True if file pair was detected as binary (non-decodable)."""
|
|
103
|
+
|
|
104
|
+
hash_match: Optional[bool] = None
|
|
105
|
+
"""SHA-256 match status for binary files. None for text files."""
|
|
106
|
+
|
|
101
107
|
|
|
102
108
|
# ──────────────────────────────────────────────────────────────────────
|
|
103
109
|
# Winnowing 핑거프린트 엔트리
|
|
@@ -82,6 +82,7 @@ table.summary th, table.summary td {
|
|
|
82
82
|
border: 1px solid #ccc;
|
|
83
83
|
padding: 5px 8px;
|
|
84
84
|
text-align: left;
|
|
85
|
+
word-break: break-all;
|
|
85
86
|
}
|
|
86
87
|
table.summary th {
|
|
87
88
|
background: #0078d4;
|
|
@@ -178,6 +179,7 @@ table.deep th, table.deep td {
|
|
|
178
179
|
border: 1px solid #ccc;
|
|
179
180
|
padding: 4px 6px;
|
|
180
181
|
text-align: left;
|
|
182
|
+
word-break: break-all;
|
|
181
183
|
}
|
|
182
184
|
table.deep th {
|
|
183
185
|
background: #6c5ce7;
|
|
@@ -212,6 +214,27 @@ table.deep tr:nth-child(even) {
|
|
|
212
214
|
# ---------------------------------------------------------------------------
|
|
213
215
|
# Helpers
|
|
214
216
|
# ---------------------------------------------------------------------------
|
|
217
|
+
def _break_path(path_str: str) -> str:
|
|
218
|
+
"""Insert zero-width spaces after path separator symbols for line-breaking.
|
|
219
|
+
|
|
220
|
+
xhtml2pdf는 긴 파일 경로를 자동 줄바꿈하지 못하므로,
|
|
221
|
+
경로 구분자(/, \\, ., _) 뒤에 zero-width space를 삽입하여
|
|
222
|
+
자연스러운 줄바꿈 지점을 제공한다.
|
|
223
|
+
|
|
224
|
+
Args:
|
|
225
|
+
path_str: HTML-escaped 경로 문자열.
|
|
226
|
+
|
|
227
|
+
Returns:
|
|
228
|
+
줄바꿈 힌트가 삽입된 경로 문자열.
|
|
229
|
+
"""
|
|
230
|
+
# HTML entity for zero-width space
|
|
231
|
+
zwsp = "​"
|
|
232
|
+
result = path_str
|
|
233
|
+
for sep in ("/", "\\", ".", "_"):
|
|
234
|
+
result = result.replace(sep, sep + zwsp)
|
|
235
|
+
return result
|
|
236
|
+
|
|
237
|
+
|
|
215
238
|
def _ratio_badge(ratio: float) -> str:
|
|
216
239
|
"""Return an HTML badge span for a similarity ratio."""
|
|
217
240
|
pct = ratio * 100
|
|
@@ -251,7 +274,7 @@ def build_hash_table_html(
|
|
|
251
274
|
parts.append(
|
|
252
275
|
f'<tr>'
|
|
253
276
|
f'<td>{idx}</td>'
|
|
254
|
-
f'<td>{html.escape(h.rel_path)}</td>'
|
|
277
|
+
f'<td>{_break_path(html.escape(h.rel_path))}</td>'
|
|
255
278
|
f'<td style="font-family:monospace;font-size:8px">{short_hash}</td>'
|
|
256
279
|
f'<td>{h.size_bytes:,}</td>'
|
|
257
280
|
f'</tr>\n'
|
|
@@ -368,17 +391,18 @@ def build_cover_body(
|
|
|
368
391
|
dir_a: str,
|
|
369
392
|
dir_b: str,
|
|
370
393
|
by_word: bool,
|
|
371
|
-
|
|
394
|
+
strip_comments: bool,
|
|
372
395
|
*,
|
|
373
396
|
deep_results: Optional[list[DeepMatchResult]] = None,
|
|
374
397
|
metadata: Optional["AnalysisMetadata"] = None,
|
|
375
398
|
hash_table_html: Optional[str] = None,
|
|
399
|
+
include_uncompared: bool = True,
|
|
376
400
|
) -> str:
|
|
377
401
|
"""Build the cover-page body fragment (no DOCTYPE/html/head wrapper)."""
|
|
378
402
|
from diffinite.models import AnalysisMetadata as _AM # avoid circular at module level
|
|
379
403
|
|
|
380
404
|
unit = "word" if by_word else "line"
|
|
381
|
-
comment_mode = "
|
|
405
|
+
comment_mode = "stripped" if strip_comments else "included"
|
|
382
406
|
|
|
383
407
|
# Analysis metadata banner (transparency)
|
|
384
408
|
meta_html = ""
|
|
@@ -396,26 +420,43 @@ def build_cover_body(
|
|
|
396
420
|
|
|
397
421
|
summary_rows = ""
|
|
398
422
|
for idx, r in enumerate(results, 1):
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
423
|
+
if r.binary:
|
|
424
|
+
if r.hash_match:
|
|
425
|
+
status = '<span class="badge badge-high">✓ Binary Match</span>'
|
|
426
|
+
else:
|
|
427
|
+
status = '<span class="badge badge-low">✗ Binary Mismatch</span>'
|
|
428
|
+
summary_rows += (
|
|
429
|
+
f"<tr>"
|
|
430
|
+
f"<td>{idx}</td>"
|
|
431
|
+
f"<td>{_break_path(html.escape(r.match.rel_path_a))}</td>"
|
|
432
|
+
f"<td>{_break_path(html.escape(r.match.rel_path_b))}</td>"
|
|
433
|
+
f"<td>{r.match.similarity:.1f}</td>"
|
|
434
|
+
f"<td>{status}</td>"
|
|
435
|
+
f"<td>—</td>"
|
|
436
|
+
f"<td>—</td>"
|
|
437
|
+
f"</tr>\n"
|
|
438
|
+
)
|
|
439
|
+
else:
|
|
440
|
+
badge = _ratio_badge(r.ratio)
|
|
441
|
+
err = (
|
|
442
|
+
f' <em style="color:red">({html.escape(r.error)})</em>'
|
|
443
|
+
if r.error else ""
|
|
444
|
+
)
|
|
445
|
+
summary_rows += (
|
|
446
|
+
f"<tr>"
|
|
447
|
+
f"<td>{idx}</td>"
|
|
448
|
+
f"<td>{_break_path(html.escape(r.match.rel_path_a))}</td>"
|
|
449
|
+
f"<td>{_break_path(html.escape(r.match.rel_path_b))}</td>"
|
|
450
|
+
f"<td>{r.match.similarity:.1f}</td>"
|
|
451
|
+
f"<td>{badge}{err}</td>"
|
|
452
|
+
f"<td style='color:green'>+{r.additions}</td>"
|
|
453
|
+
f"<td style='color:red'>-{r.deletions}</td>"
|
|
454
|
+
f"</tr>\n"
|
|
455
|
+
)
|
|
415
456
|
|
|
416
|
-
# Unmatched lists
|
|
457
|
+
# Unmatched lists (only when include_uncompared is True)
|
|
417
458
|
unmatched_html = ""
|
|
418
|
-
if unmatched_a or unmatched_b:
|
|
459
|
+
if include_uncompared and (unmatched_a or unmatched_b):
|
|
419
460
|
unmatched_html += "<h2>Unmatched Files</h2>\n"
|
|
420
461
|
if unmatched_a:
|
|
421
462
|
unmatched_html += (
|
|
@@ -423,7 +464,7 @@ def build_cover_body(
|
|
|
423
464
|
"<ul class='unmatched'>\n"
|
|
424
465
|
)
|
|
425
466
|
for f in unmatched_a:
|
|
426
|
-
unmatched_html += f" <li>{html.escape(f)}</li>\n"
|
|
467
|
+
unmatched_html += f" <li>{_break_path(html.escape(f))}</li>\n"
|
|
427
468
|
unmatched_html += "</ul>\n"
|
|
428
469
|
if unmatched_b:
|
|
429
470
|
unmatched_html += (
|
|
@@ -431,7 +472,7 @@ def build_cover_body(
|
|
|
431
472
|
"<ul class='unmatched'>\n"
|
|
432
473
|
)
|
|
433
474
|
for f in unmatched_b:
|
|
434
|
-
unmatched_html += f" <li>{html.escape(f)}</li>\n"
|
|
475
|
+
unmatched_html += f" <li>{_break_path(html.escape(f))}</li>\n"
|
|
435
476
|
unmatched_html += "</ul>\n"
|
|
436
477
|
|
|
437
478
|
deep_html = ""
|
|
@@ -447,8 +488,8 @@ def build_cover_body(
|
|
|
447
488
|
jbadge = _ratio_badge(jaccard)
|
|
448
489
|
deep_html += (
|
|
449
490
|
f"<tr>"
|
|
450
|
-
f"<td>{html.escape(dr.file_a)}</td>"
|
|
451
|
-
f"<td>{html.escape(b_file)}</td>"
|
|
491
|
+
f"<td>{_break_path(html.escape(dr.file_a))}</td>"
|
|
492
|
+
f"<td>{_break_path(html.escape(b_file))}</td>"
|
|
452
493
|
f"<td>{shared}</td>"
|
|
453
494
|
f"<td>{jbadge}</td>"
|
|
454
495
|
f"</tr>\n"
|
|
@@ -36,19 +36,20 @@ import json
|
|
|
36
36
|
import logging
|
|
37
37
|
import os
|
|
38
38
|
import tempfile
|
|
39
|
-
from pathlib import Path
|
|
39
|
+
from pathlib import Path, PurePosixPath
|
|
40
40
|
|
|
41
41
|
from diffinite.collector import collect_files, match_files, FUZZY_THRESHOLD
|
|
42
42
|
from diffinite.deep_compare import run_deep_compare
|
|
43
43
|
from diffinite.differ import compute_diff, generate_html_diff, read_file
|
|
44
44
|
from diffinite.evidence import (
|
|
45
|
+
_sha256_file,
|
|
45
46
|
compute_file_hashes,
|
|
46
47
|
create_evidence_bundle,
|
|
47
48
|
write_manifest,
|
|
48
49
|
)
|
|
49
50
|
from diffinite.fingerprint import DEFAULT_K, DEFAULT_W
|
|
50
51
|
from diffinite.models import AnalysisMetadata, DiffResult, DeepMatchResult
|
|
51
|
-
from diffinite.parser import strip_comments
|
|
52
|
+
from diffinite.parser import strip_comments as _strip_comments_fn
|
|
52
53
|
from diffinite.pdf_gen import (
|
|
53
54
|
_html_wrap,
|
|
54
55
|
add_bates_numbers,
|
|
@@ -113,15 +114,16 @@ def _generate_markdown_report(
|
|
|
113
114
|
dir_a: str,
|
|
114
115
|
dir_b: str,
|
|
115
116
|
by_word: bool,
|
|
116
|
-
|
|
117
|
+
strip_comments: bool,
|
|
117
118
|
deep_results: list[DeepMatchResult] | None,
|
|
118
119
|
output_path: str,
|
|
119
120
|
*,
|
|
120
121
|
metadata: AnalysisMetadata | None = None,
|
|
122
|
+
include_uncompared: bool = True,
|
|
121
123
|
) -> None:
|
|
122
124
|
"""Generate a Markdown summary report."""
|
|
123
125
|
unit = "word" if by_word else "line"
|
|
124
|
-
comment_mode = "
|
|
126
|
+
comment_mode = "stripped" if strip_comments else "included"
|
|
125
127
|
|
|
126
128
|
lines: list[str] = []
|
|
127
129
|
lines.append("# Diffinite — Source Code Diff Report\n")
|
|
@@ -141,16 +143,24 @@ def _generate_markdown_report(
|
|
|
141
143
|
lines.append("| # | File A | File B | Name Sim. | Match | +Added | −Deleted |")
|
|
142
144
|
lines.append("|---|--------|--------|:---------:|:-----:|:------:|:--------:|")
|
|
143
145
|
for idx, r in enumerate(results, 1):
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
146
|
+
if r.binary:
|
|
147
|
+
status = "✓ Match" if r.hash_match else "✗ Mismatch"
|
|
148
|
+
lines.append(
|
|
149
|
+
f"| {idx} | `{r.match.rel_path_a}` | `{r.match.rel_path_b}` "
|
|
150
|
+
f"| {r.match.similarity:.1f} | [Binary: {status}] "
|
|
151
|
+
f"| — | — |"
|
|
152
|
+
)
|
|
153
|
+
else:
|
|
154
|
+
pct = r.ratio * 100
|
|
155
|
+
err = f" ⚠ {r.error}" if r.error else ""
|
|
156
|
+
lines.append(
|
|
157
|
+
f"| {idx} | `{r.match.rel_path_a}` | `{r.match.rel_path_b}` "
|
|
158
|
+
f"| {r.match.similarity:.1f} | {pct:.1f}%{err} "
|
|
159
|
+
f"| +{r.additions} | −{r.deletions} |"
|
|
160
|
+
)
|
|
151
161
|
|
|
152
162
|
# Unmatched
|
|
153
|
-
if unmatched_a or unmatched_b:
|
|
163
|
+
if include_uncompared and (unmatched_a or unmatched_b):
|
|
154
164
|
lines.append("\n## Unmatched Files\n")
|
|
155
165
|
if unmatched_a:
|
|
156
166
|
lines.append(f"### Only in A (`{dir_a}`)\n")
|
|
@@ -188,11 +198,12 @@ def _generate_json_report(
|
|
|
188
198
|
dir_a: str,
|
|
189
199
|
dir_b: str,
|
|
190
200
|
by_word: bool,
|
|
191
|
-
|
|
201
|
+
strip_comments: bool,
|
|
192
202
|
deep_results: list[DeepMatchResult] | None,
|
|
193
203
|
output_path: str,
|
|
194
204
|
*,
|
|
195
205
|
metadata: AnalysisMetadata | None = None,
|
|
206
|
+
include_uncompared: bool = True,
|
|
196
207
|
) -> None:
|
|
197
208
|
"""Generate a JSON report for programmatic consumption.
|
|
198
209
|
|
|
@@ -201,7 +212,7 @@ def _generate_json_report(
|
|
|
201
212
|
re-running the pipeline.
|
|
202
213
|
"""
|
|
203
214
|
unit = "word" if by_word else "line"
|
|
204
|
-
comment_mode = "
|
|
215
|
+
comment_mode = "stripped" if strip_comments else "included"
|
|
205
216
|
|
|
206
217
|
meta_dict = None
|
|
207
218
|
if metadata is not None:
|
|
@@ -215,7 +226,7 @@ def _generate_json_report(
|
|
|
215
226
|
|
|
216
227
|
result_list = []
|
|
217
228
|
for r in results:
|
|
218
|
-
|
|
229
|
+
entry = {
|
|
219
230
|
"file_a": r.match.rel_path_a,
|
|
220
231
|
"file_b": r.match.rel_path_b,
|
|
221
232
|
"name_similarity": r.match.similarity,
|
|
@@ -224,7 +235,11 @@ def _generate_json_report(
|
|
|
224
235
|
"deletions": r.deletions,
|
|
225
236
|
"html_diff": r.html_diff,
|
|
226
237
|
"error": r.error,
|
|
227
|
-
|
|
238
|
+
"binary": r.binary,
|
|
239
|
+
}
|
|
240
|
+
if r.binary:
|
|
241
|
+
entry["hash_match"] = r.hash_match
|
|
242
|
+
result_list.append(entry)
|
|
228
243
|
|
|
229
244
|
deep_list = None
|
|
230
245
|
if deep_results is not None:
|
|
@@ -251,13 +266,13 @@ def _generate_json_report(
|
|
|
251
266
|
"comment_mode": comment_mode,
|
|
252
267
|
"summary": {
|
|
253
268
|
"matched_pairs": len(results),
|
|
254
|
-
"
|
|
255
|
-
"
|
|
269
|
+
"unmatched_a_count": len(unmatched_a),
|
|
270
|
+
"unmatched_b_count": len(unmatched_b),
|
|
256
271
|
},
|
|
257
272
|
"results": result_list,
|
|
258
273
|
"deep_results": deep_list,
|
|
259
|
-
"unmatched_a": unmatched_a,
|
|
260
|
-
"unmatched_b": unmatched_b,
|
|
274
|
+
"unmatched_a": unmatched_a if include_uncompared else [],
|
|
275
|
+
"unmatched_b": unmatched_b if include_uncompared else [],
|
|
261
276
|
}
|
|
262
277
|
|
|
263
278
|
out = Path(output_path)
|
|
@@ -276,21 +291,23 @@ def _generate_html_report(
|
|
|
276
291
|
dir_a: str,
|
|
277
292
|
dir_b: str,
|
|
278
293
|
by_word: bool,
|
|
279
|
-
|
|
294
|
+
strip_comments: bool,
|
|
280
295
|
deep_results: list[DeepMatchResult] | None,
|
|
281
296
|
output_path: str,
|
|
282
297
|
ln_col_width: int = 28,
|
|
283
298
|
*,
|
|
284
299
|
metadata: AnalysisMetadata | None = None,
|
|
285
300
|
hash_table_html: str | None = None,
|
|
301
|
+
include_uncompared: bool = True,
|
|
286
302
|
) -> None:
|
|
287
303
|
"""Generate a standalone HTML report with all diffs inline."""
|
|
288
304
|
cover_html_body = build_cover_body(
|
|
289
305
|
results, unmatched_a, unmatched_b,
|
|
290
|
-
dir_a, dir_b, by_word,
|
|
306
|
+
dir_a, dir_b, by_word, strip_comments,
|
|
291
307
|
deep_results=deep_results,
|
|
292
308
|
metadata=metadata,
|
|
293
309
|
hash_table_html=hash_table_html,
|
|
310
|
+
include_uncompared=include_uncompared,
|
|
294
311
|
)
|
|
295
312
|
|
|
296
313
|
# Append all inline diffs
|
|
@@ -346,9 +363,8 @@ def run_pipeline(
|
|
|
346
363
|
dir_a: str,
|
|
347
364
|
dir_b: str,
|
|
348
365
|
by_word: bool = False,
|
|
349
|
-
|
|
366
|
+
strip_comments: bool = False,
|
|
350
367
|
squash_blanks: bool = False,
|
|
351
|
-
output_pdf: str = "report.pdf",
|
|
352
368
|
threshold: float = FUZZY_THRESHOLD,
|
|
353
369
|
*,
|
|
354
370
|
no_merge: bool = False,
|
|
@@ -384,10 +400,14 @@ def run_pipeline(
|
|
|
384
400
|
sort_order: str = "asc",
|
|
385
401
|
# Moved block detection
|
|
386
402
|
detect_moved: bool = False,
|
|
403
|
+
# Uncompared files
|
|
404
|
+
include_uncompared: bool = True,
|
|
387
405
|
# Bates prefix/suffix
|
|
388
406
|
bates_prefix: str = "",
|
|
389
407
|
bates_suffix: str = "",
|
|
390
408
|
bates_start: int = 1,
|
|
409
|
+
# Binary handling
|
|
410
|
+
binary_handling: str = "hash",
|
|
391
411
|
) -> None:
|
|
392
412
|
"""Execute the full diff-to-report pipeline.
|
|
393
413
|
|
|
@@ -410,7 +430,7 @@ def run_pipeline(
|
|
|
410
430
|
"""
|
|
411
431
|
# Determine effective output paths
|
|
412
432
|
if report_pdf is None and report_html is None and report_md is None and report_json is None:
|
|
413
|
-
report_pdf =
|
|
433
|
+
report_pdf = "report.pdf"
|
|
414
434
|
|
|
415
435
|
# Build default metadata if caller didn't provide one
|
|
416
436
|
if metadata is None:
|
|
@@ -463,15 +483,30 @@ def run_pipeline(
|
|
|
463
483
|
text_b = read_file(abs_b, encoding=encoding)
|
|
464
484
|
|
|
465
485
|
if text_a is None or text_b is None:
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
486
|
+
if binary_handling == "exclude":
|
|
487
|
+
continue
|
|
488
|
+
elif binary_handling == "hash":
|
|
489
|
+
hash_a = _sha256_file(str(root_a / m.rel_path_a))
|
|
490
|
+
hash_b = _sha256_file(str(root_b / m.rel_path_b))
|
|
491
|
+
hash_match = hash_a == hash_b
|
|
492
|
+
results.append(DiffResult(
|
|
493
|
+
match=m,
|
|
494
|
+
ratio=1.0 if hash_match else 0.0,
|
|
495
|
+
additions=0, deletions=0,
|
|
496
|
+
html_diff="",
|
|
497
|
+
binary=True,
|
|
498
|
+
hash_match=hash_match,
|
|
499
|
+
))
|
|
500
|
+
else: # "error"
|
|
501
|
+
results.append(DiffResult(
|
|
502
|
+
match=m, ratio=0.0, additions=0, deletions=0,
|
|
503
|
+
html_diff="", error="Could not decode one or both files",
|
|
504
|
+
))
|
|
470
505
|
continue
|
|
471
506
|
|
|
472
|
-
if
|
|
473
|
-
text_a =
|
|
474
|
-
text_b =
|
|
507
|
+
if strip_comments:
|
|
508
|
+
text_a = _strip_comments_fn(text_a, ext, squash_blanks=squash_blanks)
|
|
509
|
+
text_b = _strip_comments_fn(text_b, ext, squash_blanks=squash_blanks)
|
|
475
510
|
|
|
476
511
|
all_line_counts.append(text_a.count("\n") + 1)
|
|
477
512
|
all_line_counts.append(text_b.count("\n") + 1)
|
|
@@ -493,11 +528,11 @@ def run_pipeline(
|
|
|
493
528
|
ln_col_width, max(all_line_counts) if all_line_counts else 0)
|
|
494
529
|
|
|
495
530
|
# Generate HTML diffs with unified column width
|
|
496
|
-
for
|
|
497
|
-
r
|
|
498
|
-
if r.error:
|
|
531
|
+
for r_idx, r in enumerate(results):
|
|
532
|
+
if r.error or r.binary:
|
|
499
533
|
continue
|
|
500
534
|
|
|
535
|
+
m = r.match
|
|
501
536
|
abs_a = str(root_a / m.rel_path_a)
|
|
502
537
|
abs_b = str(root_b / m.rel_path_b)
|
|
503
538
|
ext = Path(m.rel_path_a).suffix.lower()
|
|
@@ -506,9 +541,9 @@ def run_pipeline(
|
|
|
506
541
|
text_b = read_file(abs_b, encoding=encoding)
|
|
507
542
|
if text_a is None or text_b is None:
|
|
508
543
|
continue
|
|
509
|
-
if
|
|
510
|
-
text_a =
|
|
511
|
-
text_b =
|
|
544
|
+
if strip_comments:
|
|
545
|
+
text_a = _strip_comments_fn(text_a, ext, squash_blanks=squash_blanks)
|
|
546
|
+
text_b = _strip_comments_fn(text_b, ext, squash_blanks=squash_blanks)
|
|
512
547
|
|
|
513
548
|
html_diff = generate_html_diff(
|
|
514
549
|
text_a, text_b,
|
|
@@ -522,7 +557,7 @@ def run_pipeline(
|
|
|
522
557
|
by_word=by_word,
|
|
523
558
|
detect_moved=detect_moved,
|
|
524
559
|
)
|
|
525
|
-
results[
|
|
560
|
+
results[r_idx] = DiffResult(
|
|
526
561
|
match=r.match,
|
|
527
562
|
ratio=r.ratio,
|
|
528
563
|
additions=r.additions,
|
|
@@ -536,16 +571,16 @@ def run_pipeline(
|
|
|
536
571
|
if sort_by:
|
|
537
572
|
reverse = sort_order == "desc"
|
|
538
573
|
if sort_by == "filename":
|
|
574
|
+
results.sort(
|
|
575
|
+
key=lambda r: PurePosixPath(r.match.rel_path_a).name.lower(),
|
|
576
|
+
reverse=reverse,
|
|
577
|
+
)
|
|
578
|
+
elif sort_by == "path":
|
|
539
579
|
results.sort(key=lambda r: r.match.rel_path_a.lower(), reverse=reverse)
|
|
580
|
+
elif sort_by == "similarity":
|
|
581
|
+
results.sort(key=lambda r: r.match.similarity, reverse=reverse)
|
|
540
582
|
elif sort_by == "ratio":
|
|
541
583
|
results.sort(key=lambda r: r.ratio, reverse=reverse)
|
|
542
|
-
elif sort_by == "size":
|
|
543
|
-
def _file_size(r: DiffResult) -> int:
|
|
544
|
-
try:
|
|
545
|
-
return os.path.getsize(str(root_a / r.match.rel_path_a))
|
|
546
|
-
except OSError:
|
|
547
|
-
return 0
|
|
548
|
-
results.sort(key=_file_size, reverse=reverse)
|
|
549
584
|
logger.info(" Sorted by %s (%s)", sort_by, sort_order)
|
|
550
585
|
|
|
551
586
|
# Deep Compare (only in deep mode)
|
|
@@ -569,9 +604,10 @@ def run_pipeline(
|
|
|
569
604
|
logger.info("Generating JSON report …")
|
|
570
605
|
_generate_json_report(
|
|
571
606
|
results, unmatched_a, unmatched_b,
|
|
572
|
-
dir_a, dir_b, by_word,
|
|
607
|
+
dir_a, dir_b, by_word, strip_comments,
|
|
573
608
|
deep_results, report_json,
|
|
574
609
|
metadata=metadata,
|
|
610
|
+
include_uncompared=include_uncompared,
|
|
575
611
|
)
|
|
576
612
|
|
|
577
613
|
# Markdown report
|
|
@@ -579,9 +615,10 @@ def run_pipeline(
|
|
|
579
615
|
logger.info("Generating Markdown report …")
|
|
580
616
|
_generate_markdown_report(
|
|
581
617
|
results, unmatched_a, unmatched_b,
|
|
582
|
-
dir_a, dir_b, by_word,
|
|
618
|
+
dir_a, dir_b, by_word, strip_comments,
|
|
583
619
|
deep_results, report_md,
|
|
584
620
|
metadata=metadata,
|
|
621
|
+
include_uncompared=include_uncompared,
|
|
585
622
|
)
|
|
586
623
|
|
|
587
624
|
# HTML report
|
|
@@ -589,10 +626,11 @@ def run_pipeline(
|
|
|
589
626
|
logger.info("Generating HTML report …")
|
|
590
627
|
_generate_html_report(
|
|
591
628
|
results, unmatched_a, unmatched_b,
|
|
592
|
-
dir_a, dir_b, by_word,
|
|
629
|
+
dir_a, dir_b, by_word, strip_comments,
|
|
593
630
|
deep_results, report_html, ln_col_width,
|
|
594
631
|
metadata=metadata,
|
|
595
632
|
hash_table_html=hash_table_html,
|
|
633
|
+
include_uncompared=include_uncompared,
|
|
596
634
|
)
|
|
597
635
|
|
|
598
636
|
# PDF report
|
|
@@ -600,7 +638,7 @@ def run_pipeline(
|
|
|
600
638
|
logger.info("Generating PDF report (divide-and-conquer) …")
|
|
601
639
|
_generate_pdf_report(
|
|
602
640
|
results, unmatched_a, unmatched_b,
|
|
603
|
-
dir_a, dir_b, by_word,
|
|
641
|
+
dir_a, dir_b, by_word, strip_comments,
|
|
604
642
|
deep_results, report_pdf,
|
|
605
643
|
no_merge=no_merge,
|
|
606
644
|
show_page_number=show_page_number,
|
|
@@ -614,6 +652,7 @@ def run_pipeline(
|
|
|
614
652
|
bates_prefix=bates_prefix,
|
|
615
653
|
bates_suffix=bates_suffix,
|
|
616
654
|
bates_start=bates_start,
|
|
655
|
+
include_uncompared=include_uncompared,
|
|
617
656
|
)
|
|
618
657
|
|
|
619
658
|
logger.info("Done (reports) ✓")
|
|
@@ -658,7 +697,7 @@ def _generate_pdf_report(
|
|
|
658
697
|
dir_a: str,
|
|
659
698
|
dir_b: str,
|
|
660
699
|
by_word: bool,
|
|
661
|
-
|
|
700
|
+
strip_comments: bool,
|
|
662
701
|
deep_results: list[DeepMatchResult] | None,
|
|
663
702
|
output_pdf: str,
|
|
664
703
|
*,
|
|
@@ -674,6 +713,7 @@ def _generate_pdf_report(
|
|
|
674
713
|
bates_prefix: str = "",
|
|
675
714
|
bates_suffix: str = "",
|
|
676
715
|
bates_start: int = 1,
|
|
716
|
+
include_uncompared: bool = True,
|
|
677
717
|
) -> None:
|
|
678
718
|
"""Generate PDF report with divide-and-conquer merging."""
|
|
679
719
|
if no_merge:
|
|
@@ -686,10 +726,11 @@ def _generate_pdf_report(
|
|
|
686
726
|
# (1) Cover page
|
|
687
727
|
cover_body = build_cover_body(
|
|
688
728
|
results, unmatched_a, unmatched_b,
|
|
689
|
-
dir_a, dir_b, by_word,
|
|
729
|
+
dir_a, dir_b, by_word, strip_comments,
|
|
690
730
|
deep_results=deep_results,
|
|
691
731
|
metadata=metadata,
|
|
692
732
|
hash_table_html=hash_table_html,
|
|
733
|
+
include_uncompared=include_uncompared,
|
|
693
734
|
)
|
|
694
735
|
cover_html = _html_wrap("Diffinite — Cover", cover_body)
|
|
695
736
|
if no_merge:
|
|
@@ -700,9 +741,43 @@ def _generate_pdf_report(
|
|
|
700
741
|
if cover_ok:
|
|
701
742
|
logger.info(" Cover page → OK")
|
|
702
743
|
|
|
744
|
+
# ── Pre-flight: warn about large diffs that may slow PDF ────
|
|
745
|
+
# 500 KB of HTML ≈ 500+ source lines in side-by-side diff table.
|
|
746
|
+
# xhtml2pdf layout becomes noticeably slow above this threshold.
|
|
747
|
+
_LARGE_DIFF_BYTES = 500_000
|
|
748
|
+
large_files = [
|
|
749
|
+
(i, r) for i, r in enumerate(results, 1)
|
|
750
|
+
if not r.error and not r.binary and len(r.html_diff) > _LARGE_DIFF_BYTES
|
|
751
|
+
]
|
|
752
|
+
if large_files:
|
|
753
|
+
logger.warning(
|
|
754
|
+
"⚠ %d file(s) have large diffs — PDF rendering may be "
|
|
755
|
+
"slow or hang:", len(large_files),
|
|
756
|
+
)
|
|
757
|
+
for i, r in large_files:
|
|
758
|
+
size_kb = len(r.html_diff) / 1024
|
|
759
|
+
logger.warning(
|
|
760
|
+
" %d. %s (%.0f KB HTML)",
|
|
761
|
+
i, r.match.rel_path_a, size_kb,
|
|
762
|
+
)
|
|
763
|
+
logger.warning(
|
|
764
|
+
" Consider: --collapse-identical (shrink diffs), "
|
|
765
|
+
"--no-merge (split PDFs), or --report-html (fast export)."
|
|
766
|
+
)
|
|
767
|
+
|
|
703
768
|
# (2) Per-file diff pages
|
|
704
769
|
diff_pdf_pairs: list[tuple[str, DiffResult]] = []
|
|
705
770
|
for idx, r in enumerate(results, 1):
|
|
771
|
+
# Per-file warning for large diffs
|
|
772
|
+
if (not r.error and not r.binary
|
|
773
|
+
and len(r.html_diff) > _LARGE_DIFF_BYTES):
|
|
774
|
+
size_kb = len(r.html_diff) / 1024
|
|
775
|
+
logger.warning(
|
|
776
|
+
"⚠ Rendering PDF %d/%d (%s, %.0f KB) — "
|
|
777
|
+
"this may take a while…",
|
|
778
|
+
idx, len(results), r.match.rel_path_a, size_kb,
|
|
779
|
+
)
|
|
780
|
+
|
|
706
781
|
diff_html = build_diff_page_html(
|
|
707
782
|
r, idx, unit,
|
|
708
783
|
show_page_number=show_page_number,
|
|
@@ -69,7 +69,7 @@ class TestDeepCompareArgs:
|
|
|
69
69
|
"--mode", "deep",
|
|
70
70
|
"--k-gram", "5",
|
|
71
71
|
"--window", "3",
|
|
72
|
-
"--threshold-deep", "
|
|
72
|
+
"--threshold-deep", "20",
|
|
73
73
|
])
|
|
74
74
|
|
|
75
75
|
|
|
@@ -84,7 +84,7 @@ class TestAnnotationsAndReportFlags:
|
|
|
84
84
|
"-o", str(tmp_path / "out.pdf"),
|
|
85
85
|
"--collapse-identical",
|
|
86
86
|
"--page-number", "--file-number",
|
|
87
|
-
"--bates-number", "--
|
|
87
|
+
"--bates-number", "--filename",
|
|
88
88
|
])
|
|
89
89
|
|
|
90
90
|
def test_threshold_accepts_value(self, tmp_path):
|
|
@@ -107,3 +107,41 @@ class TestAnnotationsAndReportFlags:
|
|
|
107
107
|
])
|
|
108
108
|
from pathlib import Path
|
|
109
109
|
assert Path(json_path).exists()
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class TestIncludeUncomparedFlag:
|
|
113
|
+
"""Verify --include-uncompared / --no-include-uncompared flags."""
|
|
114
|
+
|
|
115
|
+
def test_include_uncompared_default_true(self, tmp_path):
|
|
116
|
+
"""Default behavior includes uncompared files."""
|
|
117
|
+
d_a = tmp_path / "a"; d_a.mkdir()
|
|
118
|
+
d_b = tmp_path / "b"; d_b.mkdir()
|
|
119
|
+
(d_a / "only_a.py").write_text("x = 1\n", encoding="utf-8")
|
|
120
|
+
json_path = str(tmp_path / "out.json")
|
|
121
|
+
main([
|
|
122
|
+
str(d_a), str(d_b),
|
|
123
|
+
"--report-json", json_path,
|
|
124
|
+
])
|
|
125
|
+
import json
|
|
126
|
+
from pathlib import Path
|
|
127
|
+
data = json.loads(Path(json_path).read_text(encoding="utf-8"))
|
|
128
|
+
assert "only_a.py" in data["unmatched_a"]
|
|
129
|
+
|
|
130
|
+
def test_no_include_uncompared_excludes(self, tmp_path):
|
|
131
|
+
"""--no-include-uncompared excludes unmatched file lists."""
|
|
132
|
+
d_a = tmp_path / "a"; d_a.mkdir()
|
|
133
|
+
d_b = tmp_path / "b"; d_b.mkdir()
|
|
134
|
+
(d_a / "only_a.py").write_text("x = 1\n", encoding="utf-8")
|
|
135
|
+
json_path = str(tmp_path / "out.json")
|
|
136
|
+
main([
|
|
137
|
+
str(d_a), str(d_b),
|
|
138
|
+
"--report-json", json_path,
|
|
139
|
+
"--no-include-uncompared",
|
|
140
|
+
])
|
|
141
|
+
import json
|
|
142
|
+
from pathlib import Path
|
|
143
|
+
data = json.loads(Path(json_path).read_text(encoding="utf-8"))
|
|
144
|
+
assert data["unmatched_a"] == []
|
|
145
|
+
assert data["unmatched_b"] == []
|
|
146
|
+
# Summary counts should still show the real values
|
|
147
|
+
assert data["summary"]["unmatched_a_count"] == 1
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import pytest
|
|
4
4
|
|
|
5
5
|
from diffinite.models import DiffResult, FileMatch, DeepMatchResult
|
|
6
|
-
from diffinite.pdf_gen import build_cover_body, build_diff_page_html
|
|
6
|
+
from diffinite.pdf_gen import build_cover_body, build_diff_page_html, _break_path
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
# ---------------------------------------------------------------------------
|
|
@@ -37,7 +37,7 @@ def _cover(results=None, *, deep_results=None):
|
|
|
37
37
|
dir_a="dir_a",
|
|
38
38
|
dir_b="dir_b",
|
|
39
39
|
by_word=False,
|
|
40
|
-
|
|
40
|
+
strip_comments=False,
|
|
41
41
|
deep_results=deep_results,
|
|
42
42
|
)
|
|
43
43
|
|
|
@@ -55,8 +55,9 @@ class TestBuildCoverHtml:
|
|
|
55
55
|
|
|
56
56
|
def test_contains_file_names(self):
|
|
57
57
|
html = _cover()
|
|
58
|
-
|
|
59
|
-
assert "
|
|
58
|
+
# _break_path inserts ​ after path separators (., /, \, _)
|
|
59
|
+
assert "handler." in html
|
|
60
|
+
assert "looper." in html
|
|
60
61
|
|
|
61
62
|
def test_contains_ratio(self):
|
|
62
63
|
html = _cover()
|
|
@@ -79,10 +80,10 @@ class TestBuildCoverHtml:
|
|
|
79
80
|
dir_a="left",
|
|
80
81
|
dir_b="right",
|
|
81
82
|
by_word=False,
|
|
82
|
-
|
|
83
|
+
strip_comments=False,
|
|
83
84
|
)
|
|
84
|
-
|
|
85
|
-
assert "
|
|
85
|
+
# _break_path inserts ​ after separators, so check partial strings
|
|
86
|
+
assert "orphan" in html
|
|
86
87
|
|
|
87
88
|
def test_deep_results_without_channels(self):
|
|
88
89
|
deep = [
|
|
@@ -92,8 +93,8 @@ class TestBuildCoverHtml:
|
|
|
92
93
|
),
|
|
93
94
|
]
|
|
94
95
|
html = _cover(deep_results=deep)
|
|
95
|
-
assert "foo.
|
|
96
|
-
assert "bar.
|
|
96
|
+
assert "foo." in html
|
|
97
|
+
assert "bar." in html
|
|
97
98
|
|
|
98
99
|
def test_deep_results_display(self):
|
|
99
100
|
deep = [
|
|
@@ -104,8 +105,8 @@ class TestBuildCoverHtml:
|
|
|
104
105
|
),
|
|
105
106
|
]
|
|
106
107
|
html = _cover(deep_results=deep)
|
|
107
|
-
assert "foo.
|
|
108
|
-
assert "bar.
|
|
108
|
+
assert "foo." in html
|
|
109
|
+
assert "bar." in html
|
|
109
110
|
assert "50" in html # shared hashes
|
|
110
111
|
|
|
111
112
|
|
|
@@ -155,3 +156,67 @@ class TestBuildDiffPageHtml:
|
|
|
155
156
|
show_filename=True,
|
|
156
157
|
)
|
|
157
158
|
assert "annotated.py" in html
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
# _break_path tests
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
class TestBreakPath:
|
|
165
|
+
"""Verify _break_path inserts zero-width spaces at path separators."""
|
|
166
|
+
|
|
167
|
+
def test_slash(self):
|
|
168
|
+
result = _break_path("src/main/java")
|
|
169
|
+
assert "src/​main/​java" == result
|
|
170
|
+
|
|
171
|
+
def test_backslash(self):
|
|
172
|
+
result = _break_path("src\\main\\java")
|
|
173
|
+
assert "src\\​main\\​java" == result
|
|
174
|
+
|
|
175
|
+
def test_dot(self):
|
|
176
|
+
result = _break_path("handler.java")
|
|
177
|
+
assert "handler.​java" == result
|
|
178
|
+
|
|
179
|
+
def test_underscore(self):
|
|
180
|
+
result = _break_path("my_file_name")
|
|
181
|
+
assert "my_​file_​name" == result
|
|
182
|
+
|
|
183
|
+
def test_combined(self):
|
|
184
|
+
result = _break_path("src/com/example/my_handler.java")
|
|
185
|
+
assert "​" in result
|
|
186
|
+
|
|
187
|
+
def test_empty(self):
|
|
188
|
+
assert _break_path("") == ""
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# ---------------------------------------------------------------------------
|
|
192
|
+
# include_uncompared tests
|
|
193
|
+
# ---------------------------------------------------------------------------
|
|
194
|
+
class TestIncludeUncompared:
|
|
195
|
+
"""Verify include_uncompared parameter on build_cover_body."""
|
|
196
|
+
|
|
197
|
+
def test_excludes_unmatched_when_false(self):
|
|
198
|
+
html = build_cover_body(
|
|
199
|
+
_make_results(),
|
|
200
|
+
unmatched_a=["orphan_a.py"],
|
|
201
|
+
unmatched_b=["orphan_b.py"],
|
|
202
|
+
dir_a="left",
|
|
203
|
+
dir_b="right",
|
|
204
|
+
by_word=False,
|
|
205
|
+
strip_comments=False,
|
|
206
|
+
include_uncompared=False,
|
|
207
|
+
)
|
|
208
|
+
assert "orphan" not in html
|
|
209
|
+
assert "Unmatched Files" not in html
|
|
210
|
+
|
|
211
|
+
def test_includes_unmatched_by_default(self):
|
|
212
|
+
html = build_cover_body(
|
|
213
|
+
_make_results(),
|
|
214
|
+
unmatched_a=["orphan_a.py"],
|
|
215
|
+
unmatched_b=["orphan_b.py"],
|
|
216
|
+
dir_a="left",
|
|
217
|
+
dir_b="right",
|
|
218
|
+
by_word=False,
|
|
219
|
+
strip_comments=False,
|
|
220
|
+
)
|
|
221
|
+
assert "orphan" in html
|
|
222
|
+
assert "Unmatched Files" in html
|
|
@@ -23,9 +23,9 @@ class TestPipelineE2E:
|
|
|
23
23
|
run_pipeline(
|
|
24
24
|
dir_a=EXAMPLE_LEFT,
|
|
25
25
|
dir_b=EXAMPLE_RIGHT,
|
|
26
|
-
|
|
26
|
+
report_pdf=output,
|
|
27
27
|
by_word=False,
|
|
28
|
-
|
|
28
|
+
strip_comments=True,
|
|
29
29
|
)
|
|
30
30
|
assert Path(output).exists()
|
|
31
31
|
assert Path(output).stat().st_size > 0
|
|
@@ -37,7 +37,7 @@ class TestPipelineE2E:
|
|
|
37
37
|
run_pipeline(
|
|
38
38
|
dir_a=EXAMPLE_LEFT,
|
|
39
39
|
dir_b=EXAMPLE_RIGHT,
|
|
40
|
-
|
|
40
|
+
report_pdf=output,
|
|
41
41
|
exec_mode="deep",
|
|
42
42
|
workers=2,
|
|
43
43
|
kgram_size=5,
|
|
@@ -53,7 +53,7 @@ class TestPipelineE2E:
|
|
|
53
53
|
run_pipeline(
|
|
54
54
|
dir_a=EXAMPLE_LEFT,
|
|
55
55
|
dir_b=EXAMPLE_RIGHT,
|
|
56
|
-
|
|
56
|
+
report_pdf=output,
|
|
57
57
|
no_merge=True,
|
|
58
58
|
)
|
|
59
59
|
files_dir = tmp_path / "individual_files"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|