PyPI - diffinite - Versions diffs - 0.9.6__tar.gz → 0.10.0__tar.gz - Mend

diffinite 0.9.6 (tar.gz) → 0.10.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)

{diffinite-0.9.6/src/diffinite.egg-info → diffinite-0.10.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffinite
-Version: 0.9.6
+Version: 0.10.0
 Summary: Forensic source-code comparison tool — Winnowing fingerprints and professional PDF reports for IP litigation & code audit
 Author: nash-dir
 License: Apache-2.0

{diffinite-0.9.6 → diffinite-0.10.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "diffinite"
-version = "0.9.6"
+version = "0.10.0"
 description = "Forensic source-code comparison tool — Winnowing fingerprints and professional PDF reports for IP litigation & code audit"
 readme = "README.md"
 license = {text = "Apache-2.0"}

{diffinite-0.9.6 → diffinite-0.10.0}/src/diffinite/cli.py RENAMED Viewed

@@ -58,14 +58,6 @@ def main(argv: list[str] | None = None) -> None:
         ),
     )
-    # ── Output ────────────────────────────────────────────────────────
-    parser.add_argument(
-        "--output-pdf", "-o",
-        default="report.pdf",
-        help="Output PDF file path (default: report.pdf). "
-             "Ignored when any --report-* option is specified.",
-    )
     # ── Comparison options ────────────────────────────────────────────
     parser.add_argument(
         "--by-word",
@@ -74,7 +66,7 @@ def main(argv: list[str] | None = None) -> None:
         help="Compare by word instead of by line",
     )
     parser.add_argument(
-        "--no-comments",
+        "--strip-comments",
         action="store_true",
         default=False,
         help="Strip comments before comparison (uses 2-pass parser)",
@@ -85,7 +77,7 @@ def main(argv: list[str] | None = None) -> None:
         default=False,
         help=(
             "Collapse runs of 3+ blank lines after comment stripping. "
-            "Only effective with --no-comments. WARNING: changes line "
+            "Only effective with --strip-comments. WARNING: changes line "
             "numbers — do not use for forensic line-tracing."
         ),
     )
@@ -107,13 +99,14 @@ def main(argv: list[str] | None = None) -> None:
     )
     parser.add_argument(
         "--sort-by",
-        choices=["filename", "size", "ratio"],
+        choices=["filename", "path", "similarity", "ratio"],
         default=None,
         dest="sort_by",
         help=(
             "Sort matched file pairs in the report. "
-            "'filename' sorts by file path, 'size' by file size, "
-            "'ratio' by similarity ratio. Default: insertion order (no sort)."
+            "'filename' sorts by file basename, 'path' by full path, "
+            "'similarity' by name match score, 'ratio' by content "
+            "similarity. Default: insertion order (no sort)."
         ),
     )
     parser.add_argument(
@@ -179,7 +172,7 @@ def main(argv: list[str] | None = None) -> None:
         ),
     )
     parser.add_argument(
-        "--show-filename",
+        "--filename",
         action="store_true",
         default=False,
         help="Show the filename at the top-right of each page",
@@ -204,15 +197,35 @@ def main(argv: list[str] | None = None) -> None:
             "plain delete/add. Works in both simple and deep modes."
         ),
     )
+    parser.add_argument(
+        "--include-uncompared",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help=(
+            "Include unmatched (uncompared) file lists in the report. "
+            "Use --no-include-uncompared to hide them (default: included)."
+        ),
+    )
+    parser.add_argument(
+        "--binary-handling",
+        choices=["exclude", "hash", "error"],
+        default="hash",
+        dest="binary_handling",
+        help=(
+            "How to handle binary (non-decodable) files: "
+            "'exclude' skips them entirely, 'hash' shows SHA-256 match "
+            "status, 'error' shows decode error (default: hash)."
+        ),
+    )
     # ── Report format options ─────────────────────────────────────────
     format_group = parser.add_argument_group(
         "Report Format",
         "Output format(s). Multiple can be combined. "
-        "If none specified, defaults to --output-pdf.",
+        "If none specified, defaults to PDF (report.pdf).",
     )
     format_group.add_argument(
-        "--report-pdf",
+        "--report-pdf", "-o",
         metavar="PATH",
         default=None,
         help="Generate a merged PDF report at the given path",
@@ -243,7 +256,7 @@ def main(argv: list[str] | None = None) -> None:
         "'--mode deep').",
     )
     deep_group.add_argument(
-        "--k-gram", "--kgram-size",
+        "--k-gram",
         type=int,
         default=DEFAULT_K,
         dest="k_gram",
@@ -253,7 +266,7 @@ def main(argv: list[str] | None = None) -> None:
         ),
     )
     deep_group.add_argument(
-        "--window", "--window-size",
+        "--window",
         type=int,
         default=DEFAULT_W,
         dest="window",
@@ -263,12 +276,12 @@ def main(argv: list[str] | None = None) -> None:
         ),
     )
     deep_group.add_argument(
-        "--threshold-deep", "--min-jaccard",
+        "--threshold-deep",
         type=float,
-        default=0.05,
+        default=5,
         dest="threshold_deep",
         help=(
-            "Minimum Jaccard similarity to report (default: 0.05). "
+            "Minimum Jaccard similarity 0–100 to report (default: 5). "
             "Below 5%% is considered noise."
         ),
     )
@@ -336,38 +349,45 @@ def main(argv: list[str] | None = None) -> None:
     args = parser.parse_args(argv)
+    # Convert threshold-deep from 0-100 (user-facing) to 0-1 (internal)
+    min_jaccard_internal = args.threshold_deep / 100.0
     # Build analysis metadata (embedded in every report for transparency)
     metadata = AnalysisMetadata(
         exec_mode=args.mode,
         k=args.k_gram,
         w=args.window,
-        threshold=args.threshold_deep,
+        threshold=args.threshold_deep,  # 0-100 scale in metadata
         autojunk=not args.no_autojunk,
     )
     # Resolve encoding
     encoding = args.encoding if args.encoding.lower() != "auto" else None
+    # Resolve default PDF output if no --report-* specified
+    report_pdf = args.report_pdf
+    if report_pdf is None and args.report_html is None and args.report_md is None and args.report_json is None:
+        report_pdf = "report.pdf"
     run_pipeline(
         dir_a=args.dir_a,
         dir_b=args.dir_b,
         by_word=args.by_word,
-        compare_comment=not args.no_comments,
+        strip_comments=args.strip_comments,
         squash_blanks=args.squash_blanks,
-        output_pdf=args.output_pdf,
         threshold=args.threshold,
         no_merge=args.no_merge,
         show_page_number=args.page_number,
         show_file_number=args.file_number,
         show_bates_number=args.bates_number,
-        show_filename=args.show_filename,
+        show_filename=args.filename,
         collapse_identical=args.collapse_identical,
         # Execution mode & deep compare
         exec_mode=args.mode,
         workers=args.workers,
         kgram_size=args.k_gram,
         window_size=args.window,
-        min_jaccard=args.threshold_deep,
+        min_jaccard=min_jaccard_internal,
         normalize=args.normalize,
         metadata=metadata,
         # Forensic options
@@ -377,7 +397,7 @@ def main(argv: list[str] | None = None) -> None:
         embed_hash=args.embed_hash,
         bundle_path=args.bundle_path,
         # Multi-format output
-        report_pdf=args.report_pdf,
+        report_pdf=report_pdf,
         report_html=args.report_html,
         report_md=args.report_md,
         report_json=args.report_json,
@@ -388,10 +408,14 @@ def main(argv: list[str] | None = None) -> None:
         sort_order=args.sort_order,
         # Moved block detection
         detect_moved=args.detect_moved,
+        # Uncompared files
+        include_uncompared=args.include_uncompared,
         # Bates prefix/suffix
         bates_prefix=args.bates_prefix,
         bates_suffix=args.bates_suffix,
         bates_start=args.bates_start,
+        # Binary handling
+        binary_handling=args.binary_handling,
     )

{diffinite-0.9.6 → diffinite-0.10.0}/src/diffinite/models.py RENAMED Viewed

@@ -98,6 +98,12 @@ class DiffResult:
     error: Optional[str] = None
     """None이 아니면 디코딩/읽기 실패 등의 에러 메시지. 이 경우 위 필드는 0/빈값."""
+    binary: bool = False
+    """True if file pair was detected as binary (non-decodable)."""
+    hash_match: Optional[bool] = None
+    """SHA-256 match status for binary files. None for text files."""
 # ──────────────────────────────────────────────────────────────────────
 # Winnowing 핑거프린트 엔트리

{diffinite-0.9.6 → diffinite-0.10.0}/src/diffinite/pdf_gen.py RENAMED Viewed

@@ -82,6 +82,7 @@ table.summary th, table.summary td {
     border: 1px solid #ccc;
     padding: 5px 8px;
     text-align: left;
+    word-break: break-all;
 }
 table.summary th {
     background: #0078d4;
@@ -178,6 +179,7 @@ table.deep th, table.deep td {
     border: 1px solid #ccc;
     padding: 4px 6px;
     text-align: left;
+    word-break: break-all;
 }
 table.deep th {
     background: #6c5ce7;
@@ -212,6 +214,27 @@ table.deep tr:nth-child(even) {
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
+def _break_path(path_str: str) -> str:
+    """Insert zero-width spaces after path separator symbols for line-breaking.
+    xhtml2pdf는 긴 파일 경로를 자동 줄바꿈하지 못하므로,
+    경로 구분자(/, \\, ., _) 뒤에 zero-width space를 삽입하여
+    자연스러운 줄바꿈 지점을 제공한다.
+    Args:
+        path_str: HTML-escaped 경로 문자열.
+    Returns:
+        줄바꿈 힌트가 삽입된 경로 문자열.
+    """
+    # HTML entity for zero-width space
+    zwsp = "&#8203;"
+    result = path_str
+    for sep in ("/", "\\", ".", "_"):
+        result = result.replace(sep, sep + zwsp)
+    return result
 def _ratio_badge(ratio: float) -> str:
     """Return an HTML badge span for a similarity ratio."""
     pct = ratio * 100
@@ -251,7 +274,7 @@ def build_hash_table_html(
             parts.append(
                 f'<tr>'
                 f'<td>{idx}</td>'
-                f'<td>{html.escape(h.rel_path)}</td>'
+                f'<td>{_break_path(html.escape(h.rel_path))}</td>'
                 f'<td style="font-family:monospace;font-size:8px">{short_hash}</td>'
                 f'<td>{h.size_bytes:,}</td>'
                 f'</tr>\n'
@@ -368,17 +391,18 @@ def build_cover_body(
     dir_a: str,
     dir_b: str,
     by_word: bool,
-    compare_comment: bool,
+    strip_comments: bool,
     *,
     deep_results: Optional[list[DeepMatchResult]] = None,
     metadata: Optional["AnalysisMetadata"] = None,
     hash_table_html: Optional[str] = None,
+    include_uncompared: bool = True,
 ) -> str:
     """Build the cover-page body fragment (no DOCTYPE/html/head wrapper)."""
     from diffinite.models import AnalysisMetadata as _AM  # avoid circular at module level
     unit = "word" if by_word else "line"
-    comment_mode = "included" if compare_comment else "excluded"
+    comment_mode = "stripped" if strip_comments else "included"
     # Analysis metadata banner (transparency)
     meta_html = ""
@@ -396,26 +420,43 @@ def build_cover_body(
     summary_rows = ""
     for idx, r in enumerate(results, 1):
-        badge = _ratio_badge(r.ratio)
-        err = (
-            f' <em style="color:red">({html.escape(r.error)})</em>'
-            if r.error else ""
-        )
-        summary_rows += (
-            f"<tr>"
-            f"<td>{idx}</td>"
-            f"<td>{html.escape(r.match.rel_path_a)}</td>"
-            f"<td>{html.escape(r.match.rel_path_b)}</td>"
-            f"<td>{r.match.similarity:.1f}</td>"
-            f"<td>{badge}{err}</td>"
-            f"<td style='color:green'>+{r.additions}</td>"
-            f"<td style='color:red'>-{r.deletions}</td>"
-            f"</tr>\n"
-        )
+        if r.binary:
+            if r.hash_match:
+                status = '<span class="badge badge-high">✓ Binary Match</span>'
+            else:
+                status = '<span class="badge badge-low">✗ Binary Mismatch</span>'
+            summary_rows += (
+                f"<tr>"
+                f"<td>{idx}</td>"
+                f"<td>{_break_path(html.escape(r.match.rel_path_a))}</td>"
+                f"<td>{_break_path(html.escape(r.match.rel_path_b))}</td>"
+                f"<td>{r.match.similarity:.1f}</td>"
+                f"<td>{status}</td>"
+                f"<td>—</td>"
+                f"<td>—</td>"
+                f"</tr>\n"
+            )
+        else:
+            badge = _ratio_badge(r.ratio)
+            err = (
+                f' <em style="color:red">({html.escape(r.error)})</em>'
+                if r.error else ""
+            )
+            summary_rows += (
+                f"<tr>"
+                f"<td>{idx}</td>"
+                f"<td>{_break_path(html.escape(r.match.rel_path_a))}</td>"
+                f"<td>{_break_path(html.escape(r.match.rel_path_b))}</td>"
+                f"<td>{r.match.similarity:.1f}</td>"
+                f"<td>{badge}{err}</td>"
+                f"<td style='color:green'>+{r.additions}</td>"
+                f"<td style='color:red'>-{r.deletions}</td>"
+                f"</tr>\n"
+            )
-    # Unmatched lists
+    # Unmatched lists (only when include_uncompared is True)
     unmatched_html = ""
-    if unmatched_a or unmatched_b:
+    if include_uncompared and (unmatched_a or unmatched_b):
         unmatched_html += "<h2>Unmatched Files</h2>\n"
         if unmatched_a:
             unmatched_html += (
@@ -423,7 +464,7 @@ def build_cover_body(
                 "<ul class='unmatched'>\n"
             )
             for f in unmatched_a:
-                unmatched_html += f"  <li>{html.escape(f)}</li>\n"
+                unmatched_html += f"  <li>{_break_path(html.escape(f))}</li>\n"
             unmatched_html += "</ul>\n"
         if unmatched_b:
             unmatched_html += (
@@ -431,7 +472,7 @@ def build_cover_body(
                 "<ul class='unmatched'>\n"
             )
             for f in unmatched_b:
-                unmatched_html += f"  <li>{html.escape(f)}</li>\n"
+                unmatched_html += f"  <li>{_break_path(html.escape(f))}</li>\n"
             unmatched_html += "</ul>\n"
     deep_html = ""
@@ -447,8 +488,8 @@ def build_cover_body(
                 jbadge = _ratio_badge(jaccard)
                 deep_html += (
                     f"<tr>"
-                    f"<td>{html.escape(dr.file_a)}</td>"
-                    f"<td>{html.escape(b_file)}</td>"
+                    f"<td>{_break_path(html.escape(dr.file_a))}</td>"
+                    f"<td>{_break_path(html.escape(b_file))}</td>"
                     f"<td>{shared}</td>"
                     f"<td>{jbadge}</td>"
                     f"</tr>\n"

{diffinite-0.9.6 → diffinite-0.10.0}/src/diffinite/pipeline.py RENAMED Viewed

@@ -36,19 +36,20 @@ import json
 import logging
 import os
 import tempfile
-from pathlib import Path
+from pathlib import Path, PurePosixPath
 from diffinite.collector import collect_files, match_files, FUZZY_THRESHOLD
 from diffinite.deep_compare import run_deep_compare
 from diffinite.differ import compute_diff, generate_html_diff, read_file
 from diffinite.evidence import (
+    _sha256_file,
     compute_file_hashes,
     create_evidence_bundle,
     write_manifest,
 )
 from diffinite.fingerprint import DEFAULT_K, DEFAULT_W
 from diffinite.models import AnalysisMetadata, DiffResult, DeepMatchResult
-from diffinite.parser import strip_comments
+from diffinite.parser import strip_comments as _strip_comments_fn
 from diffinite.pdf_gen import (
     _html_wrap,
     add_bates_numbers,
@@ -113,15 +114,16 @@ def _generate_markdown_report(
     dir_a: str,
     dir_b: str,
     by_word: bool,
-    compare_comment: bool,
+    strip_comments: bool,
     deep_results: list[DeepMatchResult] | None,
     output_path: str,
     *,
     metadata: AnalysisMetadata | None = None,
+    include_uncompared: bool = True,
 ) -> None:
     """Generate a Markdown summary report."""
     unit = "word" if by_word else "line"
-    comment_mode = "included" if compare_comment else "excluded"
+    comment_mode = "stripped" if strip_comments else "included"
     lines: list[str] = []
     lines.append("# Diffinite — Source Code Diff Report\n")
@@ -141,16 +143,24 @@ def _generate_markdown_report(
     lines.append("| # | File A | File B | Name Sim. | Match | +Added | −Deleted |")
     lines.append("|---|--------|--------|:---------:|:-----:|:------:|:--------:|")
     for idx, r in enumerate(results, 1):
-        pct = r.ratio * 100
-        err = f" ⚠ {r.error}" if r.error else ""
-        lines.append(
-            f"| {idx} | `{r.match.rel_path_a}` | `{r.match.rel_path_b}` "
-            f"| {r.match.similarity:.1f} | {pct:.1f}%{err} "
-            f"| +{r.additions} | −{r.deletions} |"
-        )
+        if r.binary:
+            status = "✓ Match" if r.hash_match else "✗ Mismatch"
+            lines.append(
+                f"| {idx} | `{r.match.rel_path_a}` | `{r.match.rel_path_b}` "
+                f"| {r.match.similarity:.1f} | [Binary: {status}] "
+                f"| — | — |"
+            )
+        else:
+            pct = r.ratio * 100
+            err = f" ⚠ {r.error}" if r.error else ""
+            lines.append(
+                f"| {idx} | `{r.match.rel_path_a}` | `{r.match.rel_path_b}` "
+                f"| {r.match.similarity:.1f} | {pct:.1f}%{err} "
+                f"| +{r.additions} | −{r.deletions} |"
+            )
     # Unmatched
-    if unmatched_a or unmatched_b:
+    if include_uncompared and (unmatched_a or unmatched_b):
         lines.append("\n## Unmatched Files\n")
         if unmatched_a:
             lines.append(f"### Only in A (`{dir_a}`)\n")
@@ -188,11 +198,12 @@ def _generate_json_report(
     dir_a: str,
     dir_b: str,
     by_word: bool,
-    compare_comment: bool,
+    strip_comments: bool,
     deep_results: list[DeepMatchResult] | None,
     output_path: str,
     *,
     metadata: AnalysisMetadata | None = None,
+    include_uncompared: bool = True,
 ) -> None:
     """Generate a JSON report for programmatic consumption.
@@ -201,7 +212,7 @@ def _generate_json_report(
     re-running the pipeline.
     """
     unit = "word" if by_word else "line"
-    comment_mode = "included" if compare_comment else "excluded"
+    comment_mode = "stripped" if strip_comments else "included"
     meta_dict = None
     if metadata is not None:
@@ -215,7 +226,7 @@ def _generate_json_report(
     result_list = []
     for r in results:
-        result_list.append({
+        entry = {
             "file_a": r.match.rel_path_a,
             "file_b": r.match.rel_path_b,
             "name_similarity": r.match.similarity,
@@ -224,7 +235,11 @@ def _generate_json_report(
             "deletions": r.deletions,
             "html_diff": r.html_diff,
             "error": r.error,
-        })
+            "binary": r.binary,
+        }
+        if r.binary:
+            entry["hash_match"] = r.hash_match
+        result_list.append(entry)
     deep_list = None
     if deep_results is not None:
@@ -251,13 +266,13 @@ def _generate_json_report(
         "comment_mode": comment_mode,
         "summary": {
             "matched_pairs": len(results),
-            "unmatched_a": len(unmatched_a),
-            "unmatched_b": len(unmatched_b),
+            "unmatched_a_count": len(unmatched_a),
+            "unmatched_b_count": len(unmatched_b),
         },
         "results": result_list,
         "deep_results": deep_list,
-        "unmatched_a": unmatched_a,
-        "unmatched_b": unmatched_b,
+        "unmatched_a": unmatched_a if include_uncompared else [],
+        "unmatched_b": unmatched_b if include_uncompared else [],
     }
     out = Path(output_path)
@@ -276,21 +291,23 @@ def _generate_html_report(
     dir_a: str,
     dir_b: str,
     by_word: bool,
-    compare_comment: bool,
+    strip_comments: bool,
     deep_results: list[DeepMatchResult] | None,
     output_path: str,
     ln_col_width: int = 28,
     *,
     metadata: AnalysisMetadata | None = None,
     hash_table_html: str | None = None,
+    include_uncompared: bool = True,
 ) -> None:
     """Generate a standalone HTML report with all diffs inline."""
     cover_html_body = build_cover_body(
         results, unmatched_a, unmatched_b,
-        dir_a, dir_b, by_word, compare_comment,
+        dir_a, dir_b, by_word, strip_comments,
         deep_results=deep_results,
         metadata=metadata,
         hash_table_html=hash_table_html,
+        include_uncompared=include_uncompared,
     )
     # Append all inline diffs
@@ -346,9 +363,8 @@ def run_pipeline(
     dir_a: str,
     dir_b: str,
     by_word: bool = False,
-    compare_comment: bool = True,
+    strip_comments: bool = False,
     squash_blanks: bool = False,
-    output_pdf: str = "report.pdf",
     threshold: float = FUZZY_THRESHOLD,
     *,
     no_merge: bool = False,
@@ -384,10 +400,14 @@ def run_pipeline(
     sort_order: str = "asc",
     # Moved block detection
     detect_moved: bool = False,
+    # Uncompared files
+    include_uncompared: bool = True,
     # Bates prefix/suffix
     bates_prefix: str = "",
     bates_suffix: str = "",
     bates_start: int = 1,
+    # Binary handling
+    binary_handling: str = "hash",
 ) -> None:
     """Execute the full diff-to-report pipeline.
@@ -410,7 +430,7 @@ def run_pipeline(
     """
     # Determine effective output paths
     if report_pdf is None and report_html is None and report_md is None and report_json is None:
-        report_pdf = output_pdf
+        report_pdf = "report.pdf"
     # Build default metadata if caller didn't provide one
     if metadata is None:
@@ -463,15 +483,30 @@ def run_pipeline(
         text_b = read_file(abs_b, encoding=encoding)
         if text_a is None or text_b is None:
-            results.append(DiffResult(
-                match=m, ratio=0.0, additions=0, deletions=0,
-                html_diff="", error="Could not decode one or both files",
-            ))
+            if binary_handling == "exclude":
+                continue
+            elif binary_handling == "hash":
+                hash_a = _sha256_file(str(root_a / m.rel_path_a))
+                hash_b = _sha256_file(str(root_b / m.rel_path_b))
+                hash_match = hash_a == hash_b
+                results.append(DiffResult(
+                    match=m,
+                    ratio=1.0 if hash_match else 0.0,
+                    additions=0, deletions=0,
+                    html_diff="",
+                    binary=True,
+                    hash_match=hash_match,
+                ))
+            else:  # "error"
+                results.append(DiffResult(
+                    match=m, ratio=0.0, additions=0, deletions=0,
+                    html_diff="", error="Could not decode one or both files",
+                ))
             continue
-        if not compare_comment:
-            text_a = strip_comments(text_a, ext, squash_blanks=squash_blanks)
-            text_b = strip_comments(text_b, ext, squash_blanks=squash_blanks)
+        if strip_comments:
+            text_a = _strip_comments_fn(text_a, ext, squash_blanks=squash_blanks)
+            text_b = _strip_comments_fn(text_b, ext, squash_blanks=squash_blanks)
         all_line_counts.append(text_a.count("\n") + 1)
         all_line_counts.append(text_b.count("\n") + 1)
@@ -493,11 +528,11 @@ def run_pipeline(
                 ln_col_width, max(all_line_counts) if all_line_counts else 0)
     # Generate HTML diffs with unified column width
-    for m_idx, m in enumerate(matches):
-        r = results[m_idx]
-        if r.error:
+    for r_idx, r in enumerate(results):
+        if r.error or r.binary:
             continue
+        m = r.match
         abs_a = str(root_a / m.rel_path_a)
         abs_b = str(root_b / m.rel_path_b)
         ext = Path(m.rel_path_a).suffix.lower()
@@ -506,9 +541,9 @@ def run_pipeline(
         text_b = read_file(abs_b, encoding=encoding)
         if text_a is None or text_b is None:
             continue
-        if not compare_comment:
-            text_a = strip_comments(text_a, ext, squash_blanks=squash_blanks)
-            text_b = strip_comments(text_b, ext, squash_blanks=squash_blanks)
+        if strip_comments:
+            text_a = _strip_comments_fn(text_a, ext, squash_blanks=squash_blanks)
+            text_b = _strip_comments_fn(text_b, ext, squash_blanks=squash_blanks)
         html_diff = generate_html_diff(
             text_a, text_b,
@@ -522,7 +557,7 @@ def run_pipeline(
             by_word=by_word,
             detect_moved=detect_moved,
         )
-        results[m_idx] = DiffResult(
+        results[r_idx] = DiffResult(
             match=r.match,
             ratio=r.ratio,
             additions=r.additions,
@@ -536,16 +571,16 @@ def run_pipeline(
     if sort_by:
         reverse = sort_order == "desc"
         if sort_by == "filename":
+            results.sort(
+                key=lambda r: PurePosixPath(r.match.rel_path_a).name.lower(),
+                reverse=reverse,
+            )
+        elif sort_by == "path":
             results.sort(key=lambda r: r.match.rel_path_a.lower(), reverse=reverse)
+        elif sort_by == "similarity":
+            results.sort(key=lambda r: r.match.similarity, reverse=reverse)
         elif sort_by == "ratio":
             results.sort(key=lambda r: r.ratio, reverse=reverse)
-        elif sort_by == "size":
-            def _file_size(r: DiffResult) -> int:
-                try:
-                    return os.path.getsize(str(root_a / r.match.rel_path_a))
-                except OSError:
-                    return 0
-            results.sort(key=_file_size, reverse=reverse)
         logger.info("  Sorted by %s (%s)", sort_by, sort_order)
     # Deep Compare (only in deep mode)
@@ -569,9 +604,10 @@ def run_pipeline(
         logger.info("Generating JSON report …")
         _generate_json_report(
             results, unmatched_a, unmatched_b,
-            dir_a, dir_b, by_word, compare_comment,
+            dir_a, dir_b, by_word, strip_comments,
             deep_results, report_json,
             metadata=metadata,
+            include_uncompared=include_uncompared,
         )
     # Markdown report
@@ -579,9 +615,10 @@ def run_pipeline(
         logger.info("Generating Markdown report …")
         _generate_markdown_report(
             results, unmatched_a, unmatched_b,
-            dir_a, dir_b, by_word, compare_comment,
+            dir_a, dir_b, by_word, strip_comments,
             deep_results, report_md,
             metadata=metadata,
+            include_uncompared=include_uncompared,
         )
     # HTML report
@@ -589,10 +626,11 @@ def run_pipeline(
         logger.info("Generating HTML report …")
         _generate_html_report(
             results, unmatched_a, unmatched_b,
-            dir_a, dir_b, by_word, compare_comment,
+            dir_a, dir_b, by_word, strip_comments,
             deep_results, report_html, ln_col_width,
             metadata=metadata,
             hash_table_html=hash_table_html,
+            include_uncompared=include_uncompared,
         )
     # PDF report
@@ -600,7 +638,7 @@ def run_pipeline(
         logger.info("Generating PDF report (divide-and-conquer) …")
         _generate_pdf_report(
             results, unmatched_a, unmatched_b,
-            dir_a, dir_b, by_word, compare_comment,
+            dir_a, dir_b, by_word, strip_comments,
             deep_results, report_pdf,
             no_merge=no_merge,
             show_page_number=show_page_number,
@@ -614,6 +652,7 @@ def run_pipeline(
             bates_prefix=bates_prefix,
             bates_suffix=bates_suffix,
             bates_start=bates_start,
+            include_uncompared=include_uncompared,
         )
     logger.info("Done (reports) ✓")
@@ -658,7 +697,7 @@ def _generate_pdf_report(
     dir_a: str,
     dir_b: str,
     by_word: bool,
-    compare_comment: bool,
+    strip_comments: bool,
     deep_results: list[DeepMatchResult] | None,
     output_pdf: str,
     *,
@@ -674,6 +713,7 @@ def _generate_pdf_report(
     bates_prefix: str = "",
     bates_suffix: str = "",
     bates_start: int = 1,
+    include_uncompared: bool = True,
 ) -> None:
     """Generate PDF report with divide-and-conquer merging."""
     if no_merge:
@@ -686,10 +726,11 @@ def _generate_pdf_report(
         # (1) Cover page
         cover_body = build_cover_body(
             results, unmatched_a, unmatched_b,
-            dir_a, dir_b, by_word, compare_comment,
+            dir_a, dir_b, by_word, strip_comments,
             deep_results=deep_results,
             metadata=metadata,
             hash_table_html=hash_table_html,
+            include_uncompared=include_uncompared,
         )
         cover_html = _html_wrap("Diffinite — Cover", cover_body)
         if no_merge:
@@ -700,9 +741,43 @@ def _generate_pdf_report(
         if cover_ok:
             logger.info("  Cover page → OK")
+        # ── Pre-flight: warn about large diffs that may slow PDF ────
+        # 500 KB of HTML ≈ 500+ source lines in side-by-side diff table.
+        # xhtml2pdf layout becomes noticeably slow above this threshold.
+        _LARGE_DIFF_BYTES = 500_000
+        large_files = [
+            (i, r) for i, r in enumerate(results, 1)
+            if not r.error and not r.binary and len(r.html_diff) > _LARGE_DIFF_BYTES
+        ]
+        if large_files:
+            logger.warning(
+                "⚠ %d file(s) have large diffs — PDF rendering may be "
+                "slow or hang:", len(large_files),
+            )
+            for i, r in large_files:
+                size_kb = len(r.html_diff) / 1024
+                logger.warning(
+                    "  %d. %s (%.0f KB HTML)",
+                    i, r.match.rel_path_a, size_kb,
+                )
+            logger.warning(
+                "  Consider: --collapse-identical (shrink diffs), "
+                "--no-merge (split PDFs), or --report-html (fast export)."
+            )
         # (2) Per-file diff pages
         diff_pdf_pairs: list[tuple[str, DiffResult]] = []
         for idx, r in enumerate(results, 1):
+            # Per-file warning for large diffs
+            if (not r.error and not r.binary
+                    and len(r.html_diff) > _LARGE_DIFF_BYTES):
+                size_kb = len(r.html_diff) / 1024
+                logger.warning(
+                    "⚠ Rendering PDF %d/%d (%s, %.0f KB) — "
+                    "this may take a while…",
+                    idx, len(results), r.match.rel_path_a, size_kb,
+                )
             diff_html = build_diff_page_html(
                 r, idx, unit,
                 show_page_number=show_page_number,

{diffinite-0.9.6 → diffinite-0.10.0/src/diffinite.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffinite
-Version: 0.9.6
+Version: 0.10.0
 Summary: Forensic source-code comparison tool — Winnowing fingerprints and professional PDF reports for IP litigation & code audit
 Author: nash-dir
 License: Apache-2.0

{diffinite-0.9.6 → diffinite-0.10.0}/tests/test_cli.py RENAMED Viewed

@@ -69,7 +69,7 @@ class TestDeepCompareArgs:
             "--mode", "deep",
             "--k-gram", "5",
             "--window", "3",
-            "--threshold-deep", "0.20",
+            "--threshold-deep", "20",
         ])
@@ -84,7 +84,7 @@ class TestAnnotationsAndReportFlags:
             "-o", str(tmp_path / "out.pdf"),
             "--collapse-identical",
             "--page-number", "--file-number",
-            "--bates-number", "--show-filename",
+            "--bates-number", "--filename",
         ])
     def test_threshold_accepts_value(self, tmp_path):
@@ -107,3 +107,41 @@ class TestAnnotationsAndReportFlags:
         ])
         from pathlib import Path
         assert Path(json_path).exists()
+class TestIncludeUncomparedFlag:
+    """Verify --include-uncompared / --no-include-uncompared flags."""
+    def test_include_uncompared_default_true(self, tmp_path):
+        """Default behavior includes uncompared files."""
+        d_a = tmp_path / "a"; d_a.mkdir()
+        d_b = tmp_path / "b"; d_b.mkdir()
+        (d_a / "only_a.py").write_text("x = 1\n", encoding="utf-8")
+        json_path = str(tmp_path / "out.json")
+        main([
+            str(d_a), str(d_b),
+            "--report-json", json_path,
+        ])
+        import json
+        from pathlib import Path
+        data = json.loads(Path(json_path).read_text(encoding="utf-8"))
+        assert "only_a.py" in data["unmatched_a"]
+    def test_no_include_uncompared_excludes(self, tmp_path):
+        """--no-include-uncompared excludes unmatched file lists."""
+        d_a = tmp_path / "a"; d_a.mkdir()
+        d_b = tmp_path / "b"; d_b.mkdir()
+        (d_a / "only_a.py").write_text("x = 1\n", encoding="utf-8")
+        json_path = str(tmp_path / "out.json")
+        main([
+            str(d_a), str(d_b),
+            "--report-json", json_path,
+            "--no-include-uncompared",
+        ])
+        import json
+        from pathlib import Path
+        data = json.loads(Path(json_path).read_text(encoding="utf-8"))
+        assert data["unmatched_a"] == []
+        assert data["unmatched_b"] == []
+        # Summary counts should still show the real values
+        assert data["summary"]["unmatched_a_count"] == 1

{diffinite-0.9.6 → diffinite-0.10.0}/tests/test_pdf_gen.py RENAMED Viewed

@@ -3,7 +3,7 @@
 import pytest
 from diffinite.models import DiffResult, FileMatch, DeepMatchResult
-from diffinite.pdf_gen import build_cover_body, build_diff_page_html
+from diffinite.pdf_gen import build_cover_body, build_diff_page_html, _break_path
 # ---------------------------------------------------------------------------
@@ -37,7 +37,7 @@ def _cover(results=None, *, deep_results=None):
         dir_a="dir_a",
         dir_b="dir_b",
         by_word=False,
-        compare_comment=True,
+        strip_comments=False,
         deep_results=deep_results,
     )
@@ -55,8 +55,9 @@ class TestBuildCoverHtml:
     def test_contains_file_names(self):
         html = _cover()
-        assert "handler.java" in html
-        assert "looper.java" in html
+        # _break_path inserts &#8203; after path separators (., /, \, _)
+        assert "handler." in html
+        assert "looper." in html
     def test_contains_ratio(self):
         html = _cover()
@@ -79,10 +80,10 @@ class TestBuildCoverHtml:
             dir_a="left",
             dir_b="right",
             by_word=False,
-            compare_comment=True,
+            strip_comments=False,
         )
-        assert "orphan_a.py" in html
-        assert "orphan_b.py" in html
+        # _break_path inserts &#8203; after separators, so check partial strings
+        assert "orphan" in html
     def test_deep_results_without_channels(self):
         deep = [
@@ -92,8 +93,8 @@ class TestBuildCoverHtml:
             ),
         ]
         html = _cover(deep_results=deep)
-        assert "foo.py" in html
-        assert "bar.py" in html
+        assert "foo." in html
+        assert "bar." in html
     def test_deep_results_display(self):
         deep = [
@@ -104,8 +105,8 @@ class TestBuildCoverHtml:
             ),
         ]
         html = _cover(deep_results=deep)
-        assert "foo.py" in html
-        assert "bar.py" in html
+        assert "foo." in html
+        assert "bar." in html
         assert "50" in html  # shared hashes
@@ -155,3 +156,67 @@ class TestBuildDiffPageHtml:
             show_filename=True,
         )
         assert "annotated.py" in html
+# ---------------------------------------------------------------------------
+# _break_path tests
+# ---------------------------------------------------------------------------
+class TestBreakPath:
+    """Verify _break_path inserts zero-width spaces at path separators."""
+    def test_slash(self):
+        result = _break_path("src/main/java")
+        assert "src/&#8203;main/&#8203;java" == result
+    def test_backslash(self):
+        result = _break_path("src\\main\\java")
+        assert "src\\&#8203;main\\&#8203;java" == result
+    def test_dot(self):
+        result = _break_path("handler.java")
+        assert "handler.&#8203;java" == result
+    def test_underscore(self):
+        result = _break_path("my_file_name")
+        assert "my_&#8203;file_&#8203;name" == result
+    def test_combined(self):
+        result = _break_path("src/com/example/my_handler.java")
+        assert "&#8203;" in result
+    def test_empty(self):
+        assert _break_path("") == ""
+# ---------------------------------------------------------------------------
+# include_uncompared tests
+# ---------------------------------------------------------------------------
+class TestIncludeUncompared:
+    """Verify include_uncompared parameter on build_cover_body."""
+    def test_excludes_unmatched_when_false(self):
+        html = build_cover_body(
+            _make_results(),
+            unmatched_a=["orphan_a.py"],
+            unmatched_b=["orphan_b.py"],
+            dir_a="left",
+            dir_b="right",
+            by_word=False,
+            strip_comments=False,
+            include_uncompared=False,
+        )
+        assert "orphan" not in html
+        assert "Unmatched Files" not in html
+    def test_includes_unmatched_by_default(self):
+        html = build_cover_body(
+            _make_results(),
+            unmatched_a=["orphan_a.py"],
+            unmatched_b=["orphan_b.py"],
+            dir_a="left",
+            dir_b="right",
+            by_word=False,
+            strip_comments=False,
+        )
+        assert "orphan" in html
+        assert "Unmatched Files" in html

{diffinite-0.9.6 → diffinite-0.10.0}/tests/test_pipeline.py RENAMED Viewed

@@ -23,9 +23,9 @@ class TestPipelineE2E:
         run_pipeline(
             dir_a=EXAMPLE_LEFT,
             dir_b=EXAMPLE_RIGHT,
-            output_pdf=output,
+            report_pdf=output,
             by_word=False,
-            compare_comment=False,
+            strip_comments=True,
         )
         assert Path(output).exists()
         assert Path(output).stat().st_size > 0
@@ -37,7 +37,7 @@ class TestPipelineE2E:
         run_pipeline(
             dir_a=EXAMPLE_LEFT,
             dir_b=EXAMPLE_RIGHT,
-            output_pdf=output,
+            report_pdf=output,
             exec_mode="deep",
             workers=2,
             kgram_size=5,
@@ -53,7 +53,7 @@ class TestPipelineE2E:
         run_pipeline(
             dir_a=EXAMPLE_LEFT,
             dir_b=EXAMPLE_RIGHT,
-            output_pdf=output,
+            report_pdf=output,
             no_merge=True,
         )
         files_dir = tmp_path / "individual_files"