PyPI - sigdetect - Versions diffs - 0.5.1__tar.gz → 0.5.2__tar.gz - Mend

sigdetect 0.5.1.tar.gz → 0.5.2.tar.gz

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

{sigdetect-0.5.1 → sigdetect-0.5.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sigdetect
-Version: 0.5.1
+Version: 0.5.2
 Summary: Signature detection and role attribution for PDFs
 Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
 License: MIT

{sigdetect-0.5.1 → sigdetect-0.5.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sigdetect"
-version = "0.5.1"
+version = "0.5.2"
 description = "Signature detection and role attribution for PDFs"
 readme = "README.md"
 authors = [{ name = "BT Asmamaw", email = "basmamaw@angeiongroup.com" }]

{sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/cropping.py RENAMED Viewed

@@ -9,6 +9,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Literal, overload
+from PIL import Image
 from .detector.file_result_model import FileResult
 from .detector.signature_model import Signature
@@ -53,6 +55,7 @@ def crop_signatures(
     return_bytes: Literal[False] = False,
     save_files: bool = True,
     docx: bool = False,
+    trim: bool = True,
 ) -> list[Path]: ...
@@ -67,6 +70,7 @@ def crop_signatures(
     return_bytes: Literal[True],
     save_files: bool = True,
     docx: bool = False,
+    trim: bool = True,
 ) -> list[SignatureCrop]: ...
@@ -80,6 +84,7 @@ def crop_signatures(
     return_bytes: bool = False,
     save_files: bool = True,
     docx: bool = False,
+    trim: bool = True,
 ) -> list[Path] | list[SignatureCrop]:
     """Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
@@ -87,6 +92,7 @@ def crop_signatures(
     the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
     When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
     and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
+    When ``trim`` is enabled, the crop is tightened around the detected ink where possible.
     """
     if fitz is None:  # pragma: no cover - exercised when dependency absent
@@ -145,10 +151,12 @@ def crop_signatures(
             try:
                 image_bytes: bytes | None = None
                 pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
+                raw_bytes = pixmap.tobytes("png")
+                final_bytes = _trim_signature_image_bytes(raw_bytes) if trim else raw_bytes
                 if save_files and not docx_enabled:
-                    pixmap.save(png_destination)
+                    png_destination.write_bytes(final_bytes)
                 if return_bytes or docx_enabled:
-                    image_bytes = pixmap.tobytes("png")
+                    image_bytes = final_bytes
             except Exception as exc:  # pragma: no cover - defensive
                 if logger:
                     logger.warning(
@@ -221,6 +229,233 @@ def _build_docx_bytes(image_bytes: bytes) -> bytes:
     return buffer.getvalue()
+def _trim_signature_image_bytes(
+    image_bytes: bytes,
+    *,
+    pad_px: int = 4,
+    gap_px: int = 4,
+    min_density_ratio: float = 0.004,
+) -> bytes:
+    image = Image.open(io.BytesIO(image_bytes))
+    gray = image.convert("L")
+    width, height = gray.size
+    histogram = gray.histogram()
+    total_pixels = width * height
+    cutoff = int(total_pixels * 0.995)
+    cumulative = 0
+    white_level = 255
+    for idx, count in enumerate(histogram):
+        cumulative += count
+        if cumulative >= cutoff:
+            white_level = idx
+            break
+    if white_level < 200:
+        return image_bytes
+    thresholds = [min(254, max(200, white_level - delta)) for delta in (6, 4, 2, 1, 0)]
+    min_density = max(2, int(width * min_density_ratio))
+    pixels = gray.load()
+    row_densities: dict[int, list[int]] = {}
+    for threshold in thresholds:
+        row_density = []
+        for y in range(height):
+            dark = sum(1 for x in range(width) if pixels[x, y] < threshold)
+            row_density.append(dark)
+        row_densities[threshold] = row_density
+    line_bounds = _detect_horizontal_rule_cutoff(row_densities[thresholds[-1]], width)
+    scan_limit = None
+    descender_limit = height - 1
+    if line_bounds is not None:
+        line_start, line_end = line_bounds
+        scan_limit = max(0, line_start - 1)
+        descender_limit = min(height - 1, line_end + max(2, int(height * 0.02)))
+    min_band_height = max(4, int(height * 0.02))
+    best = None
+    best_small = None
+    best_small_threshold = None
+    best_threshold = None
+    line_threshold = int(width * 0.6)
+    for threshold in thresholds:
+        row_density = row_densities[threshold]
+        segments: list[tuple[int, int]] = []
+        start: int | None = None
+        for y, dark in enumerate(row_density):
+            if scan_limit is not None and y > scan_limit:
+                if start is not None:
+                    segments.append((start, y - 1))
+                    start = None
+                break
+            if dark >= min_density:
+                if start is None:
+                    start = y
+            else:
+                if start is not None:
+                    segments.append((start, y - 1))
+                    start = None
+        if start is not None:
+            segments.append((start, height - 1))
+        if not segments:
+            continue
+        merged: list[list[int]] = []
+        for seg in segments:
+            if not merged:
+                merged.append([seg[0], seg[1]])
+                continue
+            if seg[0] - merged[-1][1] <= gap_px:
+                merged[-1][1] = seg[1]
+            else:
+                merged.append([seg[0], seg[1]])
+        candidates = []
+        for y0, y1 in merged:
+            min_x, max_x = width, -1
+            total_dark = 0
+            for y in range(y0, y1 + 1):
+                for x in range(width):
+                    if pixels[x, y] < threshold:
+                        total_dark += 1
+                        if x < min_x:
+                            min_x = x
+                        if x > max_x:
+                            max_x = x
+            if max_x < 0:
+                continue
+            band_height = y1 - y0 + 1
+            band_width = max_x - min_x + 1
+            score = total_dark * (band_height**1.3)
+            if line_bounds is not None:
+                distance = max(0, line_bounds[0] - y1)
+                proximity = 1.0 / (1.0 + (distance / 20.0))
+                score *= 1.0 + 0.5 * proximity
+            candidates.append(
+                {
+                    "y0": y0,
+                    "y1": y1,
+                    "min_x": min_x,
+                    "max_x": max_x,
+                    "total": total_dark,
+                    "height": band_height,
+                    "width": band_width,
+                    "score": score,
+                }
+            )
+        if not candidates:
+            continue
+        candidates.sort(key=lambda item: item["score"], reverse=True)
+        top_candidate = candidates[0]
+        if top_candidate["height"] >= min_band_height:
+            if best is None or top_candidate["score"] > best["score"]:
+                best = top_candidate
+                best_threshold = threshold
+        else:
+            if best_small is None or top_candidate["score"] > best_small["score"]:
+                best_small = top_candidate
+                best_small_threshold = threshold
+    if best is None:
+        best = best_small
+        best_threshold = best_small_threshold
+    if best is None:
+        return image_bytes
+    expansion_density = row_densities.get(best_threshold, row_densities[thresholds[-1]])
+    expand_threshold = max(1, int(min_density * 0.4))
+    y0 = best["y0"]
+    y1 = best["y1"]
+    while y0 > 0 and expansion_density[y0 - 1] >= expand_threshold:
+        y0 -= 1
+    while y1 < descender_limit and expansion_density[y1 + 1] >= expand_threshold:
+        y1 += 1
+    min_x, max_x = width, -1
+    for y in range(y0, y1 + 1):
+        if expansion_density[y] >= line_threshold:
+            continue
+        for x in range(width):
+            if pixels[x, y] < thresholds[-1]:
+                if x < min_x:
+                    min_x = x
+                if x > max_x:
+                    max_x = x
+    if max_x >= 0:
+        best = {
+            "y0": y0,
+            "y1": y1,
+            "min_x": min_x,
+            "max_x": max_x,
+        }
+    x0 = max(0, best["min_x"] - pad_px)
+    x1 = min(width - 1, best["max_x"] + pad_px)
+    y0 = max(0, best["y0"] - pad_px)
+    y1 = min(height - 1, best["y1"] + pad_px)
+    if x1 <= x0 or y1 <= y0:
+        return image_bytes
+    if (x1 - x0) < max(10, int(width * 0.2)) or (y1 - y0) < max(6, int(height * 0.08)):
+        return image_bytes
+    cropped = image.crop((x0, y0, x1 + 1, y1 + 1))
+    buffer = io.BytesIO()
+    cropped.save(buffer, format="PNG")
+    return buffer.getvalue()
+def _detect_horizontal_rule_cutoff(
+    row_density: list[int],
+    width: int,
+) -> tuple[int, int] | None:
+    if not row_density:
+        return None
+    line_threshold = int(width * 0.6)
+    max_thickness = 4
+    segments: list[tuple[int, int]] = []
+    start = None
+    for y, density in enumerate(row_density):
+        if density >= line_threshold:
+            if start is None:
+                start = y
+        else:
+            if start is not None:
+                segments.append((start, y - 1))
+                start = None
+    if start is not None:
+        segments.append((start, len(row_density) - 1))
+    if not segments:
+        return None
+    total_dark = sum(row_density)
+    if total_dark <= 0:
+        return None
+    for y0, y1 in segments:
+        thickness = y1 - y0 + 1
+        if thickness > max_thickness:
+            continue
+        above_dark = sum(row_density[:y0])
+        below_dark = sum(row_density[y1 + 1 :])
+        if above_dark < 40:
+            continue
+        midpoint_ratio = ((y0 + y1) / 2.0) / max(1, len(row_density))
+        if midpoint_ratio >= 0.35:
+            return (y0, y1)
+        if above_dark >= max(40, int(below_dark * 0.3)):
+            return (y0, y1)
+    return None
 def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
     width = float(page.rect.width)
     height = float(page.rect.height)

{sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect/wet_detection.py RENAMED Viewed

@@ -94,6 +94,17 @@ def apply_wet_detection(
     original_mixed = file_result.MixedContent
     try:
         added = _detect(pdf_path, configuration, file_result, logger=logger)
+        if added and configuration.Profile == "hipaa":
+            updated = False
+            for signature in file_result.Signatures:
+                if signature.RenderType == "wet" and (signature.Role or "unknown") == "unknown":
+                    signature.Role = "patient"
+                    signature.Scores = {"patient": int(signature.Score or 0)}
+                    signature.Evidence = list(signature.Evidence or [])
+                    signature.Evidence.append("role_default:patient")
+                    updated = True
+            if updated:
+                _refresh_metadata(file_result)
         if not added:
             _mark_manual_review(file_result, "NoHighConfidenceWetSignature")
         return added
@@ -136,6 +147,18 @@ def _detect(
                     scale=configuration.WetOcrDpi / 72.0,
                 )
             )
+            if not candidates:
+                candidates = list(
+                    _build_candidates(
+                        ocr_lines,
+                        image=image,
+                        page_rect=page.rect,
+                        pix_width=pixmap.width,
+                        pix_height=pixmap.height,
+                        scale=configuration.WetOcrDpi / 72.0,
+                        min_y_ratio=0.2,
+                    )
+                )
             candidates.extend(_image_candidates(page))
             candidates = _filter_candidates_for_page(candidates)
             accepted = [
@@ -247,6 +270,7 @@ def _build_candidates(
     pix_width: int,
     pix_height: int,
     scale: float,
+    min_y_ratio: float = 0.4,
 ) -> Iterable[WetCandidate]:
     for line in lines:
         normalized = line.text.lower()
@@ -255,7 +279,7 @@ def _build_candidates(
         if len(normalized) > 80:
             # Ignore long paragraph-like OCR lines
             continue
-        if (line.bottom / pix_height) < 0.4:
+        if (line.bottom / pix_height) < min_y_ratio:
             # Ignore lines in the upper section of the page
             continue
         role = _infer_role(normalized)
@@ -338,28 +362,33 @@ def _expand_bbox(
 ) -> tuple[float, float, float, float]:
     x0 = line.left / scale
     x1 = line.right / scale
-    y1 = (pix_height - line.top) / scale
+    y_top = (pix_height - line.top) / scale
+    y_bottom = (pix_height - line.bottom) / scale
     pad_x = max(14.0, (x1 - x0) * 0.25)
     left = max(page_rect.x0, x0 - pad_x)
     right = min(page_rect.x1, x1 + pad_x)
     gap = 14.0
-    signature_height = 70.0
-    top = min(page_rect.y1, y1 + gap)
-    bottom = min(page_rect.y1, top + signature_height)
-    if bottom <= top:
-        bottom = min(page_rect.y1, top + signature_height)
+    line_height = max(1.0, (line.bottom - line.top) / scale)
+    signature_height = max(70.0, line_height * 6.0)
+    upper = min(page_rect.y1, y_bottom - gap)
+    upper = max(page_rect.y0, upper)
+    lower = max(page_rect.y0, upper - signature_height)
     if stroke_y is not None:
-        # Anchor to the detected stroke under the OCR label when available.
+        # Anchor to the detected stroke (signature line) beneath the label.
         sy = (pix_height - stroke_y) / scale
-        if sy < top:
-            top = sy
-        bottom = max(bottom, sy + signature_height)
+        field_lower = min(page_rect.y1, max(page_rect.y0, sy + 2.0))
+        field_upper = min(page_rect.y1, y_bottom - gap)
+        if field_upper > field_lower + 6.0:
+            lower = field_lower
+            upper = field_upper
+        else:
+            upper = min(page_rect.y1, field_lower + signature_height)
+            lower = max(page_rect.y0, upper - signature_height)
-    return (float(left), float(top), float(right), float(bottom))
+    return (float(left), float(lower), float(right), float(upper))
 def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
@@ -513,14 +542,19 @@ def _signature_rank(signature: Signature) -> tuple[int, int, int]:
 def _dedupe_wet_signatures(signatures: Sequence[Signature]) -> list[Signature]:
     best_by_role: dict[str, Signature] = {}
+    best_unknown: Signature | None = None
     for signature in signatures:
         role = (signature.Role or "unknown").strip().lower()
         if role == "unknown":
+            if best_unknown is None or _signature_rank(signature) > _signature_rank(best_unknown):
+                best_unknown = signature
             continue
         existing = best_by_role.get(role)
         if existing is None or _signature_rank(signature) > _signature_rank(existing):
             best_by_role[role] = signature
-    return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
+    if best_by_role:
+        return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
+    return [best_unknown] if best_unknown is not None else []
 def _mark_manual_review(file_result: FileResult, reason: str) -> None:

{sigdetect-0.5.1 → sigdetect-0.5.2}/src/sigdetect.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sigdetect
-Version: 0.5.1
+Version: 0.5.2
 Summary: Signature detection and role attribution for PDFs
 Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
 License: MIT

{sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_cropping.py RENAMED Viewed

@@ -1,12 +1,14 @@
+import io
 from pathlib import Path
 import pytest
 from pypdf import PdfWriter
+from PIL import Image, ImageDraw
 from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObject, TextStringObject
 from sigdetect.api import CropSignatureImages, DetectPdf
 from sigdetect.config import DetectConfiguration
-from sigdetect.cropping import SignatureCrop, crop_signatures
+from sigdetect.cropping import SignatureCrop, _trim_signature_image_bytes, crop_signatures
 from sigdetect.detector.pypdf2_engine import PyPDF2Detector
 pytest.importorskip("fitz")
@@ -43,6 +45,53 @@ def _pdf_with_signature(path: Path) -> None:
         writer.write(handle)
+def _build_test_crop_bytes() -> bytes:
+    image = Image.new("RGB", (200, 100), "white")
+    draw = ImageDraw.Draw(image)
+    draw.rectangle([20, 10, 80, 20], fill="black")
+    draw.rectangle([10, 60, 190, 80], fill="black")
+    buffer = io.BytesIO()
+    image.save(buffer, format="PNG")
+    return buffer.getvalue()
+def test_trim_signature_image_bytes_prefers_lower_band() -> None:
+    original = _build_test_crop_bytes()
+    trimmed = _trim_signature_image_bytes(original, pad_px=2)
+    original_image = Image.open(io.BytesIO(original))
+    trimmed_image = Image.open(io.BytesIO(trimmed))
+    assert trimmed_image.height < original_image.height
+    gray = trimmed_image.convert("L")
+    pixels = gray.load()
+    width, height = gray.size
+    top_dark = sum(1 for x in range(width) for y in range(min(8, height)) if pixels[x, y] < 240)
+    assert top_dark == 0
+def test_trim_signature_image_bytes_respects_horizontal_rule() -> None:
+    image = Image.new("RGB", (200, 120), "white")
+    draw = ImageDraw.Draw(image)
+    # Signature scribble above the line.
+    draw.line([20, 20, 180, 30], fill="black", width=3)
+    draw.line([25, 28, 140, 18], fill="black", width=2)
+    # Horizontal rule separating signature from print name.
+    draw.line([10, 50, 190, 50], fill="black", width=2)
+    # Text-ish block below the line.
+    draw.rectangle([20, 70, 120, 85], fill="black")
+    buffer = io.BytesIO()
+    image.save(buffer, format="PNG")
+    trimmed = _trim_signature_image_bytes(buffer.getvalue(), pad_px=2)
+    trimmed_image = Image.open(io.BytesIO(trimmed)).convert("L")
+    width, height = trimmed_image.size
+    # Ensure we trimmed off the lower text block (should be well above original height).
+    assert height < 90
 def test_crop_signatures(tmp_path: Path):
     pdf_path = tmp_path / "doc.pdf"
     _pdf_with_signature(pdf_path)

{sigdetect-0.5.1 → sigdetect-0.5.2}/tests/test_wet_detection.py RENAMED Viewed

@@ -1,15 +1,19 @@
 from pathlib import Path
+from PIL import Image
 from pypdf import PdfWriter
 from sigdetect.config import DetectConfiguration
 from sigdetect.detector.file_result_model import FileResult
 from sigdetect.detector.signature_model import Signature
 from sigdetect.wet_detection import (
+    OcrLine,
     WetCandidate,
     _dedupe_wet_signatures,
     _filter_candidates_for_page,
     _image_candidates,
+    _build_candidates,
     _refresh_metadata,
     apply_wet_detection,
     should_run_wet_pipeline,
@@ -130,6 +134,46 @@ def test_apply_wet_detection_preserves_esign_flags(monkeypatch, tmp_path: Path)
     assert file_result.MixedContent is False
+def test_apply_wet_detection_defaults_unknown_to_patient(monkeypatch, tmp_path: Path) -> None:
+    pdf_path = tmp_path / "doc.pdf"
+    _blank_pdf(pdf_path)
+    configuration = DetectConfiguration(
+        pdf_root=tmp_path,
+        out_dir=tmp_path,
+        engine="pypdf2",
+        profile="hipaa",
+    )
+    file_result = _empty_file_result("doc.pdf")
+    monkeypatch.setattr("sigdetect.wet_detection._ensure_dependencies", lambda: None)
+    def fake_detect(pdf_path, configuration, file_result, logger=None):
+        file_result.Signatures.append(
+            Signature(
+                Page=1,
+                FieldName="wet_signature_detected",
+                Role="unknown",
+                Score=88,
+                Scores={"unknown": 88},
+                Evidence=["wet:true"],
+                Hint="WetSignatureOCR",
+                RenderType="wet",
+                BoundingBox=(10.0, 10.0, 100.0, 40.0),
+            )
+        )
+        _refresh_metadata(file_result)
+        return True
+    monkeypatch.setattr("sigdetect.wet_detection._detect", fake_detect)
+    applied = apply_wet_detection(pdf_path, configuration, file_result)
+    assert applied is True
+    assert file_result.Signatures
+    assert file_result.Signatures[0].Role == "patient"
+    assert "role_default:patient" in (file_result.Signatures[0].Evidence or [])
 def test_image_candidate_detection_infers_role_from_nearby_text() -> None:
     class Rect:
         def __init__(self, x0, y0, x1, y1):
@@ -213,3 +257,57 @@ def test_dedupe_wet_signatures_keeps_best_per_role() -> None:
     assert filtered[0].Role == "patient"
     assert filtered[0].Page == 2
     assert "image_signature:true" in filtered[0].Evidence
+def test_dedupe_wet_signatures_keeps_unknown_when_only() -> None:
+    def make_signature(page: int, role: str, score: int, evidence: list[str]) -> Signature:
+        return Signature(
+            Page=page,
+            FieldName="wet_signature_detected",
+            Role=role,
+            Score=score,
+            Scores={role: score},
+            Evidence=evidence,
+            Hint="WetSignatureOCR",
+            RenderType="wet",
+            BoundingBox=(0.0, 0.0, 10.0, 10.0),
+        )
+    unknown = make_signature(1, "unknown", 90, ["ocr_line:signature", "stroke:no"])
+    filtered = _dedupe_wet_signatures([unknown])
+    assert len(filtered) == 1
+    assert filtered[0].Role == "unknown"
+def test_build_candidates_respects_min_y_ratio() -> None:
+    class DummyPageRect:
+        x0, y0, x1, y1 = 0.0, 0.0, 600.0, 800.0
+    image = Image.new("RGB", (100, 100), "white")
+    line = OcrLine(text="Signature", confidence=0.9, left=10, top=10, right=90, bottom=30)
+    candidates_default = list(
+        _build_candidates(
+            [line],
+            image=image,
+            page_rect=DummyPageRect(),
+            pix_width=100,
+            pix_height=100,
+            scale=1.0,
+        )
+    )
+    candidates_relaxed = list(
+        _build_candidates(
+            [line],
+            image=image,
+            page_rect=DummyPageRect(),
+            pix_width=100,
+            pix_height=100,
+            scale=1.0,
+            min_y_ratio=0.2,
+        )
+    )
+    assert not candidates_default
+    assert candidates_relaxed