PyPI - sigdetect - Versions diffs - 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

sigdetect 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

sigdetect/cropping.py CHANGED Viewed

@@ -9,6 +9,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Literal, overload
+from PIL import Image
 from .detector.file_result_model import FileResult
 from .detector.signature_model import Signature
@@ -53,6 +55,7 @@ def crop_signatures(
     return_bytes: Literal[False] = False,
     save_files: bool = True,
     docx: bool = False,
+    trim: bool = True,
 ) -> list[Path]: ...
@@ -67,6 +70,7 @@ def crop_signatures(
     return_bytes: Literal[True],
     save_files: bool = True,
     docx: bool = False,
+    trim: bool = True,
 ) -> list[SignatureCrop]: ...
@@ -80,6 +84,7 @@ def crop_signatures(
     return_bytes: bool = False,
     save_files: bool = True,
     docx: bool = False,
+    trim: bool = True,
 ) -> list[Path] | list[SignatureCrop]:
     """Render each signature bounding box to a PNG image and optionally wrap it in DOCX.
@@ -87,6 +92,7 @@ def crop_signatures(
     the files to ``output_dir``. Set ``save_files=False`` to skip writing PNGs to disk.
     When ``docx=True``, DOCX files are written instead of PNGs. When ``return_bytes`` is True
     and ``docx=True``, ``SignatureCrop.docx_bytes`` will contain the DOCX payload.
+    When ``trim`` is enabled, the crop is tightened around the detected ink where possible.
     """
     if fitz is None:  # pragma: no cover - exercised when dependency absent
@@ -145,10 +151,12 @@ def crop_signatures(
             try:
                 image_bytes: bytes | None = None
                 pixmap = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)
+                raw_bytes = pixmap.tobytes("png")
+                final_bytes = _trim_signature_image_bytes(raw_bytes) if trim else raw_bytes
                 if save_files and not docx_enabled:
-                    pixmap.save(png_destination)
+                    png_destination.write_bytes(final_bytes)
                 if return_bytes or docx_enabled:
-                    image_bytes = pixmap.tobytes("png")
+                    image_bytes = final_bytes
             except Exception as exc:  # pragma: no cover - defensive
                 if logger:
                     logger.warning(
@@ -221,6 +229,233 @@ def _build_docx_bytes(image_bytes: bytes) -> bytes:
     return buffer.getvalue()
+def _trim_signature_image_bytes(
+    image_bytes: bytes,
+    *,
+    pad_px: int = 4,
+    gap_px: int = 4,
+    min_density_ratio: float = 0.004,
+) -> bytes:
+    image = Image.open(io.BytesIO(image_bytes))
+    gray = image.convert("L")
+    width, height = gray.size
+    histogram = gray.histogram()
+    total_pixels = width * height
+    cutoff = int(total_pixels * 0.995)
+    cumulative = 0
+    white_level = 255
+    for idx, count in enumerate(histogram):
+        cumulative += count
+        if cumulative >= cutoff:
+            white_level = idx
+            break
+    if white_level < 200:
+        return image_bytes
+    thresholds = [min(254, max(200, white_level - delta)) for delta in (6, 4, 2, 1, 0)]
+    min_density = max(2, int(width * min_density_ratio))
+    pixels = gray.load()
+    row_densities: dict[int, list[int]] = {}
+    for threshold in thresholds:
+        row_density = []
+        for y in range(height):
+            dark = sum(1 for x in range(width) if pixels[x, y] < threshold)
+            row_density.append(dark)
+        row_densities[threshold] = row_density
+    line_bounds = _detect_horizontal_rule_cutoff(row_densities[thresholds[-1]], width)
+    scan_limit = None
+    descender_limit = height - 1
+    if line_bounds is not None:
+        line_start, line_end = line_bounds
+        scan_limit = max(0, line_start - 1)
+        descender_limit = min(height - 1, line_end + max(2, int(height * 0.02)))
+    min_band_height = max(4, int(height * 0.02))
+    best = None
+    best_small = None
+    best_small_threshold = None
+    best_threshold = None
+    line_threshold = int(width * 0.6)
+    for threshold in thresholds:
+        row_density = row_densities[threshold]
+        segments: list[tuple[int, int]] = []
+        start: int | None = None
+        for y, dark in enumerate(row_density):
+            if scan_limit is not None and y > scan_limit:
+                if start is not None:
+                    segments.append((start, y - 1))
+                    start = None
+                break
+            if dark >= min_density:
+                if start is None:
+                    start = y
+            else:
+                if start is not None:
+                    segments.append((start, y - 1))
+                    start = None
+        if start is not None:
+            segments.append((start, height - 1))
+        if not segments:
+            continue
+        merged: list[list[int]] = []
+        for seg in segments:
+            if not merged:
+                merged.append([seg[0], seg[1]])
+                continue
+            if seg[0] - merged[-1][1] <= gap_px:
+                merged[-1][1] = seg[1]
+            else:
+                merged.append([seg[0], seg[1]])
+        candidates = []
+        for y0, y1 in merged:
+            min_x, max_x = width, -1
+            total_dark = 0
+            for y in range(y0, y1 + 1):
+                for x in range(width):
+                    if pixels[x, y] < threshold:
+                        total_dark += 1
+                        if x < min_x:
+                            min_x = x
+                        if x > max_x:
+                            max_x = x
+            if max_x < 0:
+                continue
+            band_height = y1 - y0 + 1
+            band_width = max_x - min_x + 1
+            score = total_dark * (band_height**1.3)
+            if line_bounds is not None:
+                distance = max(0, line_bounds[0] - y1)
+                proximity = 1.0 / (1.0 + (distance / 20.0))
+                score *= 1.0 + 0.5 * proximity
+            candidates.append(
+                {
+                    "y0": y0,
+                    "y1": y1,
+                    "min_x": min_x,
+                    "max_x": max_x,
+                    "total": total_dark,
+                    "height": band_height,
+                    "width": band_width,
+                    "score": score,
+                }
+            )
+        if not candidates:
+            continue
+        candidates.sort(key=lambda item: item["score"], reverse=True)
+        top_candidate = candidates[0]
+        if top_candidate["height"] >= min_band_height:
+            if best is None or top_candidate["score"] > best["score"]:
+                best = top_candidate
+                best_threshold = threshold
+        else:
+            if best_small is None or top_candidate["score"] > best_small["score"]:
+                best_small = top_candidate
+                best_small_threshold = threshold
+    if best is None:
+        best = best_small
+        best_threshold = best_small_threshold
+    if best is None:
+        return image_bytes
+    expansion_density = row_densities.get(best_threshold, row_densities[thresholds[-1]])
+    expand_threshold = max(1, int(min_density * 0.4))
+    y0 = best["y0"]
+    y1 = best["y1"]
+    while y0 > 0 and expansion_density[y0 - 1] >= expand_threshold:
+        y0 -= 1
+    while y1 < descender_limit and expansion_density[y1 + 1] >= expand_threshold:
+        y1 += 1
+    min_x, max_x = width, -1
+    for y in range(y0, y1 + 1):
+        if expansion_density[y] >= line_threshold:
+            continue
+        for x in range(width):
+            if pixels[x, y] < thresholds[-1]:
+                if x < min_x:
+                    min_x = x
+                if x > max_x:
+                    max_x = x
+    if max_x >= 0:
+        best = {
+            "y0": y0,
+            "y1": y1,
+            "min_x": min_x,
+            "max_x": max_x,
+        }
+    x0 = max(0, best["min_x"] - pad_px)
+    x1 = min(width - 1, best["max_x"] + pad_px)
+    y0 = max(0, best["y0"] - pad_px)
+    y1 = min(height - 1, best["y1"] + pad_px)
+    if x1 <= x0 or y1 <= y0:
+        return image_bytes
+    if (x1 - x0) < max(10, int(width * 0.2)) or (y1 - y0) < max(6, int(height * 0.08)):
+        return image_bytes
+    cropped = image.crop((x0, y0, x1 + 1, y1 + 1))
+    buffer = io.BytesIO()
+    cropped.save(buffer, format="PNG")
+    return buffer.getvalue()
+def _detect_horizontal_rule_cutoff(
+    row_density: list[int],
+    width: int,
+) -> tuple[int, int] | None:
+    if not row_density:
+        return None
+    line_threshold = int(width * 0.6)
+    max_thickness = 4
+    segments: list[tuple[int, int]] = []
+    start = None
+    for y, density in enumerate(row_density):
+        if density >= line_threshold:
+            if start is None:
+                start = y
+        else:
+            if start is not None:
+                segments.append((start, y - 1))
+                start = None
+    if start is not None:
+        segments.append((start, len(row_density) - 1))
+    if not segments:
+        return None
+    total_dark = sum(row_density)
+    if total_dark <= 0:
+        return None
+    for y0, y1 in segments:
+        thickness = y1 - y0 + 1
+        if thickness > max_thickness:
+            continue
+        above_dark = sum(row_density[:y0])
+        below_dark = sum(row_density[y1 + 1 :])
+        if above_dark < 40:
+            continue
+        midpoint_ratio = ((y0 + y1) / 2.0) / max(1, len(row_density))
+        if midpoint_ratio >= 0.35:
+            return (y0, y1)
+        if above_dark >= max(40, int(below_dark * 0.3)):
+            return (y0, y1)
+    return None
 def _to_clip_rect(page, bbox: tuple[float, float, float, float]):
     width = float(page.rect.width)
     height = float(page.rect.height)

sigdetect/wet_detection.py CHANGED Viewed

@@ -94,6 +94,17 @@ def apply_wet_detection(
     original_mixed = file_result.MixedContent
     try:
         added = _detect(pdf_path, configuration, file_result, logger=logger)
+        if added and configuration.Profile == "hipaa":
+            updated = False
+            for signature in file_result.Signatures:
+                if signature.RenderType == "wet" and (signature.Role or "unknown") == "unknown":
+                    signature.Role = "patient"
+                    signature.Scores = {"patient": int(signature.Score or 0)}
+                    signature.Evidence = list(signature.Evidence or [])
+                    signature.Evidence.append("role_default:patient")
+                    updated = True
+            if updated:
+                _refresh_metadata(file_result)
         if not added:
             _mark_manual_review(file_result, "NoHighConfidenceWetSignature")
         return added
@@ -136,6 +147,18 @@ def _detect(
                     scale=configuration.WetOcrDpi / 72.0,
                 )
             )
+            if not candidates:
+                candidates = list(
+                    _build_candidates(
+                        ocr_lines,
+                        image=image,
+                        page_rect=page.rect,
+                        pix_width=pixmap.width,
+                        pix_height=pixmap.height,
+                        scale=configuration.WetOcrDpi / 72.0,
+                        min_y_ratio=0.2,
+                    )
+                )
             candidates.extend(_image_candidates(page))
             candidates = _filter_candidates_for_page(candidates)
             accepted = [
@@ -247,6 +270,7 @@ def _build_candidates(
     pix_width: int,
     pix_height: int,
     scale: float,
+    min_y_ratio: float = 0.4,
 ) -> Iterable[WetCandidate]:
     for line in lines:
         normalized = line.text.lower()
@@ -255,7 +279,7 @@ def _build_candidates(
         if len(normalized) > 80:
             # Ignore long paragraph-like OCR lines
             continue
-        if (line.bottom / pix_height) < 0.4:
+        if (line.bottom / pix_height) < min_y_ratio:
             # Ignore lines in the upper section of the page
             continue
         role = _infer_role(normalized)
@@ -338,28 +362,33 @@ def _expand_bbox(
 ) -> tuple[float, float, float, float]:
     x0 = line.left / scale
     x1 = line.right / scale
-    y1 = (pix_height - line.top) / scale
+    y_top = (pix_height - line.top) / scale
+    y_bottom = (pix_height - line.bottom) / scale
     pad_x = max(14.0, (x1 - x0) * 0.25)
     left = max(page_rect.x0, x0 - pad_x)
     right = min(page_rect.x1, x1 + pad_x)
     gap = 14.0
-    signature_height = 70.0
-    top = min(page_rect.y1, y1 + gap)
-    bottom = min(page_rect.y1, top + signature_height)
-    if bottom <= top:
-        bottom = min(page_rect.y1, top + signature_height)
+    line_height = max(1.0, (line.bottom - line.top) / scale)
+    signature_height = max(70.0, line_height * 6.0)
+    upper = min(page_rect.y1, y_bottom - gap)
+    upper = max(page_rect.y0, upper)
+    lower = max(page_rect.y0, upper - signature_height)
     if stroke_y is not None:
-        # Anchor to the detected stroke under the OCR label when available.
+        # Anchor to the detected stroke (signature line) beneath the label.
         sy = (pix_height - stroke_y) / scale
-        if sy < top:
-            top = sy
-        bottom = max(bottom, sy + signature_height)
+        field_lower = min(page_rect.y1, max(page_rect.y0, sy + 2.0))
+        field_upper = min(page_rect.y1, y_bottom - gap)
+        if field_upper > field_lower + 6.0:
+            lower = field_lower
+            upper = field_upper
+        else:
+            upper = min(page_rect.y1, field_lower + signature_height)
+            lower = max(page_rect.y0, upper - signature_height)
-    return (float(left), float(top), float(right), float(bottom))
+    return (float(left), float(lower), float(right), float(upper))
 def _stroke_under_line(image: Image.Image, line: OcrLine) -> tuple[bool, float | None]:
@@ -513,14 +542,19 @@ def _signature_rank(signature: Signature) -> tuple[int, int, int]:
 def _dedupe_wet_signatures(signatures: Sequence[Signature]) -> list[Signature]:
     best_by_role: dict[str, Signature] = {}
+    best_unknown: Signature | None = None
     for signature in signatures:
         role = (signature.Role or "unknown").strip().lower()
         if role == "unknown":
+            if best_unknown is None or _signature_rank(signature) > _signature_rank(best_unknown):
+                best_unknown = signature
             continue
         existing = best_by_role.get(role)
         if existing is None or _signature_rank(signature) > _signature_rank(existing):
             best_by_role[role] = signature
-    return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
+    if best_by_role:
+        return sorted(best_by_role.values(), key=lambda sig: (int(sig.Page or 0), sig.Role or ""))
+    return [best_unknown] if best_unknown is not None else []
 def _mark_manual_review(file_result: FileResult, reason: str) -> None:

{sigdetect-0.5.1.dist-info → sigdetect-0.5.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sigdetect
-Version: 0.5.1
+Version: 0.5.2
 Summary: Signature detection and role attribution for PDFs
 Author-email: BT Asmamaw <basmamaw@angeiongroup.com>
 License: MIT

{sigdetect-0.5.1.dist-info → sigdetect-0.5.2.dist-info}/RECORD RENAMED Viewed

@@ -2,11 +2,11 @@ sigdetect/__init__.py,sha256=YvnTwlC1jfq83EhQS_1JjiiHK7_wJCCU1JvHv5E1qWY,573
 sigdetect/api.py,sha256=hDfa6z4SoHth1Dw9HDfSPiytMQrqu_oyBZlXBwSh9g4,11010
 sigdetect/cli.py,sha256=X5GqZ-PK67vz4OHN5r7h-V0hO886ZblUiUdKDuFowtU,10930
 sigdetect/config.py,sha256=3SP1rkcWBGXloCDFomBJRMRKZOvXuHQbhIBqpVrzYmY,8365
-sigdetect/cropping.py,sha256=HfOJrV2Xv9Eo0lCIl3mukz49agKB6h2TML99B0qQJNc,8837
+sigdetect/cropping.py,sha256=IyiBfIEHBLvOv8t_d-O51BfpljTFpE-dG_RxDxJAzAo,16339
 sigdetect/eda.py,sha256=S92G1Gjmepri__D0n_V6foq0lQgH-RXI9anW8A58jfw,4681
 sigdetect/logging_setup.py,sha256=LMF8ao_a-JwH0S522T6aYTFX3e8Ajjv_5ODS2YiBcHA,6404
 sigdetect/utils.py,sha256=T9rubLf5T9JmjOHYMOba1j34fhOJaWocAXccnGTxRUE,5198
-sigdetect/wet_detection.py,sha256=zvi11XUmm_xLZ4BLvxInwMQg8YLcyQzEYAM9QSdJOIs,18259
+sigdetect/wet_detection.py,sha256=ofKijykm4fKrvFaVkEkPPKL9iKeRNvlAiKkD2vHxD8k,20025
 sigdetect/data/role_rules.retainer.yml,sha256=IFdwKnDBXR2cTkdfrsZ6ku6CXD8S_dg5A3vKRKLW5h8,2532
 sigdetect/data/role_rules.yml,sha256=HuLKsZR_A6sD9XvY4NHiY_VG3dS5ERNCBF9-Mxawomw,2751
 sigdetect/data/vendor_patterns.yml,sha256=NRbZNQxcx_GuL6n1jAphBn6MM6ChCpeWGCsjbRx-PEo,384
@@ -17,8 +17,8 @@ sigdetect/detector/file_result_model.py,sha256=j2gTc9Sw3fJOHlexYsR_m5DiwHA8DzIzA
 sigdetect/detector/pymupdf_engine.py,sha256=N6oxvUa-48VvvhjbMk0R0kfScsggNKS7u5FLSeBRfWw,17358
 sigdetect/detector/pypdf2_engine.py,sha256=kB8cIp_gMvCla0LIBi9sd19g0361Oc9TjCW_ZViUBJQ,47410
 sigdetect/detector/signature_model.py,sha256=T2Hmfkfz_hZsDzwOhepxfNmkedxQp3_XHdrP8yGKoCk,1322
-sigdetect-0.5.1.dist-info/METADATA,sha256=_Jnyl9_A1yZUrKwWxUxVB-9rcMG3MdUqiN5WX_zlpqQ,14131
-sigdetect-0.5.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-sigdetect-0.5.1.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
-sigdetect-0.5.1.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
-sigdetect-0.5.1.dist-info/RECORD,,
+sigdetect-0.5.2.dist-info/METADATA,sha256=jLin7USVPqeA5tS7KCuPRRt1PLwdt-oJWhWuKSQa6hE,14131
+sigdetect-0.5.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+sigdetect-0.5.2.dist-info/entry_points.txt,sha256=iqtfKjBU44-omM7Sh-idGz2ahw19oAvpvSyKZVArG3o,48
+sigdetect-0.5.2.dist-info/top_level.txt,sha256=PKlfwUobkRC0viwiSXmhtw83G26FSNpimWYC1Uy00FY,10
+sigdetect-0.5.2.dist-info/RECORD,,

{sigdetect-0.5.1.dist-info → sigdetect-0.5.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{sigdetect-0.5.1.dist-info → sigdetect-0.5.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{sigdetect-0.5.1.dist-info → sigdetect-0.5.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

sigdetect 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

sigdetect 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl